├── README.md ├── src ├── dajnorm.h ├── dajdef.h ├── dajfunc.h ├── dajmodel.h ├── dajnorm.cpp ├── dajdense.h ├── dajgemm.h ├── dajutil.h ├── dajconv.h ├── dajmodel.cpp ├── dajnn.cpp ├── dajnn.h ├── dajdense.cpp ├── dajtensor.h ├── dajutil.cpp ├── dajfunc.cpp ├── dajconv.cpp ├── dajgemm.cpp └── dajtensor.cpp ├── vsproj ├── main.cpp ├── dajnn.vcxproj.user ├── dajnn.sln ├── dajnn.vcxproj.filters └── dajnn.vcxproj └── paddle ├── paddle_lite_factory_helper.h ├── paddle_use_ops.h ├── paddle_use_passes.h ├── paddle_use_kernels.h ├── paddle_image_preprocess.h ├── paddle_place.h ├── paddle_api_2.h └── paddle_api.h /README.md: -------------------------------------------------------------------------------- 1 | 2 | # dajnn 3 | Customized C++ Deep Learning Framework (Multiplatform, Inference Only) 4 | 5 | Fast, optimized, portable, easy C++ inference framework for Deep Learning. 6 | -------------------------------------------------------------------------------- /src/dajnorm.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace norm { 8 | 9 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta); 10 | 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /vsproj/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajmodel.h" 3 | 4 | using namespace dajnn; 5 | 6 | int main(int argc, const char** argv) { 7 | Model* model = new Model("../../pymaster/duel/export/koni_p2_d5.daj"); 8 | delete model; 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /src/dajdef.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #ifdef _WIN32 5 | 6 | #ifdef _DEBUG 7 | #define TRACE_MEMORY_LEAK 8 | #endif 9 | 10 | #else // _WIN32 11 | 12 | #define LITE_WITH_ARM 13 | #define PADDLE 14 | #define PADDLE_THREADS 2 15 | #define PADDLE_CLS 1 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/dajfunc.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace func { 8 | 9 | void relu(FTensor* tensor); 10 | void tanh(FTensor* tensor); 11 | 12 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim); 13 | void add(FTensor* dst, FTensor* oprd); 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/dajmodel.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajtensor.h" 5 | 6 | namespace dajnn { 7 | 8 | class Model { 9 | public: 10 | Model(); 11 | Model(ByteStream* stream); 12 | virtual ~Model(); 13 | 14 | public: 15 | uint length(); 16 | 17 | FTensor* get_f(uint idx); 18 | ITensor* get_i(uint idx); 19 | 20 | protected: 21 | Tensor** weights; 22 | uint weights_len; 23 | }; 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/dajnorm.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajnorm.h" 3 | #include "dajtensor.h" 4 | #include "dajfunc.h" 5 | 6 | #ifdef PADDLE 7 | #include "paddle_api_2.h" 8 | using namespace paddle::lite_api; 9 | #endif 10 | 
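// Note: batch_norm_with_precomputed() below reduces batch normalization to a
// per-channel affine transform via func::scale(). The "precomputed" tensors are
// presumably folded at export time from the usual BN statistics, i.e.
// pc_gamma = gamma / sqrt(var + eps) and pc_beta = beta - mean * pc_gamma,
// so no mean/variance arithmetic is left to do at inference time.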
11 | namespace dajnn { 12 | namespace norm { 13 | 14 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta) { 15 | func::scale(tensor, pc_gamma, pc_beta, true); 16 | } 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/dajdense.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace dense { 8 | 9 | /* 10 | full-connected layer 11 | @param input: 2-d tensor with shape (n, m) 12 | @param kernel: 2-d tensor with shape (m, p) 13 | @param bias: null or 1-d tensor with shape (p) 14 | @return: 2-d tensor with shape (n, p) 15 | */ 16 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias); 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/dajgemm.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | namespace dajnn { 5 | 6 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc); 7 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc); 8 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc); 9 | 10 | } 11 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | $(OutDir) 5 | WindowsLocalDebugger 6 | 7 | 8 | $(OutDir) 9 | WindowsLocalDebugger 10 | 11 | -------------------------------------------------------------------------------- /src/dajutil.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajtensor.h" 5 | 6 | namespace dajnn { 7 | 8 | void log_i(const char* format, ...); 9 | void log_w(const char* format, ...); 10 | void log_d(const char* format, ...); 11 | void log_e(const char* format, ...); 12 | void log_x(const char* type_str, const char* format, va_list ap); 13 | 14 | void exit_if(bool condition, const char* format = nullptr, ...); 15 | 16 | float get_max(float* arr, uint len); 17 | float get_min(float* arr, uint len); 18 | 19 | int get_max(int* arr, uint len); 20 | int get_min(int* arr, uint len); 21 | 22 | uint get_span(vector* shape); 23 | string get_shape_str(vector* shape); 24 | string format_str(const char* format, ...); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/dajconv.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace conv { 8 | 9 | /* 10 | 2-d convolutional layer 11 | @param input: 4-d tensor with shape (n, c, h, w) 12 | @param kernel: 4-d tensor with shape (f, c, k_h, k_w) 13 | @param bias: null or 1-d tensor with shape (f) 14 | @param padding_x: padding sizes (-1 for auto, 0 for no padding) 15 | @param stride_x: strides 16 | @param dilation_x: dilations 17 | @return: 4-d tensor with shape (n, f, _h_, _w_) 18 | 19 | CAUTION: 20 | f (# of filters) must be >1 for mobile forward (paddle-lite's bug) 21 | dilation_x must be =1 for win32 forward (darknet's limitance) 22 | */ 23 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias = nullptr, 
24 | int padding_h = -1, int padding_w = -1, int stride_h = 1, int stride_w = 1, 25 | int dilation_h = 1, int dilation_w = 1); 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /vsproj/dajnn.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dajnn", "dajnn.vcxproj", "{A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Win32 = Debug|Win32 11 | Release|Win32 = Release|Win32 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.ActiveCfg = Debug|Win32 15 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.Build.0 = Debug|Win32 16 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.ActiveCfg = Release|Win32 17 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.Build.0 = Release|Win32 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /src/dajmodel.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajmodel.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | Model::Model() { 8 | weights = nullptr; 9 | weights_len = 0; 10 | } 11 | 12 | Model::~Model() { 13 | if (weights) { 14 | for (uint i = 0; i < weights_len; ++i) { 15 | delete weights[i]; 16 | } 17 | delete[] weights; 18 | } 19 | } 20 | 21 | Model::Model(ByteStream* stream) : Model() { 22 | string header = stream->read_str(); 23 | 24 | if (header.compare(MODEL_HEADER)) { 25 | log_w("invalid model header : %s", header); 26 | return; 27 | } 28 | vector tensors; 29 | 30 | while (true) { 31 | string mode = stream->read_str(); 32 | 33 | if (!mode.compare("f")) { 34 | tensors.push_back(new FTensor(stream)); 35 | } else if (!mode.compare("i")) { 36 | tensors.push_back(new ITensor(stream)); 37 | } else if (!mode.compare(MODEL_FOOTER)) { 38 | break; 39 | } else { 40 | log_w("invalid tensor mode (%s) from model", mode.c_str()); 41 | } 42 | } 43 | weights_len = tensors.size(); 44 | weights = new Tensor*[weights_len]; 45 | 46 | for (uint i = 0; i < weights_len; ++i) { 47 | weights[i] = tensors[i]; 48 | } 49 | } 50 | 51 | uint Model::length() { 52 | return weights_len; 53 | } 54 | 55 | FTensor* Model::get_f(uint idx) { 56 | return (FTensor*) weights[idx]; 57 | } 58 | 59 | ITensor* Model::get_i(uint idx) { 60 | return (ITensor*) weights[idx]; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/dajnn.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajnn.h" 3 | #include "dajutil.h" 4 | 5 | #ifdef PADDLE 6 | #include "paddle_api_2.h" 7 | using namespace paddle::lite_api; 8 | #endif 9 | 10 | namespace dajnn { 11 | 12 | #ifdef TRACE_MEMORY_LEAK 13 | vector _tensor_trace_pool_; 14 | vector _tensor_unique_indice_; 15 | uint _tensor_trace_len_ = 0; 16 | 17 | void push_tensor_trace(Tensor* tensor) { 18 | if (_tensor_trace_pool_.empty()) _tensor_trace_len_ = 0; 19 | 20 | _tensor_trace_pool_.push_back(tensor); 21 | 
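// each traced tensor is paired with a monotonically increasing unique index,
// so finish_dajnn() can report any leaked tensors by their creation order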
_tensor_unique_indice_.push_back(_tensor_trace_len_++); 22 | } 23 | 24 | uint pop_tensor_trace(Tensor* tensor) { 25 | uint i = 0; 26 | 27 | for (vector::iterator ti = _tensor_trace_pool_.begin(); 28 | ti != _tensor_trace_pool_.end(); ++ti, ++i) { 29 | if (tensor == *ti) break; 30 | } 31 | exit_if(i == _tensor_trace_pool_.size(), "cannot find tensor to pop from trace"); 32 | 33 | uint idx = _tensor_unique_indice_[i]; 34 | _tensor_unique_indice_.erase(_tensor_unique_indice_.begin() + i); 35 | _tensor_trace_pool_.erase(_tensor_trace_pool_.begin() + i); 36 | return idx; 37 | } 38 | 39 | vector get_leaked_tensor_indice() { 40 | return _tensor_unique_indice_; 41 | } 42 | #endif 43 | 44 | void init_dajnn() { 45 | #ifdef PADDLE 46 | paddle_DeviceInit(); 47 | #endif 48 | } 49 | 50 | void finish_dajnn() { 51 | #ifdef TRACE_MEMORY_LEAK 52 | if (!_tensor_unique_indice_.empty()) { 53 | string msg = "leaked tensors : "; 54 | 55 | for (vector::iterator idx = _tensor_unique_indice_.begin(); idx != _tensor_unique_indice_.end(); ++idx) { 56 | msg += format_str("%d ", *idx); 57 | } 58 | log_e("%s", msg); 59 | } 60 | #endif 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/dajnn.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "dajdef.h" 16 | 17 | #ifdef PADDLE 18 | #include 19 | #include 20 | #endif 21 | 22 | using namespace std; 23 | 24 | namespace dajnn { 25 | 26 | #ifndef uchar 27 | #define uchar unsigned char 28 | #endif 29 | 30 | #ifndef ushort 31 | #define ushort unsigned short 32 | #endif 33 | 34 | #ifndef uint 35 | #define uint unsigned int 36 | #endif 37 | 38 | #if defined(_MSC_VER) && _MSC_VER < 1900 39 | #define inline __inline 40 | #endif 41 | 42 | #ifndef INT_MIN 43 | #define INT_MIN -2147483648 44 | #define INT_MAX 2147483647 45 | #endif 46 | 47 | #ifndef FLOAT_MIN 48 | #define FLOAT_MIN -1e10f 49 | #define FLOAT_MAX 1e10f 50 | #endif 51 | 52 | #ifndef SHRT_MIN 53 | #define SHRT_MIN -32768 54 | #define SHRT_MAX 32767 55 | #endif 56 | 57 | #ifndef MIN 58 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) 59 | #endif 60 | 61 | #ifndef MAX 62 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) 63 | #endif 64 | 65 | #define END_DIM 0xFFFFFFFF 66 | #define MAX_TENSOR_DIM 16 67 | #define MAX_MODEL_STR 256 68 | 69 | #define MODEL_HEADER "MRB_NN_DAJ_MODEL_V1_BEGIN" 70 | #define MODEL_FOOTER "MRB_NN_DAJ_MODEL_END" 71 | 72 | class Tensor; 73 | class ITensor; 74 | class FTensor; 75 | 76 | void init_dajnn(); 77 | void finish_dajnn(); 78 | 79 | #ifdef TRACE_MEMORY_LEAK 80 | void push_tensor_trace(Tensor* tensor); 81 | uint pop_tensor_trace(Tensor* tensor); 82 | 83 | vector get_leaked_tensor_indice(); 84 | #endif 85 | 86 | } 87 | -------------------------------------------------------------------------------- /paddle/paddle_lite_factory_helper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* 16 | * This file defines some MACROS that explicitly determine the op, kernel, mir 17 | * passes used in the inference lib. 18 | */ 19 | #pragma once 20 | 21 | // some platform-independent defintion 22 | 23 | #if defined(_WIN32) 24 | #define UNUSED 25 | #define __builtin_expect(EXP, C) (EXP) 26 | #else 27 | #define UNUSED __attribute__((unused)) 28 | #endif 29 | 30 | #define USE_LITE_OP(op_type__) \ 31 | extern int touch_op_##op_type__(); \ 32 | int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); 33 | 34 | #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ 35 | extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ 36 | int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ 37 | UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); 38 | 39 | #define USE_MIR_PASS(name__) \ 40 | extern bool mir_pass_registry##name__##_fake(); \ 41 | static bool mir_pass_usage##name__ UNUSED = \ 42 | mir_pass_registry##name__##_fake(); 43 | 44 | #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ 45 | -------------------------------------------------------------------------------- /src/dajdense.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajdense.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | #include "dajgemm.h" 6 | 7 | #ifdef PADDLE 8 | #include "paddle_api_2.h" 9 | using namespace paddle::lite_api; 10 | #endif 11 | 12 | namespace dajnn { 13 | namespace dense { 14 | 15 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias) { 16 | exit_if(input->shape.size() != 2, "input dim of dense expects to be 2, but got %d", input->shape.size()); 17 | exit_if(kernel->shape.size() != 2, "kernel dim of dense expects to be 2, but got %d", kernel->shape.size()); 18 | exit_if(bias && (bias->shape.size() != 1), "bias dim of dense expects to be null or 1, but got %d", bias->shape.size()); 19 | 20 | uint n = input->shape[0]; 21 | uint m = input->shape[1]; 22 | #ifdef PADDLE 23 | uint p = kernel->shape[1]; 24 | bool shape_ok = kernel->shape[0] == m; 25 | #else 26 | uint p = kernel->shape[0]; 27 | bool shape_ok = kernel->shape[1] == m; 28 | #endif 29 | 30 | exit_if(!shape_ok, "dense input and kernel shapes mismatch : %s and %s", 31 | get_shape_str(&input->shape).c_str(), 32 | get_shape_str(&kernel->shape).c_str()); 33 | exit_if(bias && (bias->span != p), "dense kernel and bias shapes mismatch : %s and %s", 34 | get_shape_str(&kernel->shape).c_str(), 35 | get_shape_str(&bias->shape).c_str()); 36 | 37 | FTensor* output = new FTensor(n, p, END_DIM); 38 | #ifdef PADDLE 39 | paddle_matmul(n, p, m, input->val, kernel->val, output->val); 40 | 41 | if (bias) { 42 | float* op = output->val; 43 | 44 | for (uint i = 0; i < n; ++i, op += p) { 45 | paddle_elementwise_add(op, bias->val, op, p); 46 | } 47 | } 48 | #else 49 | memset(output->val, 0, 4 * output->span); 50 | gemm(0, 1, n, p, m, 1, input->val, m, kernel->val, m, 1, output->val, p); 51 | 52 | if (bias) { 53 | float* op = 
output->val; 54 | 55 | for (uint i = 0; i < n; ++i) { 56 | for (float* bp = bias->val; bp < bias->val + p; ++bp, ++op) { 57 | *op += *bp; 58 | } 59 | } 60 | } 61 | #endif 62 | return output; 63 | } 64 | 65 | } 66 | } -------------------------------------------------------------------------------- /src/dajtensor.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | 8 | class ByteStream { 9 | public: 10 | ByteStream(); 11 | ByteStream(const void* buff); 12 | ByteStream(FILE* fp); 13 | 14 | string read_str(); 15 | uint read(void* dst, int ele_size, int ele_count); 16 | void write(void* src, int ele_size, int ele_count); 17 | int seek(); 18 | 19 | private: 20 | const char* buff; 21 | FILE* fp; 22 | int pointer; 23 | }; 24 | 25 | class Tensor { 26 | public: 27 | Tensor(); 28 | virtual ~Tensor(); 29 | 30 | void reshape(vector* shape); 31 | void reshape(uint dim1, ...); 32 | bool is_shape(vector* shape); 33 | bool is_shape(uint dim1, ...); 34 | void set_releasable(bool releasable); 35 | 36 | public: 37 | vector shape; 38 | uint span; 39 | bool releasable; 40 | 41 | protected: 42 | void* _init_(Tensor* tensor, bool copy_val = true); 43 | void* _init_(vector* shape, void* val = nullptr, bool copy_val = true); 44 | void* _init_(void* val, bool copy_val, uint dim1, va_list ap); 45 | void* _init_(ByteStream* stream); 46 | 47 | void _read_meta_(ByteStream* stream); 48 | void _write_meta_(ByteStream* stream); 49 | void _write_val_(ByteStream* stream); 50 | void _save_(ByteStream* stream); 51 | 52 | protected: 53 | void* _val_; 54 | }; 55 | 56 | class ITensor : public Tensor { 57 | public: 58 | ITensor(); 59 | ITensor(ITensor* tensor, bool copy_val = true); 60 | ITensor(vector* shape, int* val = nullptr, bool copy_val = true); 61 | ITensor(int* val, bool copy_val, uint dim1, ...); 62 | ITensor(uint dim1, ...); 63 | ITensor(ByteStream* stream); 64 | 65 | void save(ByteStream* stream, bool compressed = false); 66 | int compare(ITensor* tensor); 67 | int compare(int* val, uint len); 68 | 69 | int get_max(); 70 | int get_min(); 71 | 72 | public: 73 | int* val; 74 | }; 75 | 76 | class FTensor : public Tensor { 77 | public: 78 | FTensor(); 79 | FTensor(FTensor* tensor, bool copy_val = true); 80 | FTensor(ITensor* tensor); 81 | FTensor(vector* shape, float* val = nullptr, bool copy_val = true); 82 | FTensor(float* val, bool copy_val, uint dim1, ...); 83 | FTensor(uint dim1, ...); 84 | FTensor(ByteStream* stream); 85 | 86 | void save(ByteStream* stream, bool compressed = false); 87 | void print(uint start = END_DIM, uint end = END_DIM); 88 | 89 | float compare(FTensor* tensor); 90 | float compare(float* val, uint len); 91 | 92 | float get_max(); 93 | float get_min(); 94 | 95 | public: 96 | float* val; 97 | }; 98 | 99 | } 100 | -------------------------------------------------------------------------------- /paddle/paddle_use_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "paddle_lite_factory_helper.h" 3 | 4 | USE_LITE_OP(conv2d); 5 | USE_LITE_OP(depthwise_conv2d); 6 | USE_LITE_OP(unsqueeze); 7 | USE_LITE_OP(unsqueeze2); 8 | USE_LITE_OP(pool2d); 9 | USE_LITE_OP(fc); 10 | USE_LITE_OP(nearest_interp); 11 | USE_LITE_OP(bilinear_interp); 12 | USE_LITE_OP(batch_norm); 13 | USE_LITE_OP(sync_batch_norm); 14 | USE_LITE_OP(reduce_mean); 15 | USE_LITE_OP(layout); 16 | USE_LITE_OP(assign_value); 17 | USE_LITE_OP(scale); 18 | 
USE_LITE_OP(fusion_elementwise_sub_activation); 19 | USE_LITE_OP(fusion_elementwise_add_activation); 20 | USE_LITE_OP(fusion_elementwise_mul_activation); 21 | USE_LITE_OP(fusion_elementwise_max_activation); 22 | USE_LITE_OP(fusion_elementwise_min_activation); 23 | USE_LITE_OP(fusion_elementwise_div_activation); 24 | USE_LITE_OP(io_copy_once); 25 | USE_LITE_OP(concat); 26 | USE_LITE_OP(layout_once); 27 | USE_LITE_OP(multiclass_nms); 28 | USE_LITE_OP(multiclass_nms2); 29 | USE_LITE_OP(multiclass_nms3); 30 | USE_LITE_OP(density_prior_box); 31 | USE_LITE_OP(io_copy); 32 | USE_LITE_OP(shuffle_channel); 33 | USE_LITE_OP(elementwise_sub); 34 | USE_LITE_OP(elementwise_add); 35 | USE_LITE_OP(elementwise_mul); 36 | USE_LITE_OP(elementwise_max); 37 | USE_LITE_OP(elementwise_min); 38 | USE_LITE_OP(elementwise_div); 39 | USE_LITE_OP(elementwise_mod); 40 | USE_LITE_OP(elementwise_pow); 41 | USE_LITE_OP(grid_sampler); 42 | USE_LITE_OP(expand_as); 43 | USE_LITE_OP(instance_norm); 44 | USE_LITE_OP(pad2d); 45 | USE_LITE_OP(box_coder); 46 | USE_LITE_OP(sigmoid); 47 | USE_LITE_OP(tanh); 48 | USE_LITE_OP(relu); 49 | USE_LITE_OP(leaky_relu); 50 | USE_LITE_OP(relu6); 51 | USE_LITE_OP(prelu); 52 | USE_LITE_OP(thresholded_relu); 53 | USE_LITE_OP(elu); 54 | USE_LITE_OP(bilinear_interp_v2); 55 | USE_LITE_OP(nearest_interp_v2); 56 | USE_LITE_OP(fill_constant); 57 | USE_LITE_OP(softmax); 58 | USE_LITE_OP(split); 59 | USE_LITE_OP(subgraph); 60 | USE_LITE_OP(slice); 61 | USE_LITE_OP(cast); 62 | USE_LITE_OP(search_fc); 63 | USE_LITE_OP(prior_box); 64 | USE_LITE_OP(conv2d_transpose); 65 | USE_LITE_OP(depthwise_conv2d_transpose); 66 | USE_LITE_OP(squeeze); 67 | USE_LITE_OP(squeeze2); 68 | USE_LITE_OP(arg_max); 69 | USE_LITE_OP(affine_channel); 70 | USE_LITE_OP(fill_constant_batch_size_like); 71 | USE_LITE_OP(affine_grid); 72 | USE_LITE_OP(expand); 73 | USE_LITE_OP(feed); 74 | USE_LITE_OP(yolo_box); 75 | USE_LITE_OP(sequence_topk_avg_pooling); 76 | USE_LITE_OP(mul); 77 | USE_LITE_OP(reshape); 78 | USE_LITE_OP(reshape2); 79 | USE_LITE_OP(fetch); 80 | USE_LITE_OP(matmul); 81 | USE_LITE_OP(calib); 82 | USE_LITE_OP(transpose); 83 | USE_LITE_OP(transpose2); 84 | USE_LITE_OP(range); 85 | USE_LITE_OP(dropout); 86 | USE_LITE_OP(flatten); 87 | USE_LITE_OP(flatten2); 88 | USE_LITE_OP(flatten_contiguous_range); 89 | USE_LITE_OP(stack); 90 | USE_LITE_OP(lod_array_length); -------------------------------------------------------------------------------- /src/dajutil.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajutil.h" 3 | 4 | namespace dajnn { 5 | 6 | void log_i(const char* format, ...) { 7 | va_list ap; 8 | va_start(ap, format); 9 | log_x("[info]", format, ap); 10 | va_end(ap); 11 | } 12 | 13 | void log_w(const char* format, ...) { 14 | va_list ap; 15 | va_start(ap, format); 16 | log_x("[warn]", format, ap); 17 | va_end(ap); 18 | } 19 | 20 | void log_d(const char* format, ...) { 21 | va_list ap; 22 | va_start(ap, format); 23 | log_x("[debug]", format, ap); 24 | va_end(ap); 25 | } 26 | 27 | void log_e(const char* format, ...) 
{ 28 | va_list ap; 29 | va_start(ap, format); 30 | log_x("[error]", format, ap); 31 | va_end(ap); 32 | } 33 | 34 | void log_x(const char* type_str, const char* format, va_list ap) { 35 | char msg[1024]; 36 | #ifdef _WIN32 37 | sprintf_s(msg, 1024, format, ap); 38 | printf_s("%s %s\n", type_str, msg); 39 | #else 40 | vsprintf(msg, format, ap); 41 | __android_log_print(ANDROID_LOG_ERROR, type_str, msg) 42 | #endif 43 | } 44 | 45 | void exit_if(bool condition, const char* format, ...) { 46 | if (condition) { 47 | if (format) { 48 | va_list ap; 49 | va_start(ap, format); 50 | log_e(format, ap); 51 | va_end(ap); 52 | } 53 | exit(-1); 54 | } 55 | } 56 | 57 | float get_max(float* arr, uint len) { 58 | float m = FLOAT_MIN; 59 | 60 | for (float* ap = arr; ap < arr + len; ++ap) { 61 | if (m < *ap) m = *ap; 62 | } 63 | return m; 64 | } 65 | 66 | float get_min(float* arr, uint len) { 67 | float m = FLOAT_MAX; 68 | 69 | for (float* ap = arr; ap < arr + len; ++ap) { 70 | if (m > *ap) m = *ap; 71 | } 72 | return m; 73 | } 74 | 75 | int get_max(int* arr, uint len) { 76 | int m = INT_MIN; 77 | 78 | for (int* ap = arr; ap < arr + len; ++ap) { 79 | if (m < *ap) m = *ap; 80 | } 81 | return m; 82 | } 83 | 84 | int get_min(int* arr, uint len) { 85 | int m = INT_MAX; 86 | 87 | for (int* ap = arr; ap < arr + len; ++ap) { 88 | if (m > *ap) m = *ap; 89 | } 90 | return m; 91 | } 92 | 93 | uint get_span(vector* shape) { 94 | uint span = 1; 95 | 96 | for (vector::iterator dim = shape->begin(); dim != shape->end(); ++dim) { 97 | span *= *dim; 98 | } 99 | return span; 100 | } 101 | 102 | string get_shape_str(vector* shape) { 103 | string str = "("; 104 | 105 | for (uint i = 0; i < shape->size(); ++i) { 106 | if (i > 0) str += ","; 107 | str += format_str("%d", shape->at(i)); 108 | } 109 | str += ")"; 110 | return str; 111 | } 112 | 113 | string format_str(const char* format, ...) { 114 | char str[1024]; 115 | va_list ap; 116 | va_start(ap, format); 117 | #ifdef _WIN32 118 | sprintf_s(str, 1024, format, ap); 119 | #else 120 | vsprintf(str, format, ap); 121 | #endif 122 | va_end(ap); 123 | return string(str); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/dajfunc.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajfunc.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | 6 | #ifdef PADDLE 7 | #include "paddle_api_2.h" 8 | using namespace paddle::lite_api; 9 | #endif 10 | 11 | namespace dajnn { 12 | namespace func { 13 | 14 | void relu(FTensor* tensor) { 15 | #ifdef PADDLE 16 | paddle_act_relu(tensor->val, tensor->val, tensor->span, PADDLE_THREADS); 17 | #else 18 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) { 19 | if (*v < 0) *v = 0; 20 | } 21 | #endif 22 | } 23 | 24 | void tanh(FTensor* tensor) { 25 | #ifdef PADDLE 26 | act_tanh(tensor->val, tensor->val, tensor->span, PADDLE_THREADS); 27 | #else 28 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) { 29 | *v = 2.f / (1.f + expf(-2.f * *v)) - 1; 30 | } 31 | #endif 32 | } 33 | 34 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim) { 35 | uint span = tensor->span; 36 | int num_batches = is_first_batch_dim ? tensor->shape[0] : 1; 37 | int num_channels = tensor->shape[is_first_batch_dim ? 
1 : 0]; 38 | int num_features = span / num_batches / num_channels; 39 | 40 | exit_if((weight->shape.size() != 1) || (weight->span != num_channels), 41 | "invalid scale weight shape with tensor shape : %s and %s", 42 | get_shape_str(&weight->shape).c_str(), 43 | get_shape_str(&tensor->shape).c_str()); 44 | exit_if(bias && ((bias->shape.size() != 1) || (bias->span != num_channels)), 45 | "invalid scale bias shape with weight shape : %s and %s", 46 | get_shape_str(&bias->shape).c_str(), 47 | get_shape_str(&weight->shape).c_str()); 48 | 49 | #ifdef PADDLE 50 | paddle::lite_api::scale(tensor->val, tensor->val, num_batches, num_channels, num_features, weight->val, bias ? bias->val : nullptr); 51 | #else 52 | float* v = tensor->val; 53 | 54 | if (bias) { 55 | for (int i = 0; i < num_batches; ++i) { 56 | float* w = weight->val; 57 | float* b = bias->val; 58 | 59 | for (int j = 0; j < num_channels; ++j, ++w, ++b) { 60 | for (int k = 0; k < num_features; ++k, ++v) { 61 | *v = *v * *w + *b; 62 | } 63 | } 64 | } 65 | } else { 66 | for (int i = 0; i < num_batches; ++i) { 67 | float* w = weight->val; 68 | 69 | for (int j = 0; j < num_channels; ++w) { 70 | for (int k = 0; k < num_features; ++k, ++v) { 71 | *v = *v * *w; 72 | } 73 | } 74 | } 75 | } 76 | #endif 77 | } 78 | 79 | void add(FTensor* dst, FTensor* oprd) { 80 | int span = dst->span; 81 | 82 | exit_if(!dst->is_shape(&oprd->shape), "shapes mismatch for add operation : %s and %s", 83 | get_shape_str(&dst->shape).c_str(), 84 | get_shape_str(&oprd->shape).c_str()); 85 | 86 | #ifdef PADDLE 87 | paddle_elementwise_add(dst->val, oprd->val, dst->val, span); 88 | #else 89 | float* v = dst->val; 90 | float* o = oprd->val; 91 | 92 | for (int i = 0; i < span; ++i, ++v, ++o) { 93 | *v += *o; 94 | } 95 | #endif 96 | } 97 | 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {fe6241e9-2e92-445d-abaa-fd89a812ded6} 6 | 7 | 8 | {625277b8-cb1e-43da-874e-7c76e1f7fef9} 9 | 10 | 11 | 12 | 13 | paddle 14 | 15 | 16 | paddle 17 | 18 | 19 | paddle 20 | 21 | 22 | paddle 23 | 24 | 25 | paddle 26 | 27 | 28 | paddle 29 | 30 | 31 | paddle 32 | 33 | 34 | paddle 35 | 36 | 37 | dajnn 38 | 39 | 40 | dajnn 41 | 42 | 43 | dajnn 44 | 45 | 46 | dajnn 47 | 48 | 49 | dajnn 50 | 51 | 52 | dajnn 53 | 54 | 55 | dajnn 56 | 57 | 58 | dajnn 59 | 60 | 61 | dajnn 62 | 63 | 64 | dajnn 65 | 66 | 67 | 68 | 69 | 70 | dajnn 71 | 72 | 73 | dajnn 74 | 75 | 76 | dajnn 77 | 78 | 79 | dajnn 80 | 81 | 82 | dajnn 83 | 84 | 85 | dajnn 86 | 87 | 88 | dajnn 89 | 90 | 91 | dajnn 92 | 93 | 94 | dajnn 95 | 96 | 97 | -------------------------------------------------------------------------------- /paddle/paddle_use_passes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include "paddle_lite_factory_helper.h" // NOLINT 17 | 18 | USE_MIR_PASS(demo); 19 | USE_MIR_PASS(static_kernel_pick_pass); 20 | USE_MIR_PASS(variable_place_inference_pass); 21 | USE_MIR_PASS(type_target_cast_pass); 22 | USE_MIR_PASS(generate_program_pass); 23 | 24 | USE_MIR_PASS(io_copy_kernel_pick_pass); 25 | USE_MIR_PASS(argument_type_display_pass); 26 | USE_MIR_PASS(runtime_context_assign_pass); 27 | USE_MIR_PASS(graph_visualize_pass); 28 | 29 | USE_MIR_PASS(adaptive_1x1_pool2d_convert_global_pass); 30 | USE_MIR_PASS(remove_tf_redundant_ops_pass); 31 | USE_MIR_PASS(lite_conv_bn_fuse_pass); 32 | USE_MIR_PASS(lite_conv_conv_fuse_pass); 33 | USE_MIR_PASS(lite_squeeze2_matmul_fuse_pass); 34 | USE_MIR_PASS(lite_reshape2_matmul_fuse_pass); 35 | USE_MIR_PASS(lite_matmul_fuse_pass); 36 | USE_MIR_PASS(lite_fc_fuse_pass); 37 | USE_MIR_PASS(lite_shuffle_channel_fuse_pass); 38 | USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); 39 | USE_MIR_PASS(lite_interpolate_fuse_pass); 40 | USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); 41 | USE_MIR_PASS(identity_scale_eliminate_pass); 42 | USE_MIR_PASS(identity_dropout_eliminate_pass); 43 | USE_MIR_PASS(lite_conv_elementwise_fuse_pass); 44 | USE_MIR_PASS(lite_conv_activation_fuse_pass); 45 | USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); 46 | USE_MIR_PASS(lite_match_matrix_activation_fuse_pass); 47 | USE_MIR_PASS(lite_scales_fuse_pass); 48 | USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass); 49 | USE_MIR_PASS(lite_elementwise_activation_fuse_pass); 50 | USE_MIR_PASS(lite_quant_dequant_fuse_pass); 51 | USE_MIR_PASS(type_precision_cast_pass); 52 | USE_MIR_PASS(type_layout_cast_pass); 53 | USE_MIR_PASS(type_layout_cast_preprocess_pass); 54 | USE_MIR_PASS(memory_optimize_pass); 55 | USE_MIR_PASS(lite_reshape_fuse_pass); 56 | USE_MIR_PASS(multi_stream_analysis_pass); 57 | USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) 58 | USE_MIR_PASS(npu_subgraph_pass); 59 | USE_MIR_PASS(huawei_ascend_npu_subgraph_pass); 60 | USE_MIR_PASS(imagination_nna_subgraph_pass); 61 | USE_MIR_PASS(xpu_subgraph_pass); 62 | USE_MIR_PASS(mlu_subgraph_pass); 63 | USE_MIR_PASS(mlu_postprocess_pass); 64 | USE_MIR_PASS(weight_quantization_preprocess_pass); 65 | USE_MIR_PASS(post_quant_dynamic_pass); 66 | USE_MIR_PASS(apu_subgraph_pass); 67 | USE_MIR_PASS(quantized_op_attributes_inference_pass); 68 | USE_MIR_PASS(restrict_quantized_op_with_same_input_output_scale_pass); 69 | USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass) 70 | USE_MIR_PASS(lite_scale_activation_fuse_pass); 71 | USE_MIR_PASS(lite_instance_norm_activation_fuse_pass); 72 | USE_MIR_PASS(__xpu__resnet_fuse_pass); 73 | USE_MIR_PASS(__xpu__resnet_d_fuse_pass); 74 | USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); 75 | USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); 76 | USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); 77 | USE_MIR_PASS(__xpu__fc_fuse_pass); 78 | USE_MIR_PASS(__xpu__mmdnn_fuse_pass); 79 | USE_MIR_PASS(__xpu__conv2d_fuse_pass); 80 | USE_MIR_PASS(__xpu__resblock_reduction_fuse_pass); 81 | USE_MIR_PASS(__xpu__resblock_normal_fuse_pass); 82 | USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass); 83 | USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass); 84 | USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass); 85 | USE_MIR_PASS(__xpu__softmax_topk_fuse_pass); 86 | USE_MIR_PASS(__xpu__multi_encoder_slice_link_fuse_pass); 87 | 
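// For clarity, a sketch of what one of these registration lines expands to,
// using the macros from paddle_lite_factory_helper.h (illustrative only):
//
//   USE_MIR_PASS(demo);
//     -> extern bool mir_pass_registrydemo_fake();
//        static bool mir_pass_usagedemo UNUSED = mir_pass_registrydemo_fake();
//
// Referencing the extern "fake" / "touch" symbol is what forces the linker to
// keep the translation unit where the pass (or op/kernel) registers itself.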
-------------------------------------------------------------------------------- /src/dajconv.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajdense.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | #include "dajgemm.h" 6 | 7 | #ifdef PADDLE 8 | #include "paddle_api_2.h" 9 | using namespace paddle::lite_api; 10 | #endif 11 | 12 | namespace dajnn { 13 | namespace conv { 14 | 15 | #ifndef PADDLE 16 | float im2col_get_pixel(float* im, int height, int width, int channels, 17 | int row, int col, int channel, int pad_h, int pad_w) { 18 | 19 | row -= pad_h; 20 | col -= pad_w; 21 | 22 | if (row < 0 || col < 0 || row >= height || col >= width) return 0; 23 | return im[col + width * (row + height * channel)]; 24 | } 25 | 26 | void im2col_cpu(float* data_im, int channels, int height, int width, 27 | int kernel_h, int kernel_w, int stride_h, int stride_w, 28 | int pad_h, int pad_w, float* data_col) { 29 | 30 | int c, h, w; 31 | int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; 32 | int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; 33 | 34 | int ksize = kernel_h * kernel_w; 35 | int channels_col = channels * ksize; 36 | 37 | for (c = 0; c < channels_col; ++c) { 38 | int w_offset = c % kernel_w; 39 | int h_offset = (c / kernel_w) % kernel_h; 40 | int c_im = c / ksize; 41 | 42 | for (h = 0; h < height_col; ++h) { 43 | for (w = 0; w < width_col; ++w) { 44 | int im_row = h_offset + h * stride_h; 45 | int im_col = w_offset + w * stride_w; 46 | int col_index = (c * height_col + h) * width_col + w; 47 | 48 | data_col[col_index] = im2col_get_pixel( 49 | data_im, height, width, channels, 50 | im_row, im_col, c_im, pad_h, pad_w); 51 | } 52 | } 53 | } 54 | } 55 | #endif 56 | 57 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias, 58 | int padding_h, int padding_w, int stride_h, int stride_w, 59 | int dilation_h, int dilation_w) { 60 | 61 | exit_if(input->shape.size() != 4, "input dim of conv2d expects to be 4, but got %d", input->shape.size()); 62 | exit_if(kernel->shape.size() != 4, "kernel dim of conv2d expects to be 4, but got %d", kernel->shape.size()); 63 | exit_if(bias && (bias->shape.size() != 1), "bias dim of conv2d expects to be null or 1, but got %d", bias->shape.size()); 64 | 65 | int num_batches = input->shape[0]; 66 | int num_channels = input->shape[1]; 67 | int h = input->shape[2]; 68 | int w = input->shape[3]; 69 | int num_filters = kernel->shape[0]; 70 | int kernel_h = kernel->shape[2]; 71 | int kernel_w = kernel->shape[3]; 72 | 73 | exit_if(kernel->shape[1] != num_channels, "second dim of conv2d kernel (# of channels) expects to be %d, but got %d", num_channels, kernel->shape[1]); 74 | exit_if(bias->span != num_filters, "span of conv2d bias (# of filters) expects to be %d, but got %d", num_filters, bias->span); 75 | 76 | if (padding_h < 0) padding_h = (kernel_h - 1) * dilation_h / 2; 77 | if (padding_w < 0) padding_w = (kernel_w - 1) * dilation_w / 2; 78 | 79 | int _h_ = (h + 2 * padding_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; 80 | int _w_ = (w + 2 * padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; 81 | 82 | FTensor* output = new FTensor(num_batches, num_filters, _h_, _w_, END_DIM); 83 | 84 | int chw = input->span / num_batches; 85 | int _chw_ = output->span / num_batches; 86 | 87 | float* ip = input->val; 88 | float* op = output->val; 89 | 90 | #ifdef PADDLE 91 | exit_if(num_filters == 1, "single conv2d filter is not available for mobile forward"); 92 | 93 | 
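// mobile path: hand the NCHW buffers to the Paddle-Lite ARM convolution wrapped
// by paddle_api_2.h; PADDLE_CLS and PADDLE_THREADS come from dajdef.h, and the
// data layout matches the im2col + gemm fallback below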
paddle_conv2d(num_batches, h, w, num_channels, ip, num_filters, kernel_h, kernel_w, kernel->val, 94 | op, bias ? bias->val : nullptr, padding_h, padding_w, dilation_h, dilation_w, 95 | stride_h, stride_w, 0, 0, PADDLE_CLS, PADDLE_THREADS); 96 | #else 97 | exit_if((dilation_h != 1) || (dilation_w != 1), "only single dilation is available for win32 forward"); 98 | memset(output->val, 0, 4 * output->span); 99 | 100 | int _hw_ = _h_ * _w_; 101 | int _kkc_ = kernel_h * kernel_w * num_channels; 102 | float* workspace = (float*) malloc(4 * _kkc_ * _hw_); 103 | 104 | for (int i = 0; i < num_batches; ++i, ip += chw, op += _chw_) { 105 | im2col_cpu(ip, num_channels, h, w, kernel_h, kernel_w, stride_h, stride_w, 106 | padding_h, padding_w, workspace); 107 | gemm(0, 0, num_filters, _hw_, _kkc_, 1, kernel->val, _kkc_, workspace, _hw_, 1, op, _hw_); 108 | } 109 | if (bias) { 110 | op = output->val; 111 | 112 | for (int i = 0; i < num_batches; ++i) { 113 | for (int j = 0; j < num_filters; ++j) { 114 | float b = bias->val[j]; 115 | 116 | for (int k = 0; k < _hw_; ++k) { 117 | op[i * _chw_ + j * _hw_ + k] += b; 118 | } 119 | } 120 | } 121 | } 122 | free(workspace); 123 | #endif 124 | return output; 125 | } 126 | 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F} 15 | Win32Proj 16 | dajnn 17 | 18 | 19 | 20 | Application 21 | true 22 | v120_xp 23 | Unicode 24 | 25 | 26 | Application 27 | false 28 | v120_xp 29 | true 30 | Unicode 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | true 44 | $(SolutionDir)_bin\ 45 | $(SolutionDir)_obj\ 46 | 47 | 48 | false 49 | $(SolutionDir)_bin\ 50 | $(SolutionDir)_obj\ 51 | 52 | 53 | 54 | 55 | 56 | Level3 57 | Disabled 58 | WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 59 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle; 60 | /wd4996 %(AdditionalOptions) 61 | 62 | 63 | Console 64 | true 65 | 66 | 67 | 68 | 69 | Level3 70 | 71 | 72 | MaxSpeed 73 | true 74 | true 75 | WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 76 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle; 77 | /wd4996 %(AdditionalOptions) 78 | 79 | 80 | Console 81 | true 82 | true 83 | true 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/dajgemm.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajgemm.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc) { 8 | int i, j, k; 9 | 10 | for (i = 0; i < M; ++i) { 11 | for (k = 0; k < K; ++k) { 12 | char A_PART = A[i * lda + k]; 13 | 14 | if (A_PART) { 15 | for (j = 0; j < N; ++j) { 16 | C[i * ldc + j] += B[k * ldb + j]; 17 | } 18 | } else { 19 | for (j = 0; j < N; ++j) { 20 | C[i * ldc + j] -= B[k * ldb + j]; 21 | } 22 | } 23 | } 24 | } 25 | } 26 | 27 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) { 28 | gemm_cpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc); 29 | } 
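// Usage sketch (illustrative only), following the row-major convention used
// elsewhere in this repo: dajdense.cpp computes output(n,p) = input(n,m) * kernel(p,m)^T as
//
//   memset(C, 0, 4 * n * p);                          // gemm_cpu() never applies BETA
//   gemm(0, 1, n, p, m, 1.f, A, m, B, m, 1.f, C, p);
//
// TA/TB select transposition of A/B; lda/ldb/ldc are the leading dimensions
// (row strides) of the row-major buffers. Results are accumulated into C, so C
// must be zeroed (or hold the values to accumulate onto) before the call.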
30 | 31 | //#if (defined(__AVX__) && defined(__x86_64__)) || defined(_WIN64) 32 | #if defined(__AVX__) || defined(_WIN64) 33 | #define OSXSAVEFlag (1UL << 27) 34 | #define AVXFlag ((1UL << 28) | OSXSAVEFlag) 35 | #define FMAFlag ((1UL << 12) | AVXFlag | OSXSAVEFlag) 36 | #define CLMULFlag ((1UL << 1) | AVXFlag | OSXSAVEFlag) 37 | #define VAESFlag ((1UL << 25) | AVXFlag | OSXSAVEFlag) 38 | 39 | #include 40 | 41 | //#ifdef _WIN64 42 | #ifdef _WIN32 43 | #include 44 | #include 45 | #include 46 | #include 47 | #else // Linux GCC/Clang 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | void asm_cpuid(uint32_t* abcd, uint32_t eax) { 55 | uint32_t ebx = 0, edx = 0, ecx = 0; 56 | 57 | // EBX is saved to EDI and later restored 58 | __asm__("movl %%ebx, %%edi;" 59 | "cpuid;" 60 | "xchgl %%ebx, %%edi;" 61 | : "=D"(ebx), 62 | "+a"(eax), "+c"(ecx), "=d"(edx)); 63 | 64 | abcd[0] = eax; 65 | abcd[1] = ebx; 66 | abcd[2] = ecx; 67 | abcd[3] = edx; 68 | } 69 | #endif 70 | 71 | int simd_detect_x86(unsigned int idFeature) { 72 | uint32_t regs[4]; // EAX, EBX, ECX, EDX; 73 | #ifdef _WIN32 74 | __cpuid((int*) regs, 0); 75 | if (regs[0] > 1U) __cpuid((int*) regs, 1); 76 | #else 77 | __get_cpuid(0, ®s[0], ®s[1], ®s[2], ®s[3]); 78 | if (regs[0] > 1U) __get_cpuid(1, ®s[0], ®s[1], ®s[2], ®s[3]); 79 | #endif 80 | if ((regs[2] & idFeature) != idFeature) return 0; 81 | return 1; 82 | } 83 | 84 | int is_fma_avx() { 85 | static int result = -1; 86 | 87 | if (result == -1) { 88 | result = simd_detect_x86(AVXFlag); 89 | 90 | if (result == 1) { 91 | log_i(" used AVX"); 92 | } else { 93 | log_i(" not used AVX"); 94 | } 95 | } 96 | return result; 97 | } 98 | 99 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 100 | int i, j, k; 101 | 102 | if (is_fma_avx() == 1) { // AVX 103 | for (i = 0; i < M; ++i) { 104 | for (k = 0; k < K; ++k) { 105 | float A_PART = ALPHA * A[i * lda + k]; 106 | __m256 a256, b256, c256, result256; // AVX 107 | a256 = _mm256_set1_ps(A_PART); 108 | 109 | for (j = 0; j < N - 8; j += 8) { 110 | b256 = _mm256_loadu_ps(&B[k * ldb + j]); 111 | c256 = _mm256_loadu_ps(&C[i * ldc + j]); 112 | 113 | // FMA - Intel Haswell (2013), AMD Piledriver (2012) 114 | result256 = _mm256_fmadd_ps(a256, b256, c256); 115 | //result256 = _mm256_mul_ps(a256, b256); 116 | //result256 = _mm256_add_ps(result256, c256); 117 | 118 | _mm256_storeu_ps(&C[i * ldc + j], result256); 119 | } 120 | int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8; 121 | 122 | for (j = prev_end; j < N; ++j) 123 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 124 | } 125 | } 126 | } else { 127 | for (i = 0; i < M; ++i) { 128 | for (k = 0; k < K; ++k) { 129 | register float A_PART = ALPHA * A[i * lda + k]; 130 | 131 | for (j = 0; j < N; ++j) { 132 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 133 | } 134 | /* // SSE 135 | __m128 a128, b128, c128, result128; // SSE 136 | a128 = _mm_set1_ps(A_PART); 137 | for (j = 0; j < N - 4; j += 4) { 138 | b128 = _mm_loadu_ps(&B[k*ldb + j]); 139 | c128 = _mm_loadu_ps(&C[i*ldc + j]); 140 | //result128 = _mm_fmadd_ps(a128, b128, c128); 141 | result128 = _mm_mul_ps(a128, b128); 142 | result128 = _mm_add_ps(result128, c128); 143 | _mm_storeu_ps(&C[i*ldc + j], result128); 144 | } 145 | 146 | int prev_end = (N % 4 == 0) ? 
(N - 4) : (N / 4) * 4; 147 | for (j = prev_end; j < N; ++j){ 148 | C[i*ldc + j] += A_PART*B[k*ldb + j]; 149 | } 150 | */ 151 | } 152 | } 153 | } 154 | } 155 | #else 156 | 157 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 158 | int i, j, k; 159 | 160 | for (i = 0; i < M; ++i) { 161 | for (k = 0; k < K; ++k) { 162 | register float A_PART = ALPHA * A[i * lda + k]; 163 | 164 | for (j = 0; j < N; ++j) { 165 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 166 | } 167 | } 168 | } 169 | } 170 | 171 | #endif // __x86_64 172 | 173 | void gemm_nt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 174 | int i, j, k; 175 | 176 | for (i = 0; i < M; ++i) { 177 | for (j = 0; j < N; ++j) { 178 | register float sum = 0; 179 | 180 | for (k = 0; k < K; ++k) { 181 | sum += ALPHA * A[i * lda + k] * B[j * ldb + k]; 182 | } 183 | C[i * ldc + j] += sum; 184 | } 185 | } 186 | } 187 | 188 | void gemm_tn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 189 | int i, j, k; 190 | 191 | for (i = 0; i < M; ++i) { 192 | for (k = 0; k < K; ++k) { 193 | register float A_PART = ALPHA * A[k * lda + i]; 194 | 195 | for (j = 0; j < N; ++j) { 196 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 197 | } 198 | } 199 | } 200 | } 201 | 202 | void gemm_tt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 203 | int i, j, k; 204 | 205 | for (i = 0; i < M; ++i) { 206 | for (j = 0; j < N; ++j) { 207 | register float sum = 0; 208 | 209 | for (k = 0; k < K; ++k) { 210 | sum += ALPHA * A[i + k * lda] * B[k + j * ldb]; 211 | } 212 | C[i * ldc + j] += sum; 213 | } 214 | } 215 | } 216 | 217 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) { 218 | int i, j; 219 | 220 | /*for (i = 0; i < M; ++i) { 221 | for (j = 0; j < N; ++j) { 222 | C[i * ldc + j] *= BETA; 223 | } 224 | }*/ 225 | int t; 226 | 227 | #pragma omp parallel for 228 | for (t = 0; t < M; ++t) { 229 | if (!TA && !TB) { 230 | gemm_nn(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc); 231 | } else if (TA && !TB) { 232 | gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc); 233 | } else if (!TA && TB) { 234 | gemm_nt(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc); 235 | } else { 236 | gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc); 237 | } 238 | } 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /paddle/paddle_use_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "paddle_lite_factory_helper.h" 3 | 4 | USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); 5 | USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); 6 | USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); 7 | USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); 8 | USE_LITE_KERNEL(unsqueeze, kHost, kAny, kAny, def); 9 | USE_LITE_KERNEL(unsqueeze2, kHost, kAny, kAny, def); 10 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def); 11 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, bool_slice); 12 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, int32_slice); 13 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def_int64); 14 | USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def); 15 | USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); 16 | USE_LITE_KERNEL(fill_constant, kHost, kAny, kNCHW, def); 17 | 
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); 18 | USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); 19 | USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); 20 | USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def); 21 | USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def); 22 | USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def); 23 | USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def); 24 | USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def); 25 | USE_LITE_KERNEL(thresholded_relu, kARM, kFloat, kNCHW, def); 26 | USE_LITE_KERNEL(elu, kARM, kFloat, kNCHW, def); 27 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc); 28 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw); 29 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc); 30 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw); 31 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc); 32 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw); 33 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc); 34 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw); 35 | USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); 36 | USE_LITE_KERNEL(split, kARM, kInt64, kNCHW, def); 37 | USE_LITE_KERNEL(concat, kARM, kAny, kNCHW, def); 38 | USE_LITE_KERNEL(expand, kHost, kFloat, kAny, def); 39 | USE_LITE_KERNEL(expand, kHost, kInt32, kAny, def); 40 | USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); 41 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int32); 42 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int64); 43 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int32); 44 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int64); 45 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); 46 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_fp32); 47 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_int64); 48 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, fp32_to_int32); 49 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_fp32); 50 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); 51 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, fp32_to_int8); 52 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, int8_to_fp32); 53 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_int32); 54 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); 55 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); 56 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, fp32_to_int8); 57 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, int8_to_fp32); 58 | USE_LITE_KERNEL(calib_once, kARM, kInt64, kNCHW, int64_to_int32); 59 | USE_LITE_KERNEL(arg_max, kHost, kAny, kNCHW, fp32); 60 | USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); 61 | USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def); 62 | USE_LITE_KERNEL(multiclass_nms2, kHost, kFloat, kNCHW, def); 63 | USE_LITE_KERNEL(multiclass_nms3, kHost, kFloat, kNCHW, def); 64 | USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); 65 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); 66 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); 67 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); 68 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out); 69 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out); 70 | USE_LITE_KERNEL(box_coder, kHost, kFloat, kNCHW, def); 71 | USE_LITE_KERNEL(assign_value, kARM, kAny, kNCHW, def); 72 | USE_LITE_KERNEL(squeeze, kHost, kAny, kAny, def); 73 | USE_LITE_KERNEL(squeeze2, kHost, kAny, kAny, def); 74 | USE_LITE_KERNEL(relu_clipped, 
kARM, kFloat, kNCHW, def); 75 | USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def); 76 | USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def); 77 | USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def); 78 | USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def); 79 | USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def); 80 | USE_LITE_KERNEL(sqrt, kARM, kFloat, kNCHW, def); 81 | USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def); 82 | USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); 83 | USE_LITE_KERNEL(hard_swish, kARM, kFloat, kNCHW, def); 84 | USE_LITE_KERNEL(reciprocal, kARM, kFloat, kNCHW, def); 85 | USE_LITE_KERNEL(abs, kARM, kFloat, kNCHW, def); 86 | USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def); 87 | USE_LITE_KERNEL(range, kARM, kInt32, kNCHW, def); 88 | USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); 89 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); 90 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); 91 | USE_LITE_KERNEL(grid_sampler, kARM, kFloat, kNCHW, def); 92 | USE_LITE_KERNEL(instance_norm, kARM, kFloat, kNCHW, def); 93 | USE_LITE_KERNEL(stack, kHost, kFloat, kAny, def); 94 | USE_LITE_KERNEL(stack, kHost, kInt32, kAny, def); 95 | USE_LITE_KERNEL(lod_array_length, kHost, kAny, kAny, def); 96 | USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); 97 | USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); 98 | USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); 99 | USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); 100 | USE_LITE_KERNEL(scale, kARM, kInt32, kNCHW, def); 101 | USE_LITE_KERNEL(scale, kARM, kInt64, kNCHW, def); 102 | USE_LITE_KERNEL(arg_max, kARM, kAny, kNCHW, fp32); 103 | USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def); 104 | USE_LITE_KERNEL(range, kHost, kFloat, kAny, def); 105 | USE_LITE_KERNEL(range, kHost, kInt32, kAny, def); 106 | USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); 107 | USE_LITE_KERNEL(deformable_conv, kHost, kFloat, kNCHW, def); 108 | USE_LITE_KERNEL(affine_grid, kARM, kFloat, kNCHW, def); 109 | USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); 110 | USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); 111 | USE_LITE_KERNEL(elementwise_add, kARM, kInt32, kNCHW, def); 112 | USE_LITE_KERNEL(elementwise_add, kARM, kInt64, kNCHW, def); 113 | USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); 114 | USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def); 115 | USE_LITE_KERNEL(elementwise_sub, kARM, kInt32, kNCHW, def); 116 | USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def); 117 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt64, kNCHW, def); 118 | USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); 119 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt32, kNCHW, def); 120 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); 121 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kInt64, kNCHW, def); 122 | USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); 123 | USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); 124 | USE_LITE_KERNEL(elementwise_min, kARM, kFloat, kNCHW, def); 125 | USE_LITE_KERNEL(fusion_elementwise_min_activation, kARM, kFloat, kNCHW, def); 126 | USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def); 127 | USE_LITE_KERNEL(elementwise_div, kARM, kInt32, kNCHW, def); 128 | USE_LITE_KERNEL(elementwise_div, kARM, kInt64, kNCHW, def); 129 | USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def); 130 | USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, 
def); 131 | USE_LITE_KERNEL(elementwise_pow, kARM, kFloat, kNCHW, def); 132 | USE_LITE_KERNEL(elementwise_pow, kARM, kInt32, kNCHW, def); 133 | USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); 134 | USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); 135 | USE_LITE_KERNEL(bilinear_interp_v2, kARM, kFloat, kNCHW, def); 136 | USE_LITE_KERNEL(nearest_interp_v2, kARM, kFloat, kNCHW, def); 137 | USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); 138 | USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); 139 | USE_LITE_KERNEL(expand_as, kHost, kFloat, kAny, def); 140 | USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); 141 | USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def); 142 | USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); 143 | USE_LITE_KERNEL(sync_batch_norm, kARM, kFloat, kNCHW, def); 144 | USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); 145 | USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def); 146 | USE_LITE_KERNEL(cast, kARM, kAny, kNCHW, def); 147 | USE_LITE_KERNEL(fill_constant_batch_size_like, kHost, kAny, kNCHW, def); 148 | USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def); 149 | USE_LITE_KERNEL(stack, kARM, kInt32, kNCHW, def); -------------------------------------------------------------------------------- /paddle/paddle_image_preprocess.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include "lite/api/paddle_api.h" 21 | #include "lite/api/paddle_place.h" 22 | 23 | namespace paddle { 24 | namespace lite { 25 | namespace utils { 26 | namespace cv { 27 | typedef paddle::lite_api::Tensor Tensor; 28 | typedef paddle::lite_api::DataLayoutType LayoutType; 29 | // color enum 30 | enum ImageFormat { 31 | RGBA = 0, 32 | BGRA, 33 | RGB, 34 | BGR, 35 | GRAY, 36 | NV21 = 11, 37 | NV12, 38 | }; 39 | // flip enum 40 | enum FlipParam { 41 | XY = -1, // flip along the XY axis 42 | X = 0, // flip along the X axis 43 | Y // flip along the Y axis 44 | }; 45 | // transform param 46 | typedef struct { 47 | int ih; // input height 48 | int iw; // input width 49 | int oh; // outpu theight 50 | int ow; // output width 51 | FlipParam flip_param; // flip, support x, y, xy 52 | float rotate_param; // rotate, support 90, 180, 270 53 | } TransParam; 54 | 55 | class ImagePreprocess { 56 | public: 57 | /* 58 | * init 59 | * param srcFormat: input image color 60 | * param dstFormat: output image color 61 | * param param: input image parameter, egs: input size 62 | */ 63 | ImagePreprocess(ImageFormat srcFormat, 64 | ImageFormat dstFormat, 65 | TransParam param); 66 | 67 | /* 68 | * image color convert 69 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 70 | * BGR(RGB)and BGRA(RGBA) transform, 71 | * BGR(RGB)and RGB(BGR) transform, 72 | * BGR(RGB)and RGBA(BGRA) transform, 73 | * BGR(RGB) and GRAY transform, 74 | * BGRA(RGBA) and GRAY transform, 75 | * param src: input image data 76 | * param dst: output image data 77 | */ 78 | void image_convert(const uint8_t* src, uint8_t* dst); 79 | 80 | /* 81 | * image color convert 82 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 83 | * BGR(RGB)and BGRA(RGBA) transform, 84 | * BGR(RGB)and RGB(BGR) transform, 85 | * BGR(RGB)and RGBA(BGRA) transform, 86 | * BGR(RGB)and GRAY transform, 87 | * BGRA(RGBA) and GRAY transform, 88 | * param src: input image data 89 | * param dst: output image data 90 | * param srcFormat: input image image format support: GRAY, NV12(NV21), 91 | * BGR(RGB) and BGRA(RGBA) 92 | * param dstFormat: output image image format, support GRAY, BGR(RGB) and 93 | * BGRA(RGBA) 94 | */ 95 | void image_convert(const uint8_t* src, 96 | uint8_t* dst, 97 | ImageFormat srcFormat, 98 | ImageFormat dstFormat); 99 | 100 | /* 101 | * image color convert 102 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 103 | * BGR(RGB)and BGRA(RGBA) transform, 104 | * BGR(RGB)and RGB(BGR) transform, 105 | * BGR(RGB)and RGBA(BGRA) transform, 106 | * BGR(RGB)and GRAY transform, 107 | * BGRA(RGBA) and GRAY transform, 108 | * param src: input image data 109 | * param dst: output image data 110 | * param srcFormat: input image image format support: GRAY, NV12(NV21), 111 | * BGR(RGB) and BGRA(RGBA) 112 | * param dstFormat: output image image format, support GRAY, BGR(RGB) and 113 | * BGRA(RGBA) 114 | * param srcw: input image width 115 | * param srch: input image height 116 | */ 117 | void image_convert(const uint8_t* src, 118 | uint8_t* dst, 119 | ImageFormat srcFormat, 120 | ImageFormat dstFormat, 121 | int srcw, 122 | int srch); 123 | 124 | /* 125 | * image resize, use bilinear method 126 | * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: 127 | * NV12, NV21), 3-channel(egs: BGR), 4-channel(egs: BGRA) 128 | * param src: input image data 129 | * param dst: output image data 130 | */ 131 | void image_resize(const uint8_t* src, uint8_t* dst); 132 | 
133 | /* 134 | image resize, use bilinear method 135 | * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: 136 | NV12, NV21), 3-channel image(egs: BGR), 4-channel image(egs: BGRA) 137 | * param src: input image data 138 | * param dst: output image data 139 | * param srcw: input image width 140 | * param srch: input image height 141 | * param dstw: output image width 142 | * param dsth: output image height 143 | */ 144 | void image_resize(const uint8_t* src, 145 | uint8_t* dst, 146 | ImageFormat srcFormat, 147 | int srcw, 148 | int srch, 149 | int dstw, 150 | int dsth); 151 | 152 | /* 153 | * image Rotate 154 | * support 90, 180 and 270 Rotate process 155 | * color format support 1-channel image, 3-channel image and 4-channel image 156 | * param src: input image data 157 | * param dst: output image data 158 | */ 159 | void image_rotate(const uint8_t* src, uint8_t* dst); 160 | 161 | /* 162 | * image Rotate 163 | * support 90, 180 and 270 Rotate process 164 | * color format support 1-channel image, 3-channel image and 4-channel image 165 | * param src: input image data 166 | * param dst: output image data 167 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) 168 | * param srcw: input image width 169 | * param srch: input image height 170 | * param degree: Rotate degree, support 90, 180 and 270 171 | */ 172 | void image_rotate(const uint8_t* src, 173 | uint8_t* dst, 174 | ImageFormat srcFormat, 175 | int srcw, 176 | int srch, 177 | float degree); 178 | 179 | /* 180 | * image Flip 181 | * support X, Y and XY flip process 182 | * color format support 1-channel image, 3-channel image and 4-channel image 183 | * param src: input image data 184 | * param dst: output image data 185 | */ 186 | void image_flip(const uint8_t* src, uint8_t* dst); 187 | 188 | /* 189 | * image Flip 190 | * support X, Y and XY flip process 191 | * color format support 1-channel image, 3-channel image and 4-channel image 192 | * param src: input image data 193 | * param dst: output image data 194 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) 195 | * param srcw: input image width 196 | * param srch: input image height 197 | * param flip_param: flip parameter, support X, Y and XY 198 | */ 199 | void image_flip(const uint8_t* src, 200 | uint8_t* dst, 201 | ImageFormat srcFormat, 202 | int srcw, 203 | int srch, 204 | FlipParam flip_param); 205 | 206 | /* 207 | * change image data to tensor data 208 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC 209 | * and 210 | * NCHW 211 | * param src: input image data 212 | * param dstTensor: output tensor data 213 | * param layout: output tensor layout,support NHWC and NCHW 214 | * param means: means of image 215 | * param scales: scales of image 216 | */ 217 | void image_to_tensor(const uint8_t* src, 218 | Tensor* dstTensor, 219 | LayoutType layout, 220 | float* means, 221 | float* scales); 222 | 223 | /* 224 | * change image data to tensor data 225 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC 226 | * and 227 | * NCHW 228 | * param src: input image data 229 | * param dstTensor: output tensor data 230 | * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) 231 | * param srcw: input image width 232 | * param srch: input image height 233 | * param layout: output tensor layout,support NHWC and NCHW 234 | * param means: means of image 235 | * param scales: scales of image 236 | */ 237 | void image_to_tensor(const uint8_t* src, 238 
| Tensor* dstTensor, 239 | ImageFormat srcFormat, 240 | int srcw, 241 | int srch, 242 | LayoutType layout, 243 | float* means, 244 | float* scales); 245 | 246 | /* 247 | * image crop process 248 | * color format support 1-channel image, 3-channel image and 4-channel image 249 | * param src: input image data 250 | * param dst: output image data 251 | */ 252 | void image_crop(const uint8_t* src, 253 | uint8_t* dst, 254 | ImageFormat srcFormat, 255 | int srcw, 256 | int srch, 257 | int left_x, 258 | int left_y, 259 | int dstw, 260 | int dsth); 261 | 262 | private: 263 | ImageFormat srcFormat_; 264 | ImageFormat dstFormat_; 265 | TransParam transParam_; 266 | }; 267 | } // namespace cv 268 | } // namespace utils 269 | } // namespace lite 270 | } // namespace paddle 271 | -------------------------------------------------------------------------------- /paddle/paddle_place.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include 17 | #include 18 | 19 | // Generic helper definitions for shared library support 20 | #if defined _WIN32 || defined __CYGWIN__ 21 | #define PADDLE_LITE_HELPER_DLL_IMPORT __declspec(dllimport) 22 | #define PADDLE_LITE_HELPER_DLL_EXPORT __declspec(dllexport) 23 | #define PADDLE_LITE_HELPER_DLL_LOCAL 24 | #else 25 | #if __GNUC__ >= 4 26 | #define PADDLE_LITE_HELPER_DLL_IMPORT __attribute__((visibility("default"))) 27 | #define PADDLE_LITE_HELPER_DLL_EXPORT __attribute__((visibility("default"))) 28 | #else 29 | #define PADDLE_LITE_HELPER_DLL_IMPORT 30 | #define PADDLE_LITE_HELPER_DLL_EXPORT 31 | #endif 32 | #endif 33 | 34 | #ifdef LITE_ON_TINY_PUBLISH 35 | #define LITE_API PADDLE_LITE_HELPER_DLL_EXPORT 36 | #define LITE_API_IMPORT PADDLE_LITE_HELPER_DLL_IMPORT 37 | #else 38 | #define LITE_API 39 | #define LITE_API_IMPORT 40 | #endif 41 | 42 | namespace paddle { 43 | namespace lite_api { 44 | 45 | enum class TargetType : int { 46 | kUnk = 0, 47 | kHost = 1, 48 | kX86 = 2, 49 | kCUDA = 3, 50 | kARM = 4, 51 | kOpenCL = 5, 52 | kAny = 6, // any target 53 | kFPGA = 7, 54 | kNPU = 8, 55 | kXPU = 9, 56 | kBM = 10, 57 | kMLU = 11, 58 | kRKNPU = 12, 59 | kAPU = 13, 60 | kHuaweiAscendNPU = 14, 61 | kImaginationNNA = 15, 62 | NUM = 16, // number of fields. 63 | }; 64 | enum class PrecisionType : int { 65 | kUnk = 0, 66 | kFloat = 1, 67 | kInt8 = 2, 68 | kInt32 = 3, 69 | kAny = 4, // any precision 70 | kFP16 = 5, 71 | kBool = 6, 72 | kInt64 = 7, 73 | kInt16 = 8, 74 | kUInt8 = 9, 75 | kFP64 = 10, 76 | NUM = 11, // number of fields. 77 | }; 78 | enum class DataLayoutType : int { 79 | kUnk = 0, 80 | kNCHW = 1, 81 | kNHWC = 3, 82 | kImageDefault = 4, // for opencl image2d 83 | kImageFolder = 5, // for opencl image2d 84 | kImageNW = 6, // for opencl image2d 85 | kAny = 2, // any data layout 86 | NUM = 7, // number of fields. 
87 | }; 88 | 89 | typedef enum { 90 | LITE_POWER_HIGH = 0, 91 | LITE_POWER_LOW = 1, 92 | LITE_POWER_FULL = 2, 93 | LITE_POWER_NO_BIND = 3, 94 | LITE_POWER_RAND_HIGH = 4, 95 | LITE_POWER_RAND_LOW = 5 96 | } PowerMode; 97 | 98 | typedef enum { 99 | CL_TUNE_NONE = 0, 100 | CL_TUNE_RAPID = 1, 101 | CL_TUNE_NORMAL = 2, 102 | CL_TUNE_EXHAUSTIVE = 3 103 | } CLTuneMode; 104 | 105 | typedef enum { 106 | CL_PRECISION_AUTO = 0, 107 | CL_PRECISION_FP32 = 1, 108 | CL_PRECISION_FP16 = 2 109 | } CLPrecisionType; 110 | 111 | typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion; 112 | 113 | enum class ActivationType : int { 114 | kIndentity = 0, 115 | kRelu = 1, 116 | kRelu6 = 2, 117 | kPRelu = 3, 118 | kLeakyRelu = 4, 119 | kSigmoid = 5, 120 | kTanh = 6, 121 | kSwish = 7, 122 | kExp = 8, 123 | kAbs = 9, 124 | kHardSwish = 10, 125 | kReciprocal = 11, 126 | kThresholdedRelu = 12, 127 | kElu = 13, 128 | kHardSigmoid = 14, 129 | kLog = 15, 130 | kSigmoid_v2 = 16, 131 | kTanh_v2 = 17, 132 | NUM = 18, 133 | }; 134 | 135 | static size_t PrecisionTypeLength(PrecisionType type) { 136 | switch (type) { 137 | case PrecisionType::kFloat: 138 | return 4; 139 | case PrecisionType::kFP64: 140 | return 8; 141 | case PrecisionType::kUInt8: 142 | return 1; 143 | case PrecisionType::kInt8: 144 | return 1; 145 | case PrecisionType::kInt32: 146 | return 4; 147 | case PrecisionType::kInt64: 148 | return 8; 149 | case PrecisionType::kFP16: 150 | return 2; 151 | case PrecisionType::kInt16: 152 | return 2; 153 | default: 154 | return 0; 155 | } 156 | } 157 | 158 | enum class QuantType : int { 159 | QUANT_INT8, 160 | QUANT_INT16, 161 | }; 162 | 163 | template 164 | struct PrecisionTypeTrait { 165 | constexpr static PrecisionType Type() { return PrecisionType::kUnk; } 166 | }; 167 | 168 | #define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \ 169 | callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type); 170 | 171 | #define _ForEachPrecisionType(callback) \ 172 | _ForEachPrecisionTypeHelper(callback, bool, kBool); \ 173 | _ForEachPrecisionTypeHelper(callback, float, kFloat); \ 174 | _ForEachPrecisionTypeHelper(callback, double, kFP64); \ 175 | _ForEachPrecisionTypeHelper(callback, uint8_t, kUInt8); \ 176 | _ForEachPrecisionTypeHelper(callback, int8_t, kInt8); \ 177 | _ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \ 178 | _ForEachPrecisionTypeHelper(callback, int, kInt32); \ 179 | _ForEachPrecisionTypeHelper(callback, int64_t, kInt64); 180 | 181 | #define DefinePrecisionTypeTrait(cpp_type, precision_type) \ 182 | template <> \ 183 | struct PrecisionTypeTrait { \ 184 | constexpr static PrecisionType Type() { return precision_type; } \ 185 | } 186 | 187 | _ForEachPrecisionType(DefinePrecisionTypeTrait); 188 | 189 | #undef _ForEachPrecisionTypeHelper 190 | #undef _ForEachPrecisionType 191 | #undef DefinePrecisionTypeTrait 192 | 193 | #define TARGET(item__) paddle::lite_api::TargetType::item__ 194 | #define PRECISION(item__) paddle::lite_api::PrecisionType::item__ 195 | #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ 196 | 197 | const std::string& ActivationTypeToStr(ActivationType act); 198 | 199 | const std::string& TargetToStr(TargetType target); 200 | 201 | const std::string& PrecisionToStr(PrecisionType precision); 202 | 203 | const std::string& DataLayoutToStr(DataLayoutType layout); 204 | 205 | const std::string& TargetRepr(TargetType target); 206 | 207 | const std::string& PrecisionRepr(PrecisionType precision); 208 | 209 | const std::string& 
DataLayoutRepr(DataLayoutType layout); 210 | 211 | // Get a set of all the elements represented by the target. 212 | std::set ExpandValidTargets(TargetType target = TARGET(kAny)); 213 | 214 | // Get a set of all the elements represented by the precision. 215 | std::set ExpandValidPrecisions( 216 | PrecisionType precision = PRECISION(kAny)); 217 | 218 | // Get a set of all the elements represented by the layout. 219 | std::set ExpandValidLayouts( 220 | DataLayoutType layout = DATALAYOUT(kAny)); 221 | 222 | /* 223 | * Place specifies the execution context of a Kernel or input/output for a 224 | * kernel. It is used to make the analysis of the MIR more clear and accurate. 225 | */ 226 | struct LITE_API Place { 227 | TargetType target{TARGET(kUnk)}; 228 | PrecisionType precision{PRECISION(kUnk)}; 229 | DataLayoutType layout{DATALAYOUT(kUnk)}; 230 | int16_t device{0}; // device ID 231 | 232 | Place() = default; 233 | Place(TargetType target, 234 | PrecisionType precision = PRECISION(kFloat), 235 | DataLayoutType layout = DATALAYOUT(kNCHW), 236 | int16_t device = 0) 237 | : target(target), precision(precision), layout(layout), device(device) {} 238 | 239 | bool is_valid() const { 240 | return target != TARGET(kUnk) && precision != PRECISION(kUnk) && 241 | layout != DATALAYOUT(kUnk); 242 | } 243 | 244 | size_t hash() const; 245 | 246 | bool operator==(const Place& other) const { 247 | return target == other.target && precision == other.precision && 248 | layout == other.layout && device == other.device; 249 | } 250 | 251 | bool operator!=(const Place& other) const { return !(*this == other); } 252 | 253 | friend bool operator<(const Place& a, const Place& b); 254 | 255 | std::string DebugString() const; 256 | }; 257 | 258 | } // namespace lite_api 259 | } // namespace paddle 260 | -------------------------------------------------------------------------------- /src/dajtensor.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajtensor.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | Tensor::Tensor() { 8 | _val_ = nullptr; 9 | span = 0; 10 | releasable = true; 11 | 12 | #ifdef TRACE_MEMORY_LEAK 13 | push_tensor_trace(this); 14 | #endif 15 | } 16 | 17 | Tensor::~Tensor() { 18 | if (releasable && _val_) free(_val_); 19 | 20 | #ifdef TRACE_MEMORY_LEAK 21 | pop_tensor_trace(this); 22 | #endif 23 | } 24 | 25 | void Tensor::reshape(vector* shape) { 26 | exit_if(span != get_span(shape), "unable to reshape from %s to %s", 27 | get_shape_str(&this->shape).c_str(), get_shape_str(shape).c_str()); 28 | this->shape = *shape; 29 | } 30 | 31 | void Tensor::reshape(uint dim1, ...) { 32 | if (dim1 == END_DIM) return; 33 | 34 | vector running_shape; 35 | running_shape.push_back(dim1); 36 | 37 | va_list ap; 38 | va_start(ap, dim1); 39 | uint adim = va_arg(ap, uint); 40 | 41 | while (adim != END_DIM) { 42 | running_shape.push_back(adim); 43 | adim = va_arg(ap, uint); 44 | } 45 | va_end(ap); 46 | reshape(&running_shape); 47 | } 48 | 49 | bool Tensor::is_shape(vector* shape) { 50 | if (this->shape.size() != shape->size()) return false; 51 | 52 | for (uint i = 0; i < shape->size(); ++i) { 53 | if (this->shape[i] != shape->at(i)) return false; 54 | } 55 | return true; 56 | } 57 | 58 | bool Tensor::is_shape(uint dim1, ...) 
{ 59 | if (dim1 != END_DIM) { 60 | if (shape.empty()) return false; 61 | if (shape[0] != dim1) return false; 62 | } else if (!shape.empty()) return false; 63 | 64 | va_list ap; 65 | va_start(ap, dim1); 66 | uint adim = va_arg(ap, uint); 67 | 68 | for (uint i = 1; i < shape.size(); ++i) { 69 | if (shape[i] != adim) return false; 70 | adim = va_arg(ap, uint); 71 | } 72 | if (adim != END_DIM) return false; 73 | va_end(ap); 74 | return true; 75 | } 76 | 77 | void Tensor::set_releasable(bool releasable) { 78 | this->releasable = releasable; 79 | } 80 | 81 | void* Tensor::_init_(Tensor* tensor, bool copy_val) { 82 | exit_if(!tensor, "cannot clone tensor from empty tensor"); 83 | shape = tensor->shape; 84 | span = get_span(&shape); 85 | 86 | if (copy_val) { 87 | _val_ = malloc(4 * span); 88 | memcpy(_val_, tensor->_val_, 4 * span); 89 | } else { 90 | _val_ = tensor->_val_; 91 | } 92 | return _val_; 93 | } 94 | 95 | void* Tensor::_init_(vector* shape, void* val, bool copy_val) { 96 | this->shape = *shape; 97 | span = get_span(shape); 98 | 99 | if (val && copy_val) { 100 | _val_ = malloc(4 * span); 101 | memcpy(_val_, val, 4 * span); 102 | } else if (!val) { 103 | _val_ = malloc(4 * span); 104 | } else { 105 | _val_ = val; 106 | } 107 | return _val_; 108 | } 109 | 110 | void* Tensor::_init_(void* val, bool copy_val, uint dim1, va_list ap) { 111 | if (dim1 == END_DIM) return nullptr; 112 | vector running_shape; 113 | 114 | running_shape.push_back(dim1); 115 | uint adim = va_arg(ap, uint); 116 | 117 | while (adim != END_DIM) { 118 | running_shape.push_back(adim); 119 | exit_if(running_shape.size() == MAX_TENSOR_DIM, 120 | "tensor shape with too many dimensions (%s) : did you forget to end with END_DIM?", 121 | get_shape_str(&running_shape).c_str()); 122 | adim = va_arg(ap, uint); 123 | } 124 | return _init_(&running_shape, val, copy_val); 125 | } 126 | 127 | void* Tensor::_init_(ByteStream* stream) { 128 | _read_meta_(stream); 129 | _val_ = malloc(4 * span); 130 | stream->read(_val_, 4, span); 131 | return _val_; 132 | } 133 | 134 | void Tensor::_read_meta_(ByteStream* stream) { 135 | unsigned char len = 0; 136 | stream->read(&len, 1, 1); 137 | 138 | for (unsigned char i = 0; i < len; ++i) { 139 | unsigned int dim = 0; 140 | stream->read(&dim, 4, 1); 141 | shape.push_back(dim); 142 | } 143 | span = get_span(&shape); 144 | } 145 | 146 | void Tensor::_write_meta_(ByteStream* stream) { 147 | unsigned char len = (unsigned char) shape.size(); 148 | stream->write(&len, 1, 1); 149 | 150 | for (unsigned char i = 0; i < len; ++i) { 151 | stream->write(&shape[i], 4, 1); 152 | } 153 | } 154 | 155 | void Tensor::_write_val_(ByteStream* stream) { 156 | stream->write(_val_, 4, span); 157 | } 158 | 159 | void Tensor::_save_(ByteStream* stream) { 160 | _write_meta_(stream); 161 | _write_val_(stream); 162 | } 163 | 164 | ITensor::ITensor() : Tensor() { 165 | val = nullptr; 166 | } 167 | 168 | ITensor::ITensor(ITensor* tensor, bool copy_val) : ITensor() { 169 | this->val = (int*) _init_(tensor, copy_val); 170 | } 171 | 172 | ITensor::ITensor(vector* shape, int* val, bool copy_val) : ITensor() { 173 | this->val = (int*) _init_(shape, val, copy_val); 174 | } 175 | 176 | ITensor::ITensor(int* val, bool copy_val, uint dim1, ...) : ITensor() { 177 | va_list ap; 178 | va_start(ap, dim1); 179 | this->val = (int*) _init_(val, copy_val, dim1, ap); 180 | va_end(ap); 181 | } 182 | 183 | ITensor::ITensor(uint dim1, ...) 
: ITensor() { 184 | va_list ap; 185 | va_start(ap, dim1); 186 | this->val = (int*) _init_(nullptr, false, dim1, ap); 187 | va_end(ap); 188 | }; 189 | 190 | ITensor::ITensor(ByteStream* stream) : ITensor() { 191 | char compressed = 0; 192 | stream->read(&compressed, 1, 1); 193 | 194 | if (compressed) { 195 | _read_meta_(stream); 196 | short sh = 0; 197 | _val_ = val = (int*) malloc(span * 4); 198 | 199 | for (int* vp = val; vp < val + span; ++vp) { 200 | stream->read(&sh, 2, 1); 201 | *vp = sh; 202 | } 203 | } else { 204 | val = (int*) _init_(stream); 205 | } 206 | } 207 | 208 | void ITensor::save(ByteStream* stream, bool compressed) { 209 | char flag = compressed ? 1 : 0; 210 | stream->write(&flag, 1, 1); 211 | 212 | if (compressed) { 213 | _write_meta_(stream); 214 | short sh = 0; 215 | 216 | for (int* vp = val; vp < val + span; ++vp) { 217 | sh = (short) *vp; 218 | stream->write(&sh, 2, 1); 219 | } 220 | } else { 221 | _save_(stream); 222 | } 223 | } 224 | 225 | int ITensor::compare(ITensor* tensor) { 226 | return compare(tensor->val, tensor->span); 227 | } 228 | 229 | int ITensor::compare(int* val, uint len) { 230 | uint comp_len = MIN(span, len); 231 | int max_abs = 0; 232 | int* vp1 = this->val; 233 | int* vp2 = val; 234 | 235 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) { 236 | int d = abs(*vp1 - *vp2); 237 | if (d > max_abs) max_abs = d; 238 | } 239 | return max_abs; 240 | } 241 | 242 | int ITensor::get_max() { 243 | return dajnn::get_max(val, span); 244 | } 245 | 246 | int ITensor::get_min() { 247 | return dajnn::get_min(val, span); 248 | } 249 | 250 | FTensor::FTensor() : Tensor() { 251 | val = nullptr; 252 | } 253 | 254 | FTensor::FTensor(ITensor* tensor) { 255 | shape = tensor->shape; 256 | span = get_span(&shape); 257 | 258 | _val_ = val = (float*) malloc(span * 4); 259 | int* tp = tensor->val; 260 | 261 | for (float* vp = val; vp < val + span; ++vp, ++tp) { 262 | *vp = (float) *tp; 263 | } 264 | } 265 | 266 | FTensor::FTensor(FTensor* tensor, bool copy_val) : FTensor() { 267 | this->val = (float*) _init_(tensor, copy_val); 268 | } 269 | 270 | FTensor::FTensor(vector* shape, float* val, bool copy_val) : FTensor() { 271 | this->val = (float*) _init_(shape, val, copy_val); 272 | } 273 | 274 | FTensor::FTensor(float* val, bool copy_val, uint dim1, ...) : FTensor() { 275 | va_list ap; 276 | va_start(ap, dim1); 277 | this->val = (float*) _init_(val, copy_val, dim1, ap); 278 | va_end(ap); 279 | } 280 | 281 | FTensor::FTensor(uint dim1, ...) : FTensor() { 282 | va_list ap; 283 | va_start(ap, dim1); 284 | this->val = (float*) _init_(nullptr, false, dim1, ap); 285 | va_end(ap); 286 | } 287 | 288 | FTensor::FTensor(ByteStream* stream) : FTensor() { 289 | char compressed = 0; 290 | stream->read(&compressed, 1, 1); 291 | 292 | if (compressed) { 293 | _read_meta_(stream); 294 | 295 | float min_v = 0, max_v = 0; 296 | short sh = 0; 297 | 298 | stream->read(&min_v, 4, 1); 299 | stream->read(&max_v, 4, 1); 300 | _val_ = val = (float*) malloc(span * 4); 301 | 302 | for (float* vp = val; vp < val + span; ++vp) { 303 | stream->read(&sh, 2, 1); 304 | *vp = min_v + (max_v - min_v) * (1 + (float) sh / SHRT_MAX) / 2; 305 | } 306 | } else { 307 | val = (float*) _init_(stream); 308 | } 309 | } 310 | 311 | void FTensor::save(ByteStream* stream, bool compressed) { 312 | char flag = compressed ? 
1 : 0; 313 | stream->write(&flag, 1, 1); 314 | 315 | if (compressed) { 316 | _write_meta_(stream); 317 | 318 | float min_v = get_min(); 319 | float max_v = get_max(); 320 | short sh = 0; 321 | 322 | stream->write(&min_v, 4, 1); 323 | stream->write(&max_v, 4, 1); 324 | 325 | for (float* vp = val; vp < val + span; ++vp) { 326 | sh = (short) ((2 * (*vp - min_v) / (max_v - min_v) - 1) * SHRT_MAX); 327 | stream->write(&sh, 2, 1); 328 | } 329 | } else { 330 | _save_(stream); 331 | } 332 | } 333 | 334 | void FTensor::print(uint start, uint end) { 335 | if (start == END_DIM) start = 0; 336 | if (end == END_DIM) end = span; 337 | 338 | for (uint i = start; i < end; ++i) { 339 | printf("%.8f,", val[i]); 340 | } 341 | } 342 | 343 | float FTensor::compare(FTensor* tensor) { 344 | return compare(tensor->val, tensor->span); 345 | } 346 | 347 | float FTensor::compare(float* val, uint len) { 348 | uint comp_len = MIN(span, len); 349 | float max_abs = 0; 350 | float* vp1 = this->val; 351 | float* vp2 = val; 352 | 353 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) { 354 | float d = fabsf(*vp1 - *vp2); 355 | if (d > max_abs) max_abs = d; 356 | } 357 | return max_abs; 358 | } 359 | 360 | float FTensor::get_max() { 361 | return dajnn::get_max(val, span); 362 | } 363 | 364 | float FTensor::get_min() { 365 | return dajnn::get_min(val, span); 366 | } 367 | 368 | ByteStream::ByteStream() { 369 | buff = nullptr; 370 | fp = nullptr; 371 | pointer = 0; 372 | } 373 | 374 | ByteStream::ByteStream(const void* buff) : ByteStream() { 375 | this->buff = (const char*) buff; 376 | } 377 | 378 | ByteStream::ByteStream(FILE* fp) : ByteStream() { 379 | this->fp = fp; 380 | } 381 | 382 | string ByteStream::read_str() { 383 | string str; 384 | char t = 0; 385 | 386 | for (uint i = 0; i < MAX_MODEL_STR; ++i) { 387 | if (!read(&t, 1, 1)) break; 388 | if (!t) break; 389 | str += t; 390 | } 391 | return str; 392 | } 393 | 394 | uint ByteStream::read(void* dst, int ele_size, int ele_count) { 395 | if (buff) { 396 | int len = ele_size * ele_count; 397 | memcpy(dst, &buff[pointer], len); 398 | pointer += len; 399 | return ele_count; 400 | } else if (fp) { 401 | return (uint) fread(dst, ele_size, ele_count, fp); 402 | } else { 403 | return 0; 404 | } 405 | } 406 | 407 | void ByteStream::write(void* src, int ele_size, int ele_count) { 408 | if (buff) { 409 | int len = ele_size * ele_count; 410 | memcpy((char*) &buff[pointer], src, len); 411 | pointer += len; 412 | } else if (fp) { 413 | fwrite(src, ele_size, ele_count, fp); 414 | } 415 | } 416 | 417 | int ByteStream::seek() { 418 | return pointer; 419 | } 420 | 421 | } 422 | -------------------------------------------------------------------------------- /paddle/paddle_api_2.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
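// (Editorial sketch, not part of the original header.) The LITE_API functions
// declared below are thin raw-pointer wrappers over Paddle-Lite ARM math
// kernels: they take plain float buffers plus explicit sizes, and most take
// trailing `cls` and `ths` integers, which appear to select the CPU cluster
// and thread count. A minimal, hypothetical example of the calling style,
// using only the element-wise and activation wrappers whose semantics follow
// directly from their names and signatures:
//
//   float a[4]   = {1.f, -2.f, 3.f, -4.f};
//   float b[4]   = {0.5f, 0.5f, 0.5f, 0.5f};
//   float sum[4], act[4];
//   paddle_elementwise_add(a, b, sum, 4);          // sum[i] = a[i] + b[i]
//   paddle_act_relu(sum, act, 4, /*threads=*/1);   // act[i] = max(sum[i], 0)
//
// The dense and convolution wrappers (paddle_fccompute, paddle_conv2d, ...)
// follow the same raw-buffer style, with shapes passed as separate integers.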
14 | 15 | /* 16 | * This file defines PaddlePredictor, the api for lite. It supports multiple 17 | * hardware including ARM, X86, OpenCL, CUDA and so on. 18 | */ 19 | 20 | #ifndef PADDLE_LITE_API_2_H_ // NOLINT 21 | #define PADDLE_LITE_API_2_H_ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "paddle_place.h" // NOLINT 28 | #include "paddle_api.h" 29 | 30 | 31 | namespace paddle { 32 | namespace lite_api { 33 | 34 | #ifdef LITE_WITH_ARM 35 | 36 | LITE_API void paddle_DeviceInit(); 37 | LITE_API void paddle_clip_kernel_fp32(const float* input, int64_t num, float min, float max, float* output); 38 | LITE_API void paddle_elementwise_mul(const float* dinx, const float* diny, float* dout, int num); 39 | LITE_API void paddle_elementwise_div(const float* dinx, const float* diny, float* dout, int num); 40 | LITE_API void paddle_elementwise_add(const float* dinx, const float* diny, float* dout, int num); 41 | LITE_API void paddle_elementwise_sub(const float* dinx, const float* diny, float* dout, int num); 42 | LITE_API void paddle_elementwise_pow(const float* dinx, const float* diny, float* dout, int num); 43 | LITE_API void paddle_elementwise_max(const float* dinx, const float* diny, float* dout,int num); 44 | LITE_API void paddle_act_relu(const float* din, float* dout, int size, int threads); 45 | LITE_API void paddle_act_sigmoid(const float* din, float* dout, int size, int threads); 46 | LITE_API void act_tanh(const float* din, float* dout, int size, int threads); 47 | LITE_API void act_log(const float* din, float* dout, int size, int threads); 48 | LITE_API void act_exp(const float* din, float* dout, int size, int threads); 49 | 50 | // New added activate 51 | LITE_API void act_leakyrelu(const float* din, float* dout, int size, float alpha, int threads); 52 | LITE_API void act_sqrt(const float* din, float* dout, int size, int threads); 53 | LITE_API void act_softmax(const float* din, float* dout, int dims, int axis_num); 54 | 55 | 56 | LITE_API void scale(const float* din, float* dout, int num, float scale, float bias); 57 | LITE_API void scale(const int* din, int* dout, int num, int scale, int bias); 58 | LITE_API void scale(const float* din, 59 | float* dout, 60 | int outer_dim, 61 | int scale_dim, 62 | int inner_dim, 63 | const float* scale_data, 64 | const float* bias_data); 65 | LITE_API void scale(const float* din, 66 | float* dout, 67 | int outer_dim, 68 | int scale_dim, 69 | const float* scale_data, 70 | const float* bias_data); 71 | 72 | LITE_API void paddle_matmul(const int M, const int N, const int K, const float* X, const float* W, float* Y, int cls=0, int ths=1); 73 | LITE_API void paddle_fccompute(const int M, const int N, const int K, 74 | const float* X, const float* W, float* Y, 75 | const float* bias = nullptr, ActivationType activationtype = ActivationType::kIndentity, 76 | int cls=0, int ths=1); 77 | 78 | LITE_API void paddle_matmul_quantize(const int M, const int N, const int K, 79 | const int8_t* X, const float xscale, 80 | const int8_t* W, const float wscale, float* Y); 81 | 82 | LITE_API void paddle_conv1d( 83 | int batches, 84 | int channels, int xlen, float* indata, 85 | int filters, int kernelsize, float *kerneldata, 86 | float*outdata, float* bias=NULL, 87 | int padding = 0, int dilation = 1, int stride = 1, 88 | int flag_act = 0, float leaky_relu_scale = 0.1, 89 | int cls = 1, int ths = 2); 90 | 91 | LITE_API void paddle_conv2d( 92 | int batches, 93 | int x_h, int x_w, int channels, float* indata, 94 | int filters, int kernel_h, 
int kernel_w, float* kerneldata, 95 | float* outdata, float* bias=NULL, 96 | int padding_h = 0, int padding_w = 0, 97 | int dilation_h = 1, int dilation_w = 1, 98 | int stride_h = 1, int stride_w = 1, 99 | int flag_act = 0, float leaky_relu_scale = 0.1, 100 | int cls = 1, int ths = 2); 101 | 102 | LITE_API void paddle_conv( 103 | std::vectorindatashape, float* indata, 104 | std::vectorkernelshape, float *kerneldata, 105 | std::vectoroutdatashape, float*outdata, 106 | bool flag_bias, float*biasdata, 107 | std::vector pad, 108 | std::vector dilation, 109 | std::vector stride, 110 | int flag_act, float leaky_relu_scale, int cls, int ths); 111 | LITE_API void paddle_matrix_norm_row(const float* x_data, 112 | const float* scale_data, 113 | const float* bias_data, 114 | float* out_data, 115 | float* mean_out, 116 | float* var_out, 117 | float epsilon, 118 | int batch_size, 119 | int feature_size); 120 | LITE_API void paddle_mean_var(const float* x_data, 121 | float* mean_out, 122 | float* var_out, 123 | float epsilon, 124 | int batch_size, 125 | int feature_size); 126 | // LITE_API void paddle_conv1d_int( 127 | // int channels, int xlen, int8_t* indata, 128 | // int filters, int kernelsize, int8_t *kerneldata, 129 | // float *outdata, float* bias, float input_scale, float weight_scale, 130 | // int padding=0, int dilation=1, int stride=1, 131 | // int flag_act=0, float leaky_relu_scale=0.1, 132 | // int cls=0, int ths=1); 133 | 134 | // LITE_API void paddle_conv_int( 135 | // std::vectorindatashape, int8_t* indata, 136 | // std::vectorkernelshape, int8_t *kerneldata, 137 | // std::vectoroutdatashape, float*outdata, 138 | // bool flag_bias, float*biasdata,float input_scale, float weight_scale, 139 | // std::vector pad, 140 | // std::vector dilation, 141 | // std::vector stride, int flag_act, float leaky_relu_scale, int cls=0, int ths=1); 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | LITE_API void paddle_conv_transpose1d( 154 | int channels, int xlen, float* indata, 155 | int filters, int kernelsize, float* kerneldata, 156 | float* outdata, 157 | int padding = 0, int dilation = 1, int stride = 1, 158 | int flag_act = 0, float leaky_relu_scale = 0.1, 159 | int cls = 1, int ths = 2 160 | ); 161 | 162 | 163 | LITE_API void paddle_conv_transpose2d( 164 | int channels, int x_h, int x_w, float* indata, 165 | int filters, int kernel_h, int kernel_w, float* kerneldata, 166 | float* outdata, 167 | int padding_h=0, int padding_w=0, 168 | int dilation_h=1, int dilation_w=1, 169 | int stride_h=1, int stride_w=1, 170 | int flag_act=0, float leaky_relu_scale=0.1, 171 | int cls=1, int ths=2); 172 | LITE_API void paddle_conv_transpose( 173 | std::vectorindatashape, float* indata, 174 | std::vectorkernelshape, float* kerneldata, 175 | std::vectoroutdatashape, float* outdata, 176 | std::vector pad, 177 | std::vector dilation, 178 | std::vector stride, int flag_act, float leaky_relu_scale, int cls, int ths); 179 | 180 | LITE_API void paddle_layernorm1d(float* x, float* weight, float* bias, float* outdata, 181 | float* meandata, float* vardata, int batch_size, int features); 182 | 183 | LITE_API void paddle_batchnorm1d(float* x, float* outdata, 184 | float* scale, float* bias, float* mean_data, float* var_data, 185 | int channels, int xlen, int cls = 1, int ths = 2); 186 | 187 | LITE_API void paddle_batchnorm(std::vectorindatashape, float* indata, float* outdata, 188 | float* scaledata, float*biasdata, float* mean_data, float* var_data, 189 | int cls = 1, int ths = 2); 190 | 191 | LITE_API 
void paddle_fill_bias(float* x, float* bias, int channels, int xlen, bool flag_relu=false); 192 | 193 | LITE_API void paddle_transpose2d(float* x, float* out, int size1, int size2, int cls = 0, int ths = 1); 194 | 195 | LITE_API void paddle_transpose3d(float* x, float* out, int size1, int size2, int size3, int axis1, int axis2, int cls = 0, int ths = 1); 196 | 197 | LITE_API void paddle_transpose(float* x, float* out, std::vector axis_size, int axis1 = 1, int axis2 = 0, int cls = 0, int ths=1); 198 | 199 | LITE_API void paddle_transpose(std::vector input_shape, float* indata, 200 | std::vector output_shape, float* outdata, 201 | std::vector axis, int cls, int ths); 202 | 203 | LITE_API void paddle_reflect1d(float* din, float* dout, int channels, int x_len, int dilation); 204 | LITE_API void paddle_reflect2d(float* din, float* dout, int channels, int x_h, int x_w, int dilation_h, int dilation_w); 205 | 206 | LITE_API void paddle_matmul_int16_32(int m, int n, int k, int16_t* A, int16_t* B, int32_t* C, bool rettrans, int cls=1, int ths=2); 207 | 208 | // New added functions 209 | LITE_API void paddle_affine(const float* din, const float* weight, const float* bias, const int dim1, const int dim2, float* dout ); 210 | 211 | LITE_API float paddle_FindAbsMax(float* din, int size); 212 | 213 | LITE_API float paddle_GetScale(float threshold, int bit_length); 214 | 215 | // LITE_API float paddle_fp32_to_int8_1d(const float* din, int8_t* dout, int size); 216 | // LITE_API void paddle_int8_to_fp32_1d(const int8_t* din, float* out, const float scale, int size); 217 | // LITE_API void paddle_int32_to_fp32_1d(const int* din, float* dout, const float scale, int size); 218 | // LITE_API float paddle_int32_to_int8_1d(const int* din, int8_t* dout, const float scale, int size); 219 | // LITE_API float paddle_fp32_to_int16_1d(const float* din, int16_t* dout, int size); 220 | // LITE_API void paddle_int16_to_fp32_1d(const int16_t* din, float* dout, const float scale, int size); 221 | 222 | 223 | 224 | 225 | 226 | #endif//LITE_WITH_ARM 227 | 228 | 229 | 230 | } // namespace lite_api 231 | } // namespace paddle 232 | 233 | #endif // NOLINT 234 | -------------------------------------------------------------------------------- /paddle/paddle_api.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* 16 | * This file defines PaddlePredictor, the api for lite. It supports multiple 17 | * hardware including ARM, X86, OpenCL, CUDA and so on. 
18 | */ 19 | 20 | #ifndef PADDLE_LITE_API_H_ // NOLINT 21 | #define PADDLE_LITE_API_H_ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "paddle_place.h" // NOLINT 28 | 29 | namespace paddle { 30 | namespace lite_api { 31 | 32 | using shape_t = std::vector; 33 | using lod_t = std::vector>; 34 | 35 | enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; 36 | // Methods for allocating L3Cache on Arm platform 37 | enum class L3CacheSetMethod { 38 | kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance. 39 | kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance 40 | // with less memory consumption. 41 | kAbsolute = 2, // Use the external setting. 42 | // kAutoGrow = 3, // Not supported yet, least memory consumption. 43 | }; 44 | 45 | // return true if current device supports OpenCL model 46 | LITE_API bool IsOpenCLBackendValid(bool check_fp16_valid = false); 47 | 48 | struct LITE_API Tensor { 49 | explicit Tensor(void* raw); 50 | explicit Tensor(const void* raw); 51 | 52 | void Resize(const shape_t& shape); 53 | 54 | /// Readonly data. 55 | template 56 | const T* data() const; 57 | 58 | template 59 | T* mutable_data(TargetType type = TargetType::kHost) const; 60 | 61 | // Share external memory. Note: ensure that the data pointer is in a valid 62 | // state 63 | // during the prediction process. 64 | void ShareExternalMemory(void* data, size_t memory_size, TargetType target); 65 | 66 | template 67 | void CopyFromCpu(const T* data); 68 | 69 | template 70 | void CopyToCpu(T* data) const; 71 | /// Shape of the tensor. 72 | shape_t shape() const; 73 | TargetType target() const; 74 | PrecisionType precision() const; 75 | void SetPrecision(PrecisionType precision); 76 | 77 | // LoD of the tensor 78 | lod_t lod() const; 79 | 80 | // Set LoD of the tensor 81 | void SetLoD(const lod_t& lod); 82 | bool IsInitialized() const; 83 | 84 | private: 85 | void* raw_tensor_; 86 | }; 87 | 88 | /// The PaddlePredictor defines the basic interfaces for different kinds of 89 | /// predictors. 90 | class LITE_API PaddlePredictor { 91 | public: 92 | PaddlePredictor() = default; 93 | 94 | /// Get i-th input. 95 | virtual std::unique_ptr GetInput(int i) = 0; 96 | 97 | /// Get i-th output. 98 | virtual std::unique_ptr GetOutput(int i) const = 0; 99 | 100 | virtual void Run() = 0; 101 | virtual std::shared_ptr Clone() = 0; 102 | virtual std::shared_ptr Clone( 103 | const std::vector& var_names) = 0; 104 | 105 | virtual std::string GetVersion() const = 0; 106 | 107 | // Get input names 108 | virtual std::vector GetInputNames() = 0; 109 | // Get output names 110 | virtual std::vector GetOutputNames() = 0; 111 | // Get output names 112 | virtual std::vector GetParamNames(); 113 | 114 | // Get Input by name 115 | virtual std::unique_ptr GetInputByName(const std::string& name) = 0; 116 | 117 | /// Get a readonly tensor, return null if no one called `name` exists. 118 | virtual std::unique_ptr GetTensor( 119 | const std::string& name) const = 0; 120 | /// Get a mutable tensor, return null if on one called `name` exists 121 | /// internal infereces API, not recommanded. 122 | virtual std::unique_ptr GetMutableTensor(const std::string& name); 123 | 124 | /// Persist the optimized model to disk. This API is only supported by 125 | /// CxxConfig, and the persisted model can be reused for MobileConfig. 
126 | virtual void SaveOptimizedModel( 127 | const std::string& model_dir, 128 | LiteModelType model_type = LiteModelType::kProtobuf, 129 | bool record_info = false); 130 | 131 | virtual ~PaddlePredictor() = default; 132 | 133 | protected: 134 | int threads_{1}; 135 | lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND}; 136 | }; 137 | 138 | /// Base class for all the configs. 139 | class LITE_API ConfigBase { 140 | std::string model_dir_; 141 | int threads_{1}; 142 | PowerMode mode_{LITE_POWER_NO_BIND}; 143 | // gpu opencl 144 | CLTuneMode opencl_tune_mode_{CL_TUNE_NONE}; 145 | CLPrecisionType opencl_precision_{CL_PRECISION_AUTO}; 146 | // Where to cache the npu/xpu/rknpu/apu offline model to the binary files 147 | std::string subgraph_model_cache_dir_{""}; 148 | // Set the cached npu/xpu/rknpu/apu offline model from the buffers 149 | std::map, std::vector>> 150 | subgraph_model_cache_buffers_{}; 151 | int device_id_{0}; 152 | int x86_math_num_threads_ = 1; 153 | 154 | public: 155 | explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); 156 | // set Model_dir 157 | void set_model_dir(const std::string& x) { model_dir_ = x; } 158 | const std::string& model_dir() const { return model_dir_; } 159 | // set Thread 160 | void set_threads(int threads); 161 | int threads() const { return threads_; } 162 | // set Power_mode 163 | void set_power_mode(PowerMode mode); 164 | PowerMode power_mode() const { return mode_; } 165 | // set GPU opencl tune 166 | void set_opencl_tune(CLTuneMode tune_mode = CL_TUNE_NONE, 167 | size_t lws_repeats = 4); 168 | // set GPU opencl precision 169 | void set_opencl_precision(CLPrecisionType p = CL_PRECISION_AUTO); 170 | // set subgraph_model_dir 171 | void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) { 172 | subgraph_model_cache_dir_ = subgraph_model_cache_dir; 173 | } 174 | const std::string& subgraph_model_cache_dir() const { 175 | return subgraph_model_cache_dir_; 176 | } 177 | void set_subgraph_model_cache_buffers(const std::string& key, 178 | const std::vector& cfg, 179 | const std::vector& bin); 180 | const std::map, std::vector>>& 181 | subgraph_model_cache_buffers() const { 182 | return subgraph_model_cache_buffers_; 183 | } 184 | // set Device ID 185 | void set_device_id(int device_id) { device_id_ = device_id; } 186 | int get_device_id() const { return device_id_; } 187 | // set x86_math_num_threads 188 | void set_x86_math_num_threads(int threads); 189 | int x86_math_num_threads() const; 190 | }; 191 | 192 | class LITE_API CxxModelBuffer { 193 | public: 194 | CxxModelBuffer(const char* program_buffer, 195 | size_t program_buffer_size, 196 | const char* params_buffer, 197 | size_t params_buffer_size); 198 | CxxModelBuffer(std::string&& program_buffer, std::string&& params_buffer); 199 | const std::string& get_program() const; 200 | const std::string& get_params() const; 201 | bool is_empty() const; 202 | 203 | CxxModelBuffer() = default; 204 | CxxModelBuffer(const CxxModelBuffer&) = delete; 205 | 206 | private: 207 | std::string program_; 208 | std::string params_; 209 | }; 210 | 211 | /// CxxConfig is the config for the Full feature predictor. 
212 | class LITE_API CxxConfig : public ConfigBase { 213 | std::vector valid_places_; 214 | std::string model_file_; 215 | std::string param_file_; 216 | std::shared_ptr model_buffer_{nullptr}; 217 | std::vector passes_internal_{}; 218 | bool quant_model_{false}; // Enable post_quant_dynamic in opt 219 | QuantType quant_type_{QuantType::QUANT_INT16}; 220 | std::map>> 221 | preferred_inputs_for_warmup_; 222 | #ifdef LITE_WITH_CUDA 223 | bool multi_stream_{false}; 224 | #endif 225 | #ifdef LITE_WITH_MLU 226 | lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; 227 | int mlu_core_number_{1}; 228 | DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; 229 | std::vector mlu_first_conv_mean_{}; 230 | std::vector mlu_first_conv_std_{}; 231 | #endif 232 | 233 | public: 234 | void set_valid_places(const std::vector& x) { valid_places_ = x; } 235 | void set_model_file(const std::string& path) { model_file_ = path; } 236 | void set_param_file(const std::string& path) { param_file_ = path; } 237 | void set_model_buffer(const char* model_buffer, 238 | size_t model_buffer_size, 239 | const char* param_buffer, 240 | size_t param_buffer_size) { 241 | model_buffer_.reset(new CxxModelBuffer( 242 | model_buffer, model_buffer_size, param_buffer, param_buffer_size)); 243 | } 244 | void set_model_buffer(std::shared_ptr model_buffer) { 245 | model_buffer_ = model_buffer; 246 | } 247 | const CxxModelBuffer& get_model_buffer() const; 248 | // internal inference to choose passes for model optimizing, 249 | // it's designed for internal developer and not recommanded 250 | // for comman users. 251 | void set_passes_internal( 252 | const std::vector& passes_internal = {}) { 253 | passes_internal_ = passes_internal; 254 | } 255 | const std::vector& get_passes_internal() const { 256 | return passes_internal_; 257 | } 258 | const std::vector& valid_places() const { return valid_places_; } 259 | std::string model_file() const { return model_file_; } 260 | std::string param_file() const { return param_file_; } 261 | bool is_model_from_memory() const { return static_cast(model_buffer_); } 262 | // note: `model_from_memory` has the same effect as `is_model_from_memory`, 263 | // but is_model_from_memory is recommended and `model_from_memory` will be 264 | // abandoned in v3.0. 265 | bool model_from_memory() const { return static_cast(model_buffer_); } 266 | 267 | #ifdef LITE_WITH_CUDA 268 | void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } 269 | bool multi_stream() const { return multi_stream_; } 270 | #endif 271 | 272 | #ifdef LITE_WITH_MLU 273 | // set MLU core version, which is used when compiling MLU kernels 274 | void set_mlu_core_version(lite_api::MLUCoreVersion core_version); 275 | // set MLU core number, which is used when compiling MLU kernels 276 | void set_mlu_core_number(int core_number); 277 | // whether use MLU's first conv kernel. First conv is a special kernel 278 | // provided by MLU, its input is uint8, and also needs two 3-dimentional 279 | // vectors which save all inputs' mean and std values 280 | // set the 3-dimentional mean vector and 3-dimentional std vector used by 281 | // MLU's first conv 282 | void set_mlu_firstconv_param(const std::vector& mean, 283 | const std::vector& std); 284 | // set MLU input layout. 
User can specify layout of input data to be NHWC, 285 | // default is NCHW 286 | void set_mlu_input_layout(DataLayoutType layout); 287 | 288 | lite_api::MLUCoreVersion mlu_core_version() const; 289 | int mlu_core_number() const; 290 | DataLayoutType mlu_input_layout() const; 291 | // std::pair 292 | std::pair, std::vector> mlu_firstconv_param() const; 293 | #endif 294 | 295 | // XPU only, set the size of the workspace memory from L3 cache for the 296 | // current thread. 297 | void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); 298 | // XPU only, specify the target device ID for the current thread. 299 | // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker 300 | // thread 301 | void set_xpu_dev_per_thread(int dev_no = 0); 302 | void set_xpu_multi_encoder_precision(const std::string& precision = "int16"); 303 | 304 | // set input tensor for warmup. 305 | // It is optional. If you set prefered_inputs, model wil run immediately when 306 | // predictor is created 307 | template 308 | void set_preferred_inputs_for_warmup(const int group_idx, 309 | const int tensor_idx, 310 | const shape_t& shape, 311 | const lod_t& lod = {}, 312 | const T fill_value = 0, 313 | const void* data = nullptr); 314 | const std::map>>& 315 | preferred_inputs_for_warmup() const { 316 | return preferred_inputs_for_warmup_; 317 | } 318 | 319 | void set_quant_model(bool quant_model) { quant_model_ = quant_model; } 320 | bool quant_model() const { return quant_model_; } 321 | void set_quant_type(QuantType quant_type) { quant_type_ = quant_type; } 322 | QuantType quant_type() const { return quant_type_; } 323 | }; 324 | 325 | /// MobileConfig is the config for the light weight predictor, it will skip 326 | /// IR optimization or other unnecessary stages. 327 | class LITE_API MobileConfig : public ConfigBase { 328 | // whether to load data from memory. Model data will be loaded from memory 329 | // buffer if model_from_memory_ is true. 330 | bool model_from_memory_{false}; 331 | 332 | // model data readed from file or memory buffer in combined format. 333 | std::string lite_model_file_; 334 | 335 | // NOTE: This is a deprecated variable and will be removed in latter release. 336 | std::string model_buffer_; 337 | std::string param_buffer_; 338 | 339 | public: 340 | // set model data in combined format, `set_model_from_file` refers to loading 341 | // model from file, set_model_from_buffer refers to loading model from memory 342 | // buffer 343 | void set_model_from_file(const std::string& x); 344 | void set_model_from_buffer(const std::string& x); 345 | // return model data in lite_model_file_, which is in combined format. 346 | const std::string& lite_model_file() const { return lite_model_file_; } 347 | 348 | // return model_from_memory_, which indicates whether to load model from 349 | // memory buffer. 350 | bool is_model_from_memory() const { return model_from_memory_; } 351 | // note: `model_from_memory` has the same effect as `is_model_from_memory`, 352 | // but is_model_from_memory is recommended and `model_from_memory` will be 353 | // abandoned in v3.0. 354 | bool model_from_memory() const { return model_from_memory_; } 355 | 356 | // NOTE: This is a deprecated API and will be removed in latter release. 357 | void set_model_buffer(const char* model_buffer, 358 | size_t model_buffer_size, 359 | const char* param_buffer, 360 | size_t param_buffer_size); 361 | 362 | // NOTE: This is a deprecated API and will be removed in latter release. 
363 | const std::string& model_buffer() const { return model_buffer_; } 364 | 365 | // NOTE: This is a deprecated API and will be removed in latter release. 366 | const std::string& param_buffer() const { return param_buffer_; } 367 | 368 | // This is the method for allocating workspace_size according to L3Cache size 369 | void SetArmL3CacheSize( 370 | L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache, 371 | int absolute_val = -1); 372 | }; 373 | 374 | template <typename ConfigT> 375 | LITE_API std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&); 376 | 377 | } // namespace lite_api 378 | } // namespace paddle 379 | 380 | #endif // NOLINT 381 | --------------------------------------------------------------------------------
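
Usage note (added): the sketch below is one plausible way to drive the predictor API declared in paddle_api.h above, end to end. It is illustrative only: the model path "model.nb", the 1x3x224x224 input shape, and the thread count are placeholders, not values taken from this repository, and shape_t is assumed to be a std::vector of 64-bit dims as in upstream Paddle-Lite. Per the comments in the header, MobileConfig is the lightweight config that skips IR optimization, so it expects a combined model that has already been optimized (for example, persisted via SaveOptimizedModel from the full CxxConfig predictor).

#include <cstdio>
#include <memory>
#include "paddle_api.h"

using namespace paddle::lite_api;

int main() {
  // Lightweight predictor config; expects an already-optimized combined model.
  MobileConfig config;
  config.set_model_from_file("model.nb");   // placeholder path
  config.set_threads(2);                    // placeholder thread count
  config.set_power_mode(LITE_POWER_NO_BIND);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  // Fill input 0 with a dummy float blob; the shape is a placeholder.
  std::unique_ptr<Tensor> input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* in = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in[i] = 0.f;

  predictor->Run();

  // Read back output 0 as a read-only float buffer.
  std::unique_ptr<Tensor> output = predictor->GetOutput(0);
  const float* out = output->data<float>();
  shape_t out_shape = output->shape();
  std::printf("first output value: %f\n", out_shape.empty() ? 0.f : out[0]);
  return 0;
}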