├── docs ├── flow.png ├── system_overview_aws.pdf └── system_overview_local.pdf ├── fpga_cnn ├── src │ ├── pow_function.h │ ├── image_converter.h │ ├── data_type.h │ ├── config.h │ ├── resize_image.h │ ├── activation_functions.h │ ├── construct_net.h │ ├── ff_test.cpp │ ├── max_pool_acc_innerpp.h │ ├── fc_acc_innerpp.h │ ├── acc_instance.h │ └── conv_acc_innerpp.h └── testbench │ ├── pooling_validate.h │ ├── conv_validate.h │ ├── fc_validate.h │ ├── pooling_validate.cpp │ ├── print_array.h │ ├── fc_validate.cpp │ └── conv_validate.cpp ├── acc_runtime ├── aws_acc │ ├── api_lib │ │ ├── src │ │ │ ├── acc_ctrl.o │ │ │ └── acc_ctrl.cpp │ │ └── inc │ │ │ ├── cl_tsc.h │ │ │ └── acc_ctrl.h │ └── README.md └── local_acc │ ├── demos │ └── convTest │ │ ├── runtime │ │ ├── runtime.o │ │ ├── acc_ctrl.o │ │ ├── Makefile │ │ ├── acc_config.h │ │ └── runtime.cpp │ ├── api_lib │ ├── inc │ │ ├── cl_tsc.h │ │ └── acc_ctrl.h │ └── src │ │ └── acc_ctrl.cpp │ └── README.md ├── scripts └── hls_impl │ ├── syn.sh │ └── hls_script.tcl ├── netGenerator ├── clean.sh ├── README.md ├── run_generator.sh ├── dse │ ├── model_partition.py │ ├── param_write.py │ ├── global_search.py │ ├── model_extract.py │ ├── model_split.py │ ├── tm_tn_multiAcc.py │ ├── task_analysis.py │ └── helping_functions.py ├── alex.prototxt ├── netGen │ ├── generate_consNet.py │ └── generate_accInst.py └── paramExtractor │ └── extract.py ├── examples └── AlexNet │ ├── net_config_params.txt │ └── acc_ins_params.txt ├── LICENSE └── README.md /docs/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/flow.png -------------------------------------------------------------------------------- /docs/system_overview_aws.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/system_overview_aws.pdf -------------------------------------------------------------------------------- /fpga_cnn/src/pow_function.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/fpga_cnn/src/pow_function.h -------------------------------------------------------------------------------- /docs/system_overview_local.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/system_overview_local.pdf -------------------------------------------------------------------------------- /fpga_cnn/src/image_converter.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/fpga_cnn/src/image_converter.h -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/src/acc_ctrl.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/aws_acc/api_lib/src/acc_ctrl.o -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/runtime: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/runtime -------------------------------------------------------------------------------- 
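A note on how the pieces in the directory tree above fit together, as a minimal sketch (assuming a local clone with Python 3.5 and Vivado HLS on the PATH; the commands simply mirror netGenerator/run_generator.sh and scripts/hls_impl/syn.sh, both included in full later in this listing):

    cd netGenerator
    ./run_generator.sh -i alex.prototxt   # extract params, run the DSE, emit ../gen_proj
    cd ../gen_proj/hls_proj
    ./syn.sh                              # clean stale runs, then vivado_hls -f hls_script.tcl

run_generator.sh chains paramExtractor/extract.py, dse/tm_tn_multiAcc.py, and the two netGen generators, then copies the fpga_cnn sources and the HLS/implementation scripts into gen_proj/.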
/acc_runtime/local_acc/demos/convTest/runtime.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/runtime.o -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/acc_ctrl.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/acc_ctrl.o -------------------------------------------------------------------------------- /scripts/hls_impl/syn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 0 1 2 4 | do 5 | rm -rf sub_net_$i vivado_hls.log 6 | done 7 | 8 | echo "Cleaned existing projects!!!" 9 | 10 | echo "Start generating sub-net IPs ..." 11 | 12 | vivado_hls -f hls_script.tcl 13 | #vivado_hls -f hls_script.tcl 14 | -------------------------------------------------------------------------------- /netGenerator/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm -rf paramExtractor/*.txt 5 | rm -rf dse/*.txt 6 | rm -rf netGen/*.txt 7 | rm -rf netGen/*.h 8 | rm -rf *.h 9 | 10 | rm -rf ../gen_proj 11 | rm -rf ./dse/__pycache__/ 12 | rm -rf ./dse/.idea/ 13 | rm -rf ./netGen/.idea/ 14 | rm -rf ./paramExtractor/.idea/ 15 | 16 | echo "Cleaned all the intermediate files and newly generated files!!!" 17 | -------------------------------------------------------------------------------- /netGenerator/README.md: -------------------------------------------------------------------------------- 1 | # This is the README file for the NN model generation. 2 | 3 | Follow the steps below to generate the CNN accelerator: 4 | 1. Extract the parameters of an input CNN in the paramExtractor/ folder. Copy the resulting net_config_params.txt to the dse/ folder. 5 | 2. Run the design space exploration in the dse/ folder to get acc_ins_params.txt. 6 | 3. Move acc_ins_params.txt to the netGen/ folder to generate the accelerators. 7 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/Makefile: -------------------------------------------------------------------------------- 1 | runtime:runtime.o acc_ctrl.o 2 | g++ -o runtime runtime.o acc_ctrl.o -I ../../api_lib/inc 3 | runtime.o:runtime.cpp ../../api_lib/src/acc_ctrl.cpp 4 | g++ -c runtime.cpp ../../api_lib/src/acc_ctrl.cpp -I ../../api_lib/inc 5 | acc_ctrl.o:../../api_lib/src/acc_ctrl.cpp 6 | g++ -c ../../api_lib/src/acc_ctrl.cpp -I ../../api_lib/inc 7 | 8 | clean: 9 | rm -f *.o runtime 10 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Table of Contents 4 | 5 | 1. AGFI number: agfi-03391747dc68cd939 6 | 2. Testing demos on this AGFI. 7 | 3. Make sure your sdk_setup.sh is executed successfully. 8 | 9 | # Environment Settings 10 | 1. AWS F1 (f1.2xlarge) instance 11 | 2. Pre-installed Caffe framework 12 | 3. Pre-installed OpenCV package support 13 | 4. Only suitable for the AWS FPGA Developer AMI 14 | 15 | # How to use 16 | ## Hardware setup 17 | Map the AGFI to the platform first; all these demos work with the same AGFI. 18 | 19 | ## Compile and execute the host program 20 | 1. cd into a demo folder, then make 21 | 2. execute the demo with the compiled executable file 22 | --------------------------------------------------------------------------------
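On the host side, the demos drive an accelerator through the acc_ctrl class declared in acc_runtime/aws_acc/api_lib/inc/acc_ctrl.h (shown further down), which wraps the fpga_dma/fpga_pci calls. Below is a minimal host-program sketch, not a shipped demo: the buffer sizes are placeholders, the offsets are illustrative values borrowed from the local convTest demo's acc_config.h (the next file in this listing), and the mode and memId arguments follow the declared signatures only.

#include "acc_ctrl.h"

int main() {
    // One controller object per accelerator; all offsets here are placeholders.
    acc_ctrl acc(/*para*/ 0x10000000, /*weight*/ 0x12000000,
                 /*data_in*/ 0x13000000, /*ctrl*/ 0x00000000);

    static int   params[256];          // layer configuration words
    static short weights[1024][32];    // weights packed 32 shorts per 512-bit word
    static short feature[1024][32];    // input feature map, same packing
    static short result[1024][32];     // output feature map

    acc.write_para(params, sizeof(params));      // lengths are byte counts
    acc.write_weight(weights, sizeof(weights));
    acc.write_data(feature, sizeof(feature));
    acc.start_process(0);                        // kick off the sub-net
    acc.read_data(result, sizeof(result), /*memId*/ 0);
    return 0;
}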
/acc_runtime/local_acc/demos/convTest/acc_config.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CONFIG_H 2 | #define ACC_CONFIG_H 3 | 4 | #define ACC0_PARA_OFFSET 0x10000000 5 | #define ACC0_WEIGHT_OFFSET 0x12000000 6 | #define ACC0_DATA_IN_OFFSET 0x13000000 7 | #define ACC0_CTRL_OFFSET 0x00000000 8 | 9 | 10 | #define ACC1_PARA_OFFSET 0x14000000 11 | #define ACC1_WEIGHT_OFFSET 0x16000000 12 | #define ACC1_DATA_IN_OFFSET 0x00000000 13 | #define ACC1_CTRL_OFFSET 0x00010000 14 | 15 | 16 | #define ACC2_PARA_OFFSET 0x17000000 17 | #define ACC2_WEIGHT_OFFSET 0x19000000 18 | #define ACC2_DATA_IN_OFFSET 0x00000000 19 | #define ACC2_CTRL_OFFSET 0x00020000 20 | 21 | 22 | 23 | #endif -------------------------------------------------------------------------------- /fpga_cnn/testbench/pooling_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef POOLING_VALIDATE_H 2 | #define POOLING_VALIDATE_H 3 | 4 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 5 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 6 | #include "ap_fixed.h" 7 | 8 | class pooling_validate 9 | { 10 | public: 11 | int num_input; 12 | int stride; 13 | int kernel_size; 14 | int inputfeature_size; 15 | int outputfeature_size; 16 | int act; 17 | 18 | ap_int<512> *input_feature; 19 | ap_int<512> *output_feature; 20 | ap_int<512> *output_feature_software; 21 | ap_uint<32> config_list[16]; 22 | 23 | pooling_validate(int num_input,int stride,int kernel_size,int inputfeature_size,int act); 24 | void print_feature_in(void); 25 | }; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /fpga_cnn/src/data_type.h: -------------------------------------------------------------------------------- 1 | // Baseline data type define for the entire design 2 | // TODO: 3 | 4 | #ifndef _DATA_TYPE_H_ 5 | #define _DATA_TYPE_H_ 6 | 7 | #include <vector> 8 | //#include 9 | //#include 10 | 11 | using namespace std; 12 | 13 | typedef unsigned int uint; 14 | typedef uint cnn_size_t; 15 | /* 16 | typedef ap_fixed<64, 32> cnn_data_64; 17 | typedef ap_fixed<32, 16> cnn_data_32; 18 | typedef ap_fixed<16, 8> cnn_data_16; 19 | typedef ap_fixed<8, 4> cnn_data_8; 20 | typedef ap_fixed<4, 2> cnn_data_4; 21 | */ 22 | typedef std::vector > std_vec_t; 23 | typedef std::vector<std_vec_t> std_tensor_t; 24 | typedef std::vector<std_tensor_t> std_tensor_t_3d; 25 | 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/inc/cl_tsc.h: -------------------------------------------------------------------------------- 1 | #ifndef _CL_TSC_H_ 2 | #define _CL_TSC_H_ 3 | 4 | #define CPU_FREQUENCY (3600) // core clock in MHz 5 | #include <stdint.h> 6 | inline uint64_t ticks() { 7 | uint32_t lo, hi; 8 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 9 | return ((uint64_t)hi << 32) | lo; 10 | } 11 | 12 | inline double cycles_to_nanoseconds(uint64_t cycles) { 13 | return (double) cycles / CPU_FREQUENCY * 1000; 14 | } 15 | 16 | inline double cycles_to_microseconds(uint64_t cycles) { 17 | return cycles_to_nanoseconds(cycles) / 1000; 18 | } 19 | 20 | inline double cycles_to_milliseconds(uint64_t cycles) { 21 | return cycles_to_nanoseconds(cycles) / 1000000; 22 | } 23 | 24 | inline double cycles_to_seconds(uint64_t cycles) { 25 | return cycles_to_nanoseconds(cycles) / 1000000000; 26 | } 27 | 28 | #endif //_CL_TSC_H_ 29 | --------------------------------------------------------------------------------
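Both platform runtimes ship the same rdtsc-based timer (the cl_tsc.h above and the identical local copy that follows). A minimal usage sketch for timing a host-side call, assuming CPU_FREQUENCY matches the actual core clock in MHz:

#include "cl_tsc.h"
#include <iostream>

int main() {
    uint64_t t0 = ticks();                 // sample the time-stamp counter
    // ... section under test, e.g. a write_data()/start_process() pair ...
    uint64_t t1 = ticks();
    std::cout << cycles_to_milliseconds(t1 - t0) << " ms" << std::endl;
    return 0;
}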
/acc_runtime/local_acc/api_lib/inc/cl_tsc.h: -------------------------------------------------------------------------------- 1 | #ifndef _CL_TSC_H_ 2 | #define _CL_TSC_H_ 3 | 4 | #define CPU_FREQUENCY (3600) // core clock in MHz 5 | #include <stdint.h> 6 | inline uint64_t ticks() { 7 | uint32_t lo, hi; 8 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 9 | return ((uint64_t)hi << 32) | lo; 10 | } 11 | 12 | inline double cycles_to_nanoseconds(uint64_t cycles) { 13 | return (double) cycles / CPU_FREQUENCY * 1000; 14 | } 15 | 16 | inline double cycles_to_microseconds(uint64_t cycles) { 17 | return cycles_to_nanoseconds(cycles) / 1000; 18 | } 19 | 20 | inline double cycles_to_milliseconds(uint64_t cycles) { 21 | return cycles_to_nanoseconds(cycles) / 1000000; 22 | } 23 | 24 | inline double cycles_to_seconds(uint64_t cycles) { 25 | return cycles_to_nanoseconds(cycles) / 1000000000; 26 | } 27 | 28 | #endif //_CL_TSC_H_ 29 | -------------------------------------------------------------------------------- /examples/AlexNet/net_config_params.txt: -------------------------------------------------------------------------------- 1 | Network Structure: Convolution Pooling Convolution Pooling Convolution Convolution Convolution Pooling InnerProduct InnerProduct InnerProduct 2 | nn_in_data_size_conv: 227 27 13 13 13 3 | nn_channel_size_conv: 11 5 3 3 3 4 | nn_padding_conv: 0 2 1 1 1 5 | nn_stride_conv: 4 1 1 1 1 6 | nn_in_number_conv: 3 96 256 384 384 7 | nn_out_number_conv: 96 256 384 384 256 8 | nn_group_conv: 1 2 1 2 2 9 | nn_bias_conv: 96 256 384 384 256 10 | nn_in_data_size_pooling: 55 27 13 11 | nn_channel_size_pooling: 3 3 3 12 | nn_padding_pooling: 0 0 0 13 | nn_stride_pooling: 2 2 2 14 | nn_in_number_pooling: 96 256 256 15 | nn_in_data_size_fc: 16 | nn_in_number_fc: 9216 4096 4096 17 | nn_out_number_fc: 4096 4096 1000 18 | nn_channel_size_fc: 1 1 1 19 | conv_cut_flag: 1 1 1 1 1 20 | pool_cut_flag: 1 1 1 21 | fc_cut_flag: 1 1 1 22 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/conv_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef CONV_VALIDATE_H 2 | #define CONV_VALIDATE_H 3 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 4 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | #include "ap_fixed.h" 6 | 7 | class conv_validate 8 | { 9 | public: 10 | int layer_num; 11 | ap_int<512> weight[16384]; 12 | ap_int<512> input_feature[6400]; 13 | ap_int<512> output_feature[4096]; 14 | ap_int<512> output_feature_software[4096]; 15 | ap_fixed<32,26> bias[1024]; 16 | ap_uint<32>* param_list; 17 | 18 | 19 | conv_validate(ap_uint<32>* param_list); //(int layer_num, int num_input,int num_output,int kernel_size,int stride,int padding, int inputfeature_size, int inport); 20 | void print_weight(void); 21 | void print_feature_in(void); 22 | void print_bias(void); 23 | // void print_feature_out(void); 24 | // 25 | // void software_conv_process(void); 26 | // void print_feature_out_softeare(void); 27 | // void test_fun(void); 28 | }; 29 | 30 | 31 | 32 | #endif 33 | --------------------------------------------------------------------------------
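A quick sanity check of the extracted values in net_config_params.txt above against alex.prototxt (included near the end of this listing), using out = (in - K + 2*pad)/stride + 1:

    conv1: (227 - 11 + 0)/4 + 1 = 55, the first entry of nn_in_data_size_pooling
    pool1: (55 - 3)/2 + 1 = 27, the second entry of nn_in_data_size_conv
    pool2: (27 - 3)/2 + 1 = 13, feeding conv3/conv4/conv5
    pool5: (13 - 3)/2 + 1 = 6, and 256 * 6 * 6 = 9216, the first entry of nn_in_number_fc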
/examples/AlexNet/acc_ins_params.txt: -------------------------------------------------------------------------------- 1 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 96, 3, 11, 11, 5, 5, 32, 32, 32 2 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 128, 11, 27, 27, 5, 5, 32, 32, 32 3 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 96, 19, 13, 13, 5, 5, 32, 32, 32 4 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 64, 17, 13, 13, 5, 5, 32, 32, 32 5 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 64, 11, 13, 13, 5, 5, 32, 32, 32 6 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 7 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 8 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 9 | conv_pool, 0,0,0 10 | conv_pool, 1,1,1 11 | conv_pool, 2,2 12 | conv_pool, 3,3 13 | conv_pool, 4,4,2 14 | sub_net_0,2,1024,20300,9662,22688,4374,14580 15 | sub_net_1,1,1024,27660,2704,4056 16 | sub_net_2,2,1024,69140,4056,4056,4056,3380 17 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Table of Contents 4 | 5 | 1. API functions to call the accelerators. 6 | 2. Testing demos. 7 | 8 | # Environment Settings 9 | 1. UltraScale+ VCU118 board in PCIe mode (xdma driver is required) 10 | 2. Pre-installed Caffe framework 11 | 3. Pre-installed OpenCV package support 12 | 4. Pre-installed Vivado Design Suite 13 | 14 | # How to use 15 | 16 | ## Starting from a design 17 | Generate or download the bitstream to the platform 18 | 1. cd into a demo in the bitstream/ folder 19 | 2. start vivado in tcl mode with the 'vivado -mode tcl' command 20 | 3. modify the demo name in the make_spi_mcs.tcl file 21 | 4. source the modified make_spi_mcs.tcl in vivado 22 | 5. source the program_spi.tcl after the .mcs files are generated 23 | 24 | ## Starting from a bitstream file 25 | 1. Download the bitstream to the flash on the board 26 | 2. Re-start your system 27 | 28 | ## Compile and execute the host program 29 | 1. cd into a demo in the demos/ folder 30 | 2. make 31 | 3. execute the executable file 32 | --------------------------------------------------------------------------------
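For the local platform, the host program in the convTest demo talks to the accelerators through the xdma character devices wrapped by the acc_ctrl class (api_lib/inc/acc_ctrl.h, shown a little further down). A minimal sketch, with placeholder buffer sizes, using the address map from demos/convTest/acc_config.h:

#include "acc_ctrl.h"
#include "acc_config.h"

int main() {
    // Accelerator 0, bound to the offsets defined in acc_config.h.
    acc_ctrl acc0(ACC0_PARA_OFFSET, ACC0_WEIGHT_OFFSET,
                  ACC0_DATA_IN_OFFSET, ACC0_CTRL_OFFSET);

    static int   params[256];         // layer configuration words
    static short weights[1024][32];   // 32 shorts per 512-bit interface word
    static short feature[1024][32];
    static short result[1024][32];

    acc0.write_para(params, sizeof(params));     // byte lengths, as in the API
    acc0.write_weight(weights, sizeof(weights));
    acc0.write_data(feature, sizeof(feature));
    acc0.start_process(0);                       // mode flag as declared in acc_ctrl.h
    acc0.read_data(result, sizeof(result));      // the local API has no memId argument
    return 0;
}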
/fpga_cnn/testbench/fc_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef FC_VALIDATE_H 2 | #define FC_VALIDATE_H 3 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 4 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | #include "ap_fixed.h" 6 | 7 | class fc_validate 8 | { 9 | public: 10 | int layer_num; 11 | int num_input; 12 | int num_output; 13 | int act; 14 | 15 | // ap_int<512> weight[1024]; 16 | // ap_int<512> in_feature[1024]; 17 | // ap_int<512> out_feature[1024]; 18 | // ap_int<512> out_feature_software[1024]; 19 | // ap_int<512> bias[32]; 20 | 21 | ap_int<512> *weight; 22 | ap_int<512> *in_feature; 23 | ap_int<512> *out_feature; 24 | ap_int<512> *out_feature_software; 25 | ap_int<512> bias[4096]; 26 | 27 | ap_uint<32> lnum_list[16]; 28 | ap_uint<32> config_list[16*16]; 29 | 30 | fc_validate(int layer_num, int num_input, int num_output, int act); 31 | void print_weight(void); 32 | void print_feature_in(void); 33 | void print_bias(void); 34 | void print_feature_out(void); 35 | 36 | void software_fc_process(void); 37 | void print_software_out(void); 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/api_lib/inc/acc_ctrl.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CTRL_H 2 | #define ACC_CTRL_H 3 | 4 | #include "cl_tsc.h" 5 | #include <iostream> 6 | #include <cstdio> 7 | #include <cstdlib> 8 | #include <cstring> 9 | #include <cassert> 10 | #include <stdint.h> 11 | #include <unistd.h> 12 | #include <fcntl.h> 13 | #include <sys/types.h> 14 | #include <sys/mman.h> 15 | using namespace std; 16 | //==============board level interface============// 17 | #define DEVICE_H2C "/dev/xdma0_h2c_1" 18 | #define DEVICE_C2H "/dev/xdma0_c2h_2" 19 | #define DEVICE_CTRL "/dev/xdma0_user" 20 | 21 | #define MAP_SIZE (8*1024UL) 22 | //================ctrl port addr================= 23 | 24 | class acc_ctrl 25 | { 26 | private: 27 | uint32_t para_offset_addr; 28 | uint32_t weight_offset_addr; 29 | uint32_t data_in_offset_addr; 30 | off_t ctrl_addr; 31 | public: 32 | acc_ctrl( 33 | uint32_t para_offset_addr, 34 | uint32_t weight_offset_addr, 35 | uint32_t data_in_offset_addr, 36 | off_t ctrl_addr 37 | ); 38 | void write_weight(short int weight[][32],int weight_length); 39 | void write_para(int* para_list,int para_length); 40 | void write_data(short int feature[][32],int feature_length); 41 | void start_process(int mode); 42 | void read_data(short int feature[][32],int feature_length); 43 | 44 | }; 45 | 46 | 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/inc/acc_ctrl.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CTRL_H 2 | #define ACC_CTRL_H 3 | 4 | #include <iostream> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <cstring> 8 | #include <cassert> 9 | #include <stdint.h> 10 | #include <unistd.h> 11 | #include <fcntl.h> 12 | #include <sys/types.h> 13 | #include <sys/mman.h> 14 | #include "cl_tsc.h" 15 | 16 | #include "fpga_pci.h" 17 | #include "fpga_mgmt.h" 18 | #include "fpga_dma.h" 19 | 20 | using namespace std; 21 | //==============board level interface============// 22 | #define DEVICE_H2C "/dev/xdma0_h2c_1" 23 | #define DEVICE_C2H "/dev/xdma0_c2h_2" 24 | #define DEVICE_CTRL "/dev/xdma0_user" 25 | 26 | //================ctrl port addr================= 27 | 28 | class acc_ctrl 29 | { 30 | private: 31 | uint32_t para_offset_addr; 32 | uint32_t bias_offset_addr; 33 
| uint32_t weight_offset_addr; 34 | uint32_t data_in_offset_addr; 35 | off_t ctrl_addr; 36 | public: 37 | acc_ctrl( 38 | uint32_t para_offset_addr, 39 | uint32_t weight_offset_addr, 40 | uint32_t data_in_offset_addr, 41 | off_t ctrl_addr 42 | ); 43 | 44 | void write_bias(int* bias,int bias_length); 45 | void write_weight(short int weight[][32],int weight_length); 46 | void write_para(int* para_list,int para_length); 47 | void write_data(short int feature[][32],int feature_length); 48 | void start_process(int mode); 49 | void read_data(short int feature[][32],int feature_length,int memId); 50 | }; 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /netGenerator/run_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts i:k:h: option 4 | do 5 | case "${option}" 6 | in 7 | i) INFILE=${OPTARG};; 8 | h) HELP=1;; 9 | esac 10 | done 11 | 12 | #if HELP==1 13 | # echo "./run_generator.sh -i input.prototxt to execute the generation!" 14 | #fi 15 | 16 | ./clean.sh 17 | 18 | echo $INFILE 19 | mkdir -p ../gen_proj 20 | mkdir -p ../gen_proj/hls_proj/src 21 | mkdir -p ../gen_proj/hls_proj/testbench 22 | mkdir -p ../gen_proj/impl_proj/aws_impl 23 | mkdir -p ../gen_proj/impl_proj/local_impl 24 | 25 | echo "script executed!!!" 26 | #--------------1.param extract----------------------- 27 | python3.5 paramExtractor/extract.py --model $INFILE 28 | mv net_config_params.txt dse/ 29 | echo "Finished network parameter extraction." 30 | echo " " 31 | #--------------2.design space exploration------------ 32 | python3.5 dse/tm_tn_multiAcc.py dse/net_config_params.txt 33 | mv acc_ins_params.txt netGen/ 34 | echo "Finished accelerator design space exploration." 35 | echo " " 36 | #--------------3.code generation--------------------- 37 | python3.5 netGen/generate_accInst.py --params netGen/acc_ins_params.txt 38 | python3.5 netGen/generate_consNet.py --params netGen/acc_ins_params.txt 39 | echo "Finished accelerators and sub-nets generation." 40 | echo "Constructing the testing and implementation folder..." 41 | #TODO: move all the files into the correct positions, src/testbench/ 42 | cp ../fpga_cnn/src/* ../gen_proj/hls_proj/src/ 43 | cp ../fpga_cnn/testbench/* ../gen_proj/hls_proj/testbench/ 44 | mv *.h ../gen_proj/hls_proj/src/ 45 | 46 | cp ../scripts/hls_impl/* ../gen_proj/hls_proj/ 47 | cp ../scripts/sys_gen/local_impl/* ../gen_proj/impl_proj/local_impl/ 48 | cp ../scripts/sys_gen/aws_impl/* ../gen_proj/impl_proj/aws_impl/ 49 | 50 | echo "Files copied" 51 | echo "Generation done!!!" 52 | 53 | exit 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) <2019> 2 | 3 | 4 | All rights reserved. 5 | 6 | 7 | General terms and conditions for use of the Open-DNN software. The software 8 | programs comprising "Open-DNN" and the documentation provided with them are 9 | copyright Yao Chen, Deming Chen, the Advanced Digital Sciences Center, Singapore 10 | and the University of Illinois at Urbana-Champaign. 11 | 12 | Only non-commercial, not-for-profit use of this software is permitted. No part 13 | of this software may be incorporated into a commercial product without the 14 | written consent of the authors (Yao Chen and Deming Chen). 
Similarly, use of 15 | this software to assist in the development of new commercial FPGA designs is 16 | prohibited, unless the written consent of the authors is obtained. 17 | 18 | This software is provided "as is" with no warranties or guarantees of support. 19 | All users of the software must take the copy from this site. You may modify or 20 | use the source code for other non-commercial, not-for-profit research endeavours, 21 | provided that all copyright attribution on the source code is retained, and the 22 | original or modified source code is not redistributed, in whole or in part, or 23 | included in or with any commercial product, except by written agreement with 24 | the authors, and full and complete attribution for use of the code is given in 25 | any resulting publications. Subject to these conditions, the software is 26 | provided free of charge to all interested parties. 27 | 28 | When referencing this particular open-source software in a publication, please 29 | cite the following publication: 30 | Yao Chen, Jiong He, Xiaofan Zhang, Cong Hao and Deming Chen, "Cloud-DNN: An Open 31 | Framework for Mapping DNN Models to Cloud FPGAs", Proceedings of ACM/SIGDA 32 | International Symposium on Field Programmable Gate Arrays, February 2019. 33 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/pooling_validate.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cmath> 3 | #include <cstdlib> 4 | #include "pooling_validate.h" 5 | using namespace std; 6 | 7 | pooling_validate::pooling_validate(int num_input,int stride,int kernel_size,int inputfeature_size,int act) 8 | { 9 | 10 | int i,j,k; 11 | this->num_input = num_input; 12 | this->stride = stride; 13 | this->kernel_size = kernel_size; 14 | this->inputfeature_size = inputfeature_size; 15 | this->act = act; 16 | outputfeature_size = (inputfeature_size-kernel_size)/stride + 1; 17 | 18 | config_list[0] = inputfeature_size; 19 | config_list[1] = inputfeature_size; 20 | config_list[2] = num_input; 21 | config_list[3] = kernel_size; 22 | config_list[4] = outputfeature_size; 23 | config_list[5] = outputfeature_size; 24 | config_list[6] = stride; 25 | config_list[7] = 0; 26 | config_list[8] = act; 27 | config_list[9] = 0; 28 | config_list[10] = 0; 29 | config_list[11] = 0; 30 | config_list[12] = 0; 31 | config_list[13] = 0; 32 | config_list[14] = 0; 33 | config_list[15] = 0; 34 | 35 | input_feature = new ap_int<512>[inputfeature_size * inputfeature_size * (int)(ceil(((double)num_input)/32))]; 36 | output_feature = new ap_int<512>[outputfeature_size * outputfeature_size * (int)(ceil(((double)num_input)/32))]; 37 | output_feature_software = new ap_int<512>[outputfeature_size * outputfeature_size * (int)(ceil(((double)num_input)/32))]; 38 | 39 | 40 | for(i = 0 ; i < num_input; i++) 41 | for(j = 0 ; j < inputfeature_size ;j++) 42 | for(k = 0 ; k < inputfeature_size; k++) 43 | // input_feature[i/32*inputfeature_size*inputfeature_size + j*inputfeature_size + k].range(i%32) = rand()%10; 44 | ; 45 | } 46 | 47 | 48 | //void pooling_validate::print_feature_in(void) 49 | //{ 50 | // int i,j,k; 51 | // for(i = 0 ; i < num_input; i++) 52 | // for(j = 0 ; j < inputfeature_size ; j++) 53 | // for(k = 0 ; k < inputfeature_size; k++) 54 | // ; 55 | // 56 | //} 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /scripts/hls_impl/hls_script.tcl: 
-------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ## This file is modified from the tcl script generated with vivado_hls 2018.2. 3 | ## This modified version is used to ease the run of synthesis of the sub_nets. 4 | ## Please edit based on the user manual if necessary. 5 | ############################################################################## 6 | 7 | #if { $argc != 1} { 8 | # puts "The hls_script.tcl script requires only one input." 9 | # puts "For example. hls_script.tcl 0" 10 | # puts "Please try again" 11 | #} else { 12 | # puts [$PNUM = $argv 0] 13 | #} 14 | 15 | open_project ip_gen 16 | 17 | 18 | #add design files to the project 19 | add_files ./src/data_type.h 20 | add_files ./src/config.h 21 | add_files ./src/activation_functions.h 22 | add_files ./src/conv_acc_2ibuf.h 23 | add_files ./src/max_pool_acc_innerpp.h 24 | add_files ./src/fc_acc_innerpp.h 25 | add_files ./src/acc_instance.h 26 | add_files ./src/construct_net.h 27 | add_files ./src/ff_test.cpp 28 | 29 | add_files -tb ./testbench/conv_validate.h 30 | add_files -tb ./testbench/conv_validate.cpp 31 | add_files -tb ./testbench/pooling_validate.h 32 | add_files -tb ./testbench/pooling_validate.cpp 33 | add_files -tb ./testbench/fc_validate.h 34 | add_files -tb ./testbench/fc_validate.cpp 35 | add_files -tb ./testbench/print_array.h 36 | add_files -tb ./src/ff_test.cpp 37 | 38 | for {set i 0} {$i < 3} {incr i} { 39 | set_top sub_net_$i 40 | 41 | open_solution -reset "sub_net_$i" 42 | # UltraScale+ 43 | set_part {xcvu9p-flgb2104-2-i} -tool vivado 44 | 45 | create_clock -period 1.6 -name default 46 | config_compile -name_max_length 500 -pipeline_loops 0 47 | csim_design -clean -compiler gcc 48 | csynth_design 49 | 50 | # If the co-sim is required for verification, uncomment the following line. 
51 | #cosim_design -compiler gcc -trace_level all 52 | 53 | export_design -flow syn -rtl verilog -format ip_catalog 54 | } 55 | 56 | exit 57 | 58 | 59 | -------------------------------------------------------------------------------- /fpga_cnn/src/config.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _CONFIG_H_ 3 | #define _CONFIG_H_ 4 | 5 | #include "ap_fixed.h" 6 | #include "ap_int.h" 7 | 8 | 9 | //extern int layer_count = 0; 10 | //define data type 11 | typedef ap_fixed<16,10> data_type; 12 | typedef ap_fixed<16,10> data_type_w; 13 | typedef ap_fixed<16,10> data_type_o; 14 | typedef ap_fixed<32,26> b_type; 15 | 16 | typedef unsigned int uint; 17 | 18 | typedef ap_uint<32> Tparam; 19 | // counter datatype used in the entire design 20 | typedef ap_int<512> data_type_itf; 21 | //typedef ap_fixed<16,0> param_type; 22 | 23 | #define READ_LEN_i uint(sizeof(ap_int<512>)/sizeof(data_type)) 24 | #define READ_LEN_w uint(sizeof(ap_int<512>)/sizeof(data_type_w)) 25 | #define READ_LEN_o uint(sizeof(ap_int<512>)/sizeof(data_type_o)) 26 | 27 | #define DATA_LEN uint(sizeof(data_type)) 28 | #define DATA_O_LEN int(sizeof(data_type_itf)/sizeof(data_type_o)) 29 | #define DATA_O int(sizeof(data_type_o)) 30 | 31 | // C++ compilation debug mode 32 | #define _LAYER_MODE_ 1 33 | 34 | #ifdef _LAYER_MODE_ 35 | #define IBUF ((Tr-1)*S_max + K_max) 36 | //#define 37 | #endif 38 | 39 | // C++ compilation debug mode 40 | //#ifdef _LAYER_MODE_ 41 | //#define _ACC_MODE_ 0 42 | //#else 43 | //#define _ACC_MODE_ 1 44 | //#endif 45 | 46 | #define _HLS_MODE_ 1 47 | 48 | //#define _BATCH_MODE_ 1 49 | #ifdef _BATCH_MODE_ 50 | #define _KERNEL_DEBUG_ 0 51 | #else 52 | #define _KERNEL_DEBUG_ 1 53 | #endif 54 | #ifndef _HLS_MODE_ 55 | #define _C_DEBUG_MODE_ 1 56 | #endif 57 | 58 | #define _8CH_ 1 59 | 60 | 61 | //network configuration PARAMETERS 62 | int weight_bias_record = 0; 63 | int weight_bias_count_1 = 0; 64 | int weight_bias_count_2 = 0; 65 | int nn_in_data_size_conv[2] = {28, 14}; 66 | int nn_in_number_conv[2] = {1, 6}; 67 | int nn_out_number_conv[2] = {6, 16}; 68 | int nn_channel_size_conv[2] = {5, 5}; 69 | int nn_padding_conv[2] = {2, 0}; 70 | int nn_group_conv[2] = {1, 1}; 71 | int nn_in_data_size_pooling[2] = {28, 10}; 72 | int nn_in_number_pooling[2] = {6, 16}; 73 | int nn_channel_size_pooling[2] = {2, 2}; 74 | int nn_in_data_size_fc[1] = {5}; 75 | int nn_in_number_fc[1] = {16}; 76 | int nn_out_number_fc[1] = {10}; 77 | int nn_channel_size_fc[1] = {5}; 78 | 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /netGenerator/dse/model_partition.py: -------------------------------------------------------------------------------- 1 | # Author Jiong He on 28 June 2018 at ADSC 2 | #!/usr/bin/env python 3 | 4 | from itertools import permutations 5 | 6 | 7 | def partition(layerlist): 8 | """ 9 | partition is a generator that generates all possible partitions of a given layer list into all possible 10 | numbers (i.e., 1 to len(layerlist)) of subpartitions. 11 | :param layerlist: a list containing each layer information in the form of a tuple (layer index, layer name). 
12 | :return: generate one possible partition 13 | """ 14 | if len(layerlist) == 1: 15 | yield [layerlist] 16 | else: 17 | first = layerlist[0] 18 | for each_partition in partition(layerlist[1:]): 19 | # first choice: insert the first element into each of the subpartition's subsets 20 | for groupIdx, group in enumerate(each_partition, 0): 21 | yield each_partition[:groupIdx] + [[first] + group] + each_partition[groupIdx + 1:] 22 | # second choice: insert the first as an individual subset 23 | yield [[first]] + each_partition 24 | 25 | 26 | def partition_to_k(layerlist, k, order=False): 27 | """ 28 | partition_to_k calls partition function and filters those partitions that has k number of groups. If order is True, 29 | all permutations of this partition will be treated as different partition and output one by one. 30 | :param layerlist: a list containing each layer information in the form of a tuple (layer index, layer name). 31 | :param k: number of groups to partitioned into 32 | :param order: whether the order of groups matter 33 | :return: generate one possible partition 34 | """ 35 | for each_partition_candidate in partition(layerlist): 36 | if len(each_partition_candidate) == k: 37 | if not order: 38 | yield each_partition_candidate 39 | else: 40 | for enum_item in permutations(each_partition_candidate): 41 | yield enum_item 42 | 43 | 44 | if __name__ == "__main__": 45 | # layer_list_1 = [(0, 'c'), (1, 'p'), (2, 'c'), (3, 'c'), (4, 'p'), (5, 'c')] 46 | layer_list_2 = [(0, 'c'), (1, 'p'), (2, 'c'), (3, 'c')] 47 | layer_list_3 = [1, 2, 3, 4] 48 | for idx, item in enumerate(partition_to_k(layer_list_3, 3, False), 1): 49 | print(item) -------------------------------------------------------------------------------- /fpga_cnn/src/resize_image.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _RESIZE_IMAGE_H_ 3 | #define _RESIZE_IMAGE_H_ 4 | 5 | #include 6 | template 7 | void resize_image(float(&x)[channels][height][width], int h, int w, float(&y)[channels][size][size]) { 8 | //(1) 9 | int w0 = w; 10 | int h0 = h; 11 | int w1 = size; 12 | int h1 = size; 13 | float fw = float(w0) / (w1); 14 | float fh = float(h0) / (h1); 15 | 16 | float image_max = 0.0; 17 | float image_min = x[0][0][0]; 18 | for (int i = 0; i < channels; i++) { 19 | for (int j = 0; j < height; j++) { 20 | for (int k = 0; k < width; k++) { 21 | if (x[i][j][k]>image_max) 22 | image_max = x[i][j][k]; 23 | if (x[i][j][k]para_offset_addr = para_offset_addr; 10 | this->weight_offset_addr = weight_offset_addr; 11 | this->data_in_offset_addr = data_in_offset_addr; 12 | this->ctrl_addr = ctrl_addr; 13 | } 14 | 15 | void acc_ctrl::write_weight( short int weight[][32], 16 | int weight_length) 17 | { 18 | cout <<"write weight start" << endl; 19 | int write_fd; 20 | int rc; 21 | int slot_id = 0; 22 | write_fd = fpga_dma_open_queue( FPGA_DMA_XDMA, slot_id,/*channel*/ 0, /*is_read*/ false); 23 | rc = fpga_dma_burst_write(write_fd, (uint8_t*)weight,weight_length,weight_offset_addr); 24 | if(write_fd >= 0) 25 | close(write_fd); 26 | cout <<"write weight finish" << endl; 27 | } 28 | 29 | void acc_ctrl::write_para( int* para_list, 30 | int para_length) 31 | { 32 | cout <<"write para start" << endl; 33 | int write_fd; 34 | int rc; 35 | int slot_id = 0; 36 | write_fd = fpga_dma_open_queue( FPGA_DMA_XDMA, slot_id,/*channel*/ 0, /*is_read*/ false); 37 | rc = fpga_dma_burst_write(write_fd, (uint8_t*)para_list,para_length,para_offset_addr); 38 | if(write_fd >= 0) 39 | close(write_fd); 40 | cout 
<<"write para finish" << endl; 41 | } 42 | 43 | void acc_ctrl::write_data( short int feature[][32], 44 | int feature_length) 45 | { 46 | cout <<"write data start"<= 0) 54 | close(write_fd); 55 | cout <<"write data finish"<= 0) 82 | { 83 | rc = fpga_pci_detach(pci_bar_handle); 84 | if (rc) 85 | { 86 | cout <<"Failure while detaching from the fpga."<= 0) 102 | { 103 | close(read_fd); 104 | } 105 | cout << "read finish" << endl; 106 | } 107 | 108 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/api_lib/src/acc_ctrl.cpp: -------------------------------------------------------------------------------- 1 | #include "acc_ctrl.h" 2 | 3 | using namespace std; 4 | 5 | const char *device_h2c = DEVICE_H2C; 6 | const char *device_c2h = DEVICE_C2H; 7 | const char *device_ctrl = DEVICE_CTRL; 8 | 9 | acc_ctrl::acc_ctrl( uint32_t para_offset_addr, 10 | uint32_t weight_offset_addr, 11 | uint32_t data_in_offset_addr, 12 | off_t ctrl_addr 13 | ) 14 | { 15 | this->para_offset_addr = para_offset_addr; 16 | this->weight_offset_addr = weight_offset_addr; 17 | this->data_in_offset_addr = data_in_offset_addr; 18 | this->ctrl_addr = ctrl_addr; 19 | } 20 | 21 | void acc_ctrl::write_weight( short int weight[][32], 22 | int weight_length) 23 | { 24 | cout <<"write weight start" << endl; 25 | int fpga_fd; 26 | int rc; 27 | off_t off; 28 | fpga_fd= open(device_h2c,O_RDWR); 29 | assert(fpga_fd >= 0); 30 | off = lseek(fpga_fd,weight_offset_addr,SEEK_SET); 31 | rc = write(fpga_fd, weight, weight_length); 32 | assert(rc == weight_length); 33 | close(fpga_fd); 34 | cout <<"write weight finish" << endl; 35 | } 36 | 37 | 38 | void acc_ctrl::write_para( int* para_list, 39 | int para_length) 40 | { 41 | cout <<"write para start" << endl; 42 | int fpga_fd; 43 | int rc; 44 | off_t off; 45 | fpga_fd= open(device_h2c,O_RDWR); 46 | assert(fpga_fd >= 0); 47 | off = lseek(fpga_fd,para_offset_addr,SEEK_SET); 48 | rc = write(fpga_fd, para_list, para_length); 49 | assert(rc == para_length); 50 | close(fpga_fd); 51 | cout <<"write para finish" << endl; 52 | } 53 | 54 | 55 | void acc_ctrl::write_data( short int feature[][32], 56 | int feature_length) 57 | { 58 | cout <<"write data start"<= 0); 64 | off = lseek(fpga_fd,data_in_offset_addr,SEEK_SET); 65 | rc = write(fpga_fd, feature, feature_length); 66 | assert(feature_length); 67 | close(fpga_fd); 68 | cout <<"write data finish"<= 0); 83 | cout <<"fpga_ctrl_open"<= 0); 128 | off = lseek(fpga_fd,0xC0000000,SEEK_SET); 129 | cout << "read finish00" << endl; 130 | rc = read(fpga_fd, feature, feature_length); 131 | assert(rc == feature_length); 132 | close(fpga_fd); 133 | cout << "read finish" << endl; 134 | } -------------------------------------------------------------------------------- /fpga_cnn/testbench/print_array.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by yaochen on 22/10/18. 
3 | // 4 | 5 | #ifndef _PRINT_ARRAY_H_ 6 | #define _PRINT_ARRAY_H_ 7 | 8 | #include 9 | #include "config.h" 10 | 11 | using namespace std; 12 | 13 | int print_array_3d(string array_name, int channel, int r_dim, int c_dim, data_type_itf *array){ 14 | 15 | ap_fixed<16,10> print_tmp = 0; 16 | cout << array_name << endl; 17 | 18 | for (int ch = 0; ch < channel; ch++) { 19 | cout << "print output channel: " << ch << endl; 20 | for (int j = 0; j < r_dim; j++) { 21 | for (int i = 0; i < c_dim; i++) { 22 | print_tmp.range(15,0) = array[j*r_dim + i].range(ch*16+15, ch*16); 23 | #if _HLS_MODE_ 24 | cout << print_tmp << " "; 25 | #else 26 | cout << print_tmp << " "; 27 | #endif 28 | } 29 | cout << endl; 30 | } 31 | cout << endl; 32 | } 33 | cout << endl; 34 | cout << endl; 35 | 36 | return 0; 37 | } 38 | 39 | int squeeze_input(string array_name, 40 | int channel, 41 | int r_dim, 42 | int c_dim, 43 | data_type_o *i_array, 44 | data_type_itf *o_array, 45 | bool dis_enable) { 46 | 47 | ap_fixed<16,10> print_tmp = 0; 48 | cout << array_name << endl; 49 | 50 | for (int ch = 0; ch < channel; ch++) { 51 | for (int i = 0; i < r_dim; i++) { 52 | for (int j = 0; j < c_dim; j++) { 53 | for (int wd = 0; wd <= ch && wd < 32; wd++) { 54 | 55 | o_array[i * r_dim + j].range((wd + 1) * 16 - 1, wd * 16) = i_array[i * r_dim + j].range(15, 0); 56 | #if _C_DEBUG_MODE_ 57 | if (dis_enable) {cout << setw(3) << i_array[i * 28 + j] << " ";} 58 | #else 59 | if (dis_enable) {cout << i_array[i * 28 + j] << " ";} 60 | #endif 61 | } 62 | } 63 | if (dis_enable) {cout << endl;} 64 | } 65 | if (dis_enable) {cout << endl;} 66 | } 67 | if (dis_enable) {cout << endl;} 68 | 69 | return 0; 70 | } 71 | 72 | int squeeze_weight(string array_name, 73 | int i_channel, 74 | int o_channel, 75 | int kernel_size, 76 | data_type_w *i_data, 77 | data_type_itf *o_array, 78 | bool dis_enable) { 79 | 80 | ap_fixed<16,10> print_tmp = 0; 81 | cout << array_name << endl; 82 | 83 | if (dis_enable) { 84 | cout << "Printing squeezed weight data ----------------------------- " << endl; 85 | } 86 | 87 | for (int i = 0; i < i_channel; i++){ 88 | for (int j = 0; j < o_channel; j++) { 89 | for (int k1 = 0; k1 < kernel_size; k1++) { 90 | for (int k2 = 0; k2 < kernel_size; k2++) { 91 | data_type_w w = *(i_data + i * o_channel * kernel_size * kernel_size + 92 | j * kernel_size * kernel_size + k1 * kernel_size + k2); 93 | // for(int ch = 0; ch < 32 && ch < i_channel; ch++){ 94 | // ap_fixed<16,10> w = 95 | // i_data[ch*o_channel*kernel_size*kernel_size + j*kernel_size*kernel_size + k1*kernel_size + k2]; 96 | o_array[i / 32 * o_channel * kernel_size * kernel_size + j * kernel_size * kernel_size + 97 | k1 * kernel_size + k2].range(i * 16 + 15, i * 16) = w.range(15, 0); 98 | if (dis_enable) { cout << w << " "; } 99 | } 100 | }if (dis_enable) { cout << endl; } 101 | }if (dis_enable) {cout << endl;} 102 | } 103 | if (dis_enable) {cout << endl;} 104 | 105 | if (dis_enable) { 106 | cout << "Finished printing squeezed weight data ----------------------------- " << endl; 107 | } 108 | 109 | return 0; 110 | 111 | } 112 | 113 | 114 | #endif //_PRINT_ARRAY_H_ 115 | -------------------------------------------------------------------------------- /netGenerator/dse/param_write.py: -------------------------------------------------------------------------------- 1 | def conv_param_write(conv_param_list, store_file): 2 | 3 | with open(store_file, "w") as wf: 4 | 5 | for i in range(0, len(conv_param_list)): 6 | for j in range(0, len(conv_param_list[i][1])): 7 | conv_param = "conv, 
data_type_itf, Tparam, data_type, data_type_w, data_type_o, " \ 8 | + str(conv_param_list[i][1][j][0]) + ", " \ 9 | + str(conv_param_list[i][1][j][1]) + ", " \ 10 | + str(conv_param_list[i][1][j][2]) + ", " \ 11 | + str(conv_param_list[i][1][j][2]) + ", 5, 5, 32, 32, 32" 12 | wf.write(conv_param + "\n") 13 | wf.close() 14 | 15 | 16 | def pool_param_write(pool_param_list, store_file): 17 | 18 | with open(store_file, "a+") as wf: 19 | # for i in range(0, len(parameters)): 20 | # for j in range(0, len(parameters[i][1])): 21 | # pool_param = "max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, " \ 22 | # + str(parameters[i][1][j][0]) + ", " \ 23 | # + str(parameters[i][1][j][1]) + ", " \ 24 | # + str(parameters[i][1][j][2]) + ", 2, 3" 25 | # wf.write(pool_param + "\n") 26 | for i in range(0, len(pool_param_list)): 27 | pool_param = "max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3" 28 | wf.write(pool_param + "\n") 29 | wf.close() 30 | 31 | 32 | def layer_acc_param_write(layer_acc_list, store_file): 33 | with open(store_file, "a+") as wf: 34 | conv_core_counter = 0 35 | pool_core_counter = 0 36 | for i in range(0, len(layer_acc_list)): 37 | pool_flag = False 38 | for j in range(0, len(layer_acc_list[i])): 39 | print(layer_acc_list[i]) 40 | print(layer_acc_list[i][j]) 41 | print(layer_acc_list[i][j][-1]) 42 | if layer_acc_list[i][j][-1] == True: 43 | pool_flag = True 44 | print("Found layer with pooling") 45 | # else: 46 | # pool_flag = False 47 | if pool_flag == True: 48 | layer_acc = "conv_pool, " + str(conv_core_counter) + "," + str(conv_core_counter) + "," + str( 49 | pool_core_counter) + "\n" 50 | conv_core_counter += 1 51 | pool_core_counter += 1 52 | else: 53 | layer_acc = "conv_pool, " + str(conv_core_counter) + "," + str(conv_core_counter) + "\n" 54 | conv_core_counter += 1 55 | wf.write(layer_acc) 56 | 57 | 58 | #TODO: write the sub_net function parameters into file 59 | def sub_param_write(subn_param_list, store_file): 60 | 61 | with open(store_file, "a+") as wf: 62 | for i in range(0, len(subn_param_list)): 63 | print(subn_param_list[i]) 64 | subn_param = "sub_net_" 65 | for j in range(0, 4): 66 | subn_param += str(subn_param_list[i][j])+"," 67 | for j in range(0, len(subn_param_list[i][4])): 68 | if j < len(subn_param_list[i][4])-1: 69 | subn_param += str(int(subn_param_list[i][4][j]))+"," 70 | else: 71 | subn_param += str(int(subn_param_list[i][4][j])) 72 | wf.write(subn_param + "\n") 73 | wf.close() 74 | 75 | 76 | def generate_param_file(conv_param_list, pool_param_list, layer_acc_list, subn_param_list, store_file): 77 | 78 | # with open(store_file, "w") as wf: 79 | 80 | # write the conv parameters 81 | # for i in range(0, len(parameters)): 82 | # for j in range(0, len(parameters[i][1])): 83 | conv_param_write(conv_param_list, store_file) 84 | pool_param_write(pool_param_list, store_file) 85 | layer_acc_param_write(layer_acc_list, store_file) 86 | sub_param_write(subn_param_list, store_file) 87 | 88 | 89 | # write the pooling parameters 90 | 91 | # wirte conv_pool function parameters 92 | 93 | 94 | if __name__ == "__main__": 95 | generate_param_file(conv_param_list, pool_param_list, layer_acc_list, subn_param_list, "acc_ins_params.txt") 96 | # conv_param_write(parameters) -------------------------------------------------------------------------------- /fpga_cnn/src/activation_functions.h: -------------------------------------------------------------------------------- 1 | //This file contains the popular activation functions 
used in CNNs 2 | //TODO: modify the commented function to be compatible with gcc compilation. 3 | //TODO: change the functions into class based expression. 4 | 5 | #ifndef _ACTIVATION_FUNCTIONS_H_ 6 | #define _ACTIVATION_FUNCTIONS_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "data_type.h" 12 | //using namespace std; 13 | /* 14 | identity = i; 15 | sigmod = s; 16 | relu = r; 17 | leaky_relu = l; 18 | elu = e; 19 | tan_h = t; 20 | tan_hp1m2 = h; 21 | */ 22 | template 23 | T relu(T data){ 24 | // return (T(0) >= data ? T(0) : data); 25 | if (data > T(0)){ 26 | //cout << "data in range " << data << " ====> " << data << endl; 27 | return data; 28 | } 29 | else { 30 | //cout << "data out range " << data << " ====> " << 0 << endl; 31 | return T(0); 32 | } 33 | } 34 | 35 | template 36 | ap_fixed RELU(ap_fixed data){ 37 | if (data > 0){ 38 | return data; 39 | } 40 | else 41 | return 0; 42 | }; 43 | 44 | ap_fixed<64, 32> Relu_64(ap_fixed<64,32> data){ 45 | if (data > 0){ 46 | return data; 47 | } 48 | else 49 | return 0; 50 | }; 51 | 52 | ap_fixed<32,16> Relu_32(ap_fixed<32,16> data){ 53 | if(data > 0) return data; 54 | else return 0; 55 | }; 56 | ap_fixed<24,16> Relu_24(ap_fixed<24,16> data){ 57 | if(data > 0) return data; 58 | else return 0; 59 | }; 60 | ap_fixed<20,16> Relu_20(ap_fixed<20,16> data){ 61 | if(data > 0) return data; 62 | else return 0; 63 | }; 64 | ap_fixed<16,12> Relu_16(ap_fixed<16,12> data){ 65 | if(data > 0) return data; 66 | else return 0; 67 | }; 68 | 69 | ap_fixed<8,4> Relu_8(ap_fixed<8,4> data){ 70 | if(data > 0) return data; 71 | else return 0; 72 | }; 73 | 74 | float f(char type, float data) { 75 | if (type == 'i') // identity 76 | { 77 | return data; 78 | } 79 | else if (type == 's') { // sigmod 80 | return float(1) / (float(1) + exp(-data)); 81 | } 82 | else if (type == 'r') { //relu 83 | return ((float(0) > data) ? float(0) : data); 84 | } 85 | else if (type == 'l') { //leak_relu 86 | return (data > float(0)) ? data : float(0.01) * data; 87 | } 88 | else if (type == 'e') { // elu 89 | return (data float(0) ? float(1) : float(0); 128 | } 129 | else if (type == "leaky_relu") { 130 | return data > float(0) ? float(1) : float(0.01); 131 | } 132 | else if (type == "elu") { 133 | return (data > float(0) ? float(1) : (float(1) + data)); 134 | } 135 | else if (type == "tan_h") { 136 | return float(1) - data*data; 137 | } 138 | else if (type == "tan_hp1m2") { 139 | return 2 * data *(float(1) - data); 140 | } 141 | else return false; 142 | } 143 | */ 144 | //vec_t df(string& type, const vec_t& y, uint index) { 145 | // vec_t v(0, 0); 146 | // if (type == "softmax") 147 | // { 148 | // vec_t v(y.size(), 0); 149 | // for (uint i = 0; i < y.size(); i++) 150 | // v[i] = (i == index) ? 
y[index] * (float(1) - y[index]) : -y[i] * y[index]; 151 | // 152 | // return v; 153 | // } 154 | // else return v; 155 | //} 156 | 157 | #endif 158 | -------------------------------------------------------------------------------- /fpga_cnn/src/construct_net.h: -------------------------------------------------------------------------------- 1 | #ifndef _CONSTRUCT_NET_H_ 2 | #define _CONSTRUCT_NET_H_ 3 | 4 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 6 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 7 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 8 | 9 | #include "config.h" 10 | #include 11 | #include 12 | #include "acc_instance.h" 13 | using namespace std; 14 | 15 | void sub_net_0( 16 | Tparam param_port[1024], 17 | ap_fixed<32,26> bias_in[4096], 18 | data_type_itf weight_in[131072], 19 | data_type_itf data_in_0[65536], 20 | data_type_itf data_out_0[32768], 21 | data_type_itf data_in_1[32768], 22 | data_type_itf data_out_1[4096], 23 | int select 24 | ) 25 | { 26 | 27 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 28 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 29 | 30 | #pragma HLS INTERFACE s_axilite port=param_port bundle=CRTL_BUS 31 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth=1024 bundle=PARAM_IN 32 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 33 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave depth=4096 bundle=BIAS_IN 34 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 35 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth=131072 bundle=WEIGHT_IN 36 | 37 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 38 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth=65536 bundle=DATA_IN 39 | 40 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 41 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 42 | #pragma HLS INTERFACE bram port=data_out_1 43 | 44 | 45 | int acc0_mem_inport_offset = 0; 46 | int acc0_mem_outport_offset = 0; 47 | int acc1_mem_inport_offset = 0; 48 | int acc1_mem_outport_offset = 0; 49 | 50 | 51 | if (select == 0) 52 | { 53 | acc0_mem_inport_offset = 0; 54 | acc0_mem_outport_offset = 0; 55 | acc1_mem_inport_offset = 16384; 56 | acc1_mem_outport_offset = 2048; 57 | } 58 | else 59 | { 60 | acc0_mem_inport_offset = 16384; 61 | acc0_mem_outport_offset = 16384; 62 | acc1_mem_inport_offset = 0; 63 | acc1_mem_outport_offset = 0; 64 | } 65 | 66 | conv_pool_acc_0(param_port, bias_in, weight_in, data_in_0 + acc0_mem_inport_offset, data_out_0 + acc0_mem_outport_offset); 67 | conv_pool_acc_1(param_port + 256, bias_in+256, weight_in, data_in_1 + acc1_mem_inport_offset, data_out_1 + acc1_mem_outport_offset); 68 | 69 | }; 70 | 71 | 72 | void sub_net_1( 73 | Tparam param_port[1024], 74 | ap_fixed<32,26> bias_in[4096], 75 | data_type_itf weight_in[131072], 76 | data_type_itf data_in_0[65536], 77 | data_type_itf data_out_0[32768], 78 | data_type_itf data_in_1[32768], 79 | data_type_itf data_out_1[4096], 80 | int select 81 | ) 82 | { 83 | 84 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 85 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 86 | 87 | #pragma HLS INTERFACE s_axilite port=param_port bundle=CRTL_BUS 88 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth=1024 bundle=PARAM_IN 89 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 90 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave 
depth=4096 bundle=BIAS_IN 91 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 92 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth=131072 bundle=WEIGHT_IN 93 | 94 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 95 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth=65536 bundle=DATA_IN 96 | 97 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 98 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 99 | #pragma HLS INTERFACE bram port=data_out_1 100 | 101 | 102 | int acc0_mem_inport_offset = 0; 103 | int acc0_mem_outport_offset = 0; 104 | int acc1_mem_inport_offset = 0; 105 | int acc1_mem_outport_offset = 0; 106 | 107 | 108 | if (select == 0) 109 | { 110 | acc0_mem_inport_offset = 0; 111 | acc0_mem_outport_offset = 0; 112 | acc1_mem_inport_offset = 16384; 113 | acc1_mem_outport_offset = 2048; 114 | } 115 | else 116 | { 117 | acc0_mem_inport_offset = 16384; 118 | acc0_mem_outport_offset = 16384; 119 | acc1_mem_inport_offset = 0; 120 | acc1_mem_outport_offset = 0; 121 | } 122 | 123 | conv_pool_acc_0(param_port, bias_in, weight_in, data_in_0 + acc0_mem_inport_offset, data_out_0 + acc0_mem_outport_offset); 124 | conv_pool_acc_1(param_port + 256, bias_in+256, weight_in, data_in_1 + acc1_mem_inport_offset, data_out_1 + acc1_mem_outport_offset); 125 | 126 | }; 127 | 128 | 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /netGenerator/alex.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Input" 5 | top: "data" 6 | input_param { shape: { dim: 10 dim: 3 dim: 227 dim: 227 } } 7 | } 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 96 23 | kernel_size: 11 24 | stride: 4 25 | } 26 | } 27 | layer { 28 | name: "relu1" 29 | type: "ReLU" 30 | bottom: "conv1" 31 | top: "conv1" 32 | } 33 | layer { 34 | name: "norm1" 35 | type: "LRN" 36 | bottom: "conv1" 37 | top: "norm1" 38 | lrn_param { 39 | local_size: 5 40 | alpha: 0.0001 41 | beta: 0.75 42 | } 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "norm1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | layer { 56 | name: "conv2" 57 | type: "Convolution" 58 | bottom: "pool1" 59 | top: "conv2" 60 | param { 61 | lr_mult: 1 62 | decay_mult: 1 63 | } 64 | param { 65 | lr_mult: 2 66 | decay_mult: 0 67 | } 68 | convolution_param { 69 | num_output: 256 70 | pad: 2 71 | kernel_size: 5 72 | group: 2 73 | } 74 | } 75 | layer { 76 | name: "relu2" 77 | type: "ReLU" 78 | bottom: "conv2" 79 | top: "conv2" 80 | } 81 | layer { 82 | name: "norm2" 83 | type: "LRN" 84 | bottom: "conv2" 85 | top: "norm2" 86 | lrn_param { 87 | local_size: 5 88 | alpha: 0.0001 89 | beta: 0.75 90 | } 91 | } 92 | layer { 93 | name: "pool2" 94 | type: "Pooling" 95 | bottom: "norm2" 96 | top: "pool2" 97 | pooling_param { 98 | pool: MAX 99 | kernel_size: 3 100 | stride: 2 101 | } 102 | } 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 1 110 | decay_mult: 1 111 | } 112 | param { 113 | lr_mult: 2 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 384 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 
| layer { 123 | name: "relu3" 124 | type: "ReLU" 125 | bottom: "conv3" 126 | top: "conv3" 127 | } 128 | layer { 129 | name: "conv4" 130 | type: "Convolution" 131 | bottom: "conv3" 132 | top: "conv4" 133 | param { 134 | lr_mult: 1 135 | decay_mult: 1 136 | } 137 | param { 138 | lr_mult: 2 139 | decay_mult: 0 140 | } 141 | convolution_param { 142 | num_output: 384 143 | pad: 1 144 | kernel_size: 3 145 | group: 2 146 | } 147 | } 148 | layer { 149 | name: "relu4" 150 | type: "ReLU" 151 | bottom: "conv4" 152 | top: "conv4" 153 | } 154 | layer { 155 | name: "conv5" 156 | type: "Convolution" 157 | bottom: "conv4" 158 | top: "conv5" 159 | param { 160 | lr_mult: 1 161 | decay_mult: 1 162 | } 163 | param { 164 | lr_mult: 2 165 | decay_mult: 0 166 | } 167 | convolution_param { 168 | num_output: 256 169 | pad: 1 170 | kernel_size: 3 171 | group: 2 172 | } 173 | } 174 | layer { 175 | name: "relu5" 176 | type: "ReLU" 177 | bottom: "conv5" 178 | top: "conv5" 179 | } 180 | layer { 181 | name: "pool5" 182 | type: "Pooling" 183 | bottom: "conv5" 184 | top: "pool5" 185 | pooling_param { 186 | pool: MAX 187 | kernel_size: 3 188 | stride: 2 189 | } 190 | } 191 | layer { 192 | name: "fc6" 193 | type: "InnerProduct" 194 | bottom: "pool5" 195 | top: "fc6" 196 | param { 197 | lr_mult: 1 198 | decay_mult: 1 199 | } 200 | param { 201 | lr_mult: 2 202 | decay_mult: 0 203 | } 204 | inner_product_param { 205 | num_output: 4096 206 | } 207 | } 208 | layer { 209 | name: "relu6" 210 | type: "ReLU" 211 | bottom: "fc6" 212 | top: "fc6" 213 | } 214 | layer { 215 | name: "drop6" 216 | type: "Dropout" 217 | bottom: "fc6" 218 | top: "fc6" 219 | dropout_param { 220 | dropout_ratio: 0.5 221 | } 222 | } 223 | layer { 224 | name: "fc7" 225 | type: "InnerProduct" 226 | bottom: "fc6" 227 | top: "fc7" 228 | param { 229 | lr_mult: 1 230 | decay_mult: 1 231 | } 232 | param { 233 | lr_mult: 2 234 | decay_mult: 0 235 | } 236 | inner_product_param { 237 | num_output: 4096 238 | } 239 | } 240 | layer { 241 | name: "relu7" 242 | type: "ReLU" 243 | bottom: "fc7" 244 | top: "fc7" 245 | } 246 | layer { 247 | name: "drop7" 248 | type: "Dropout" 249 | bottom: "fc7" 250 | top: "fc7" 251 | dropout_param { 252 | dropout_ratio: 0.5 253 | } 254 | } 255 | layer { 256 | name: "fc8" 257 | type: "InnerProduct" 258 | bottom: "fc7" 259 | top: "fc8" 260 | param { 261 | lr_mult: 1 262 | decay_mult: 1 263 | } 264 | param { 265 | lr_mult: 2 266 | decay_mult: 0 267 | } 268 | inner_product_param { 269 | num_output: 1000 270 | } 271 | } 272 | layer { 273 | name: "prob" 274 | type: "Softmax" 275 | bottom: "fc8" 276 | top: "prob" 277 | } 278 | -------------------------------------------------------------------------------- /netGenerator/dse/global_search.py: -------------------------------------------------------------------------------- 1 | 2 | import helping_functions 3 | import sys 4 | import math 5 | from model_partition import partition_to_k 6 | from model_split import model_split_by_list 7 | from model_split import gop_calculate 8 | import pprint 9 | import threading 10 | import multiprocessing 11 | import time 12 | from local_search import local_search 13 | 14 | result_Q = multiprocessing.Queue() 15 | PROCESS_NUM = 4 16 | 17 | 18 | class SearchProcess(multiprocessing.Process): 19 | def __init__(self, param_v, processIdx, result_Q): 20 | multiprocessing.Process.__init__(self) 21 | self.layer_list = param_v[0] 22 | self.acc_cluster_num = param_v[1] 23 | self.conv_N = param_v[2] 24 | self.conv_M = param_v[3] 25 | self.conv_r = param_v[4] 26 | self.conv_R = 
param_v[5] 27 | self.conv_K = param_v[6] 28 | self.conv_S = param_v[7] 29 | self.flag = param_v[8] 30 | self.overall_lat = param_v[9] 31 | self.processIdx = processIdx 32 | self.result_Q = result_Q 33 | 34 | def run(self): 35 | 36 | start = time.time() 37 | process_gop_list = [] 38 | process_item_list = [] 39 | process_util_list = [] 40 | process_pair_list = [] 41 | 42 | search_counter = 0 43 | 44 | print("Process " + str(self.processIdx) + " starts global search.") 45 | 46 | for idx, item in enumerate(partition_to_k(self.layer_list, self.acc_cluster_num, False), 0): 47 | if idx % PROCESS_NUM == self.processIdx: 48 | sub_gop_list = [] 49 | search_counter = search_counter + 1 50 | sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ 51 | = model_split_by_list(self.conv_N, self.conv_M, self.conv_r, self.conv_R, self.conv_K, self.conv_S, self.flag, item) 52 | sub_pair_list, sub_lat_list, sub_util_list = \ 53 | local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 54 | 55 | for i in range(0, len(sub_conv_N)): 56 | sub_gop_list.append(gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i], sub_conv_K[i])) 57 | 58 | if max(sub_lat_list) < self.overall_lat: 59 | overall_lat = max(sub_lat_list) 60 | if len(process_pair_list) < 6: 61 | process_item_list.append(item) 62 | process_pair_list.append(sub_pair_list) 63 | # process_pair_list.append([overall_lat]) 64 | process_util_list.append([overall_lat]) 65 | process_gop_list.append(sub_gop_list) 66 | # process_util_list.append(sub_util_list) 67 | # process_pair_list.append(sub_util_list) 68 | # else: 69 | # max_among_mins = process_pair_list.index(max(overall_lat)) 70 | # process_pair_list.remove(process_pair_list[max_among_mins]) 71 | # process_pair_list.append(sub_pair_list) 72 | # process_pair_list.append([overall_lat]) 73 | # process_pair_list.append(sub_util_list) 74 | 75 | # print "For set ID: " + str(idx) + ", the final explored points = ", search_counter 76 | 77 | if len(process_pair_list) != 0: 78 | self.result_Q.put((process_pair_list, process_item_list, process_gop_list, process_util_list)) 79 | 80 | end = time.time() 81 | print("Thread ", self.processIdx, " :", (end - start)) 82 | 83 | 84 | def global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat): 85 | """ 86 | :param layer_list: a list containing each layer information in the form of a tuple (layer index, layer name). 
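        (candidate partitions of layer_list into acc_cluster_num groups are enumerated
        by partition_to_k() and scored in parallel by PROCESS_NUM worker processes,
        each process handling every PROCESS_NUM-th partition)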
87 | :param acc_cluster_num: 88 | :param conv_N: 89 | :param conv_M: 90 | :param conv_r: 91 | :param conv_R: 92 | :param conv_K: 93 | :param conv_S: 94 | :param flag: 95 | :param pair_list: 96 | :param overall_lat: 97 | :return: 98 | """ 99 | sub_conv_N = [] 100 | sub_conv_M = [] 101 | sub_conv_r = [] 102 | sub_conv_R = [] 103 | sub_conv_K = [] 104 | sub_conv_S = [] 105 | sub_flag = [] 106 | sub_pair_list = [] 107 | sub_lat_list = [] 108 | sub_util_list = [] 109 | 110 | gop_list = [] 111 | item_list = [] 112 | util_list = [] 113 | pair_list = [] 114 | 115 | processes = [] 116 | for i in range(PROCESS_NUM): 117 | p = SearchProcess((layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat), i, result_Q) 118 | processes.append(p) 119 | 120 | for p in processes: 121 | p.start() 122 | 123 | for p in processes: 124 | p.join() 125 | 126 | results = list() 127 | while not result_Q.empty(): 128 | results.append(result_Q.get()) 129 | for item in results: 130 | pair_list = pair_list + item[0] 131 | item_list = item_list + item[1] 132 | gop_list = gop_list + item[2] 133 | util_list = util_list + item[3] 134 | 135 | return pair_list, item_list, gop_list, util_list 136 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/fc_validate.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "fc_validate.h" 6 | 7 | using namespace std; 8 | 9 | fc_validate::fc_validate(int layer_num, int num_input, int num_output, int act) { 10 | int i, j, k; 11 | this->layer_num = layer_num; 12 | this->num_input = num_input; 13 | this->num_output = num_output; 14 | this->act = act; 15 | 16 | this->lnum_list[0] = layer_num; 17 | 18 | this->config_list[0] = num_input; 19 | this->config_list[1] = 0; 20 | this->config_list[2] = num_output; 21 | this->config_list[3] = 0; 22 | this->config_list[4] = 0; 23 | this->config_list[5] = 0; 24 | this->config_list[6] = 0; 25 | this->config_list[7] = 0; 26 | this->config_list[8] = 0; 27 | this->config_list[9] = 0; 28 | this->config_list[10] = 0; 29 | this->config_list[11] = 0; 30 | this->config_list[12] = 0; 31 | this->config_list[13] = 0; 32 | this->config_list[14] = 0; 33 | this->config_list[15] = 0; 34 | 35 | // ap_int<512> weight[32]; 36 | // ap_int<512> in_feature[32]; 37 | // ap_int<512> bias[32]; 38 | // ap_int<512> out_feature[32]; 39 | // ap_int<512> out_feature_software[32]; 40 | 41 | weight = new ap_int<512>[1024*1024]; 42 | in_feature = new ap_int<512>[1024]; 43 | // bias = new ap_fixed<32,26>[num_output]; 44 | out_feature = new ap_int<512>[1024]; 45 | out_feature_software = new ap_int<512>[1024]; 46 | 47 | 48 | // initial weight data with random numbers 49 | for(i = 0; i < (num_input/32)*num_output; i++){ 50 | for(j = 0; j < 32; j++){ 51 | weight[i].range(16*j+15, 16*j) = rand()%10 - 5; // rand()%10 - 5 52 | } 53 | } 54 | for(i = (num_input/32) * num_output; i < ((int)(ceil((float)num_input/32))) * num_output; i++) 55 | { 56 | for(j = 0 ; j < num_input % 32; j++) 57 | weight[i].range(15+16*j,16*j) = rand()%10 - 5; //rand()%10 - 5; 58 | for(j = num_input % 32 ; j < 32; j++) 59 | weight[i].range(15+16*j,16*j) = 0; 60 | } 61 | 62 | // initial input data with random numbers 63 | for(i = 0 ; i < num_input/32; i++){ 64 | for(j = 0 ; j < 32; j++){ 65 | in_feature[i].range(15+16*j,16*j) = (rand()%2 -1) * 64; //(rand()%2 -1) * 64 66 | } 67 | } 68 | for(i = num_input/32 ; i < (int)(ceil(float(num_input)/32)); i++){ 
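    // tail 512-bit word: each ap_int<512> packs 32 16-bit lanes, so when num_input is not
    // a multiple of 32 only the low (num_input % 32) lanes carry data; the rest are zeroed below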
69 | for(j = 0 ; j < num_input % 32; j++){ 70 | in_feature[i].range(15+16*j, 16*j) = (rand()%2 - 1) * 64; // (rand()%2 - 1) * 64 71 | } 72 | for(j = num_input % 32; j < 32; j++){ 73 | in_feature[i].range(15+16*j, 16*j) = 0; 74 | } 75 | } 76 | 77 | // initial bias data with random numbers 78 | for(i = 0; i < num_output/32; i++){ 79 | for(j = 0; j < 32; j++){ 80 | bias[i].range(15+16*j, 16*j) = 0*64; // (rand()%2) * 64 81 | } 82 | } 83 | } 84 | 85 | void fc_validate::print_feature_in(void){ 86 | int i,j; 87 | cout << "fc input feature:" << endl; 88 | for(i = 0 ; i < (int)(ceil(float(num_input)/32)); i++) 89 | { 90 | cout < 2 | #include 3 | #include 4 | #include "conv_validate.h" 5 | using namespace std; 6 | 7 | 8 | conv_validate::conv_validate(ap_uint<32>* param_list) 9 | { 10 | int i,j,k; 11 | int input_num; 12 | int input_feature_size; 13 | int output_num; 14 | 15 | this->param_list = param_list; 16 | input_num = param_list[16+0]; 17 | input_feature_size = param_list[16+3]; 18 | 19 | layer_num = param_list[0]; 20 | for(i = 0 ; i < 4096; i++) 21 | input_feature[i] = 0; 22 | for(i = 0 ; i < 2048; i++) 23 | weight[i] = 0; 24 | for(i = 0 ; i < 4096; i++) 25 | output_feature[i] = 0; 26 | for(i = 0 ; i < 1024; i++) 27 | bias[i] = 0; 28 | 29 | 30 | for(i = 0 ; i < 1;i++) 31 | { 32 | for(j = 0 ; j < 16 * 16;j++) 33 | input_feature[j].range(15+16*i,16*i) = 64; 34 | } 35 | 36 | 37 | 38 | 39 | for(i = 0 ; i < 9;i++) 40 | { 41 | weight[i].range(15,0) = 16; 42 | } 43 | 44 | 45 | for(i = 0 ; i < 9;i++) 46 | { 47 | weight[i].range(16+15,16+0) = 0; 48 | } 49 | 50 | for(i = 0 ; i < 9;i++) 51 | { 52 | weight[i].range(16*2+15,16*2+0) = 0; 53 | } 54 | 55 | 56 | 57 | 58 | // for(j = 0 ; j < 8; j++) 59 | // for(i = 0 ; i < 72;i++) 60 | // { 61 | // weight[i].range(15+16*j,0+16*j) = 64; 62 | // } 63 | 64 | 65 | 66 | // for(i = 18 ; i < 27;i++) 67 | // { 68 | // weight[i].range(15,0) = 64; 69 | // } 70 | } 71 | 72 | 73 | 74 | 75 | void conv_validate :: print_feature_in(void) 76 | { 77 | int i,j; 78 | cout << "input feature:" << endl; 79 | for(i = 0 ; i <12*12; i++) 80 | { 81 | cout < 2 | 3 | //Optional 4 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 5 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 6 | 7 | #include "ap_fixed.h" 8 | #include "construct_net.h" 9 | #include "../testbench/conv_validate.h" 10 | #include "../testbench/pooling_validate.h" 11 | #include "../testbench/fc_validate.h" 12 | 13 | using namespace std; 14 | 15 | void construct_para(ap_uint<32>* para); 16 | void print_para_list(ap_uint<32>* para); 17 | 18 | int main() 19 | { 20 | int i,j,k; 21 | cout <<"Test Begin..."< para_list[512]; 24 | ap_int<512> data_temp[2048]; 25 | ap_int<512> data_temp2[2048]; 26 | ap_int<512> data_temp3[2048]; 27 | ap_int<512> data_temp4[2048]; 28 | ap_int<512> data_temp5[2048]; 29 | ap_int<512> data_temp6[2048]; 30 | 31 | for(i = 0 ; i < 512; i++) 32 | para_list[i] = 0; 33 | for(i = 0 ; i < 2048; i++) 34 | { 35 | data_temp[i] = 0; 36 | data_temp2[i] = 0; 37 | data_temp3[i] = 0; 38 | data_temp4[i] = 0; 39 | data_temp5[i] = 0; 40 | data_temp6[i] = 0; 41 | } 42 | 43 | 44 | construct_para(para_list); 45 | print_para_list(para_list); 46 | conv_validate conv_test(para_list); 47 | 48 | 49 | sub_net_0( 50 | para_list, //ap_uint<32> param_port[528], 51 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 52 | conv_test.weight, //data_type_itf weight_in[2048], 53 | conv_test.input_feature, //data_type_itf data_in_0[2048], 54 | data_temp, //data_type_itf data_out_0[2048], 55 | // data_temp, //data_type_itf 
data_in_1[2048], 56 | //conv_test.output_feature, //data_type_itf data_out_1[2048], 57 | // data_temp2, 58 | 0 //int select 59 | ); 60 | 61 | /* 62 | sub_net_1( 63 | para_list, //ap_uint<32> param_port[528], 64 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 65 | conv_test.weight, //data_type_itf weight_in[2048], 66 | conv_test.input_feature, //data_type_itf data_in_0[2048], 67 | data_temp, //data_type_itf data_out_0[2048], 68 | data_temp, //data_type_itf data_in_1[2048], 69 | data_temp2,//conv_test.output_feature, //data_type_itf data_out_1[2048], 70 | 1 //int select 71 | ); 72 | 73 | 74 | sub_net_2( 75 | para_list, //ap_uint<32> param_port[528], 76 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 77 | conv_test.weight, //data_type_itf weight_in[2048], 78 | data_temp2, //data_type_itf data_in_0[2048], 79 | data_temp3, //data_type_itf data_out_0[2048], 80 | data_temp3, //data_type_itf data_in_1[2048], 81 | data_temp4,//conv_test.output_feature, //data_type_itf data_out_1[2048], 82 | 0 //int select 83 | ); 84 | */ 85 | 86 | cout <<"Test Finish"<* para) 92 | { 93 | int i; 94 | //0-1.layer_num 95 | para[0] = 1; 96 | //0-2.conv para 97 | para[16+0] = 1;//N 98 | para[16+1] = 3;//K 99 | para[16+2] = 1;//M 100 | para[16+3] = 16;//Rin 101 | para[16+4] = 16;//Cin 102 | para[16+5] = 16;//R 103 | para[16+6] = 16;//C 104 | para[16+7] = 1;//S 105 | para[16+8] = 1;//P 106 | para[16+9] = 1;//act 107 | para[16+10] = 0;//weight_offset 108 | para[16+11] = 0;//bias_offset 109 | para[16+12] = 0;//in_offset 110 | para[16+13] = 0;//out_offset 111 | para[16+14] = 0;//inport 112 | para[16+15] = 0; 113 | //0-3.conv para 114 | para[32+0] = 192;//N 115 | para[32+1] = 3;//K 116 | para[32+2] = 128;//M 117 | para[32+3] = 13;//Rin 118 | para[32+4] = 13;//Cin 119 | para[32+5] = 13;//R 120 | para[32+6] = 13;//C 121 | para[32+7] = 1;//S 122 | para[32+8] = 1;//P 123 | para[32+9] = 1;//act 124 | para[32+10] = 0;//weight_offset 125 | para[32+11] = 0;//bias_offset 126 | para[32+12] = 0;//in_offset 127 | para[32+13] = 512;//out_offset 128 | para[32+14] = 1;//inport 129 | para[32+15] = 0; 130 | //0-4.conv para 131 | para[48+0] = 1;//N 132 | para[48+1] = 3;//K 133 | para[48+2] = 1;//M 134 | para[48+3] = 16;//Rin 135 | para[48+4] = 16;//Cin 136 | para[48+5] = 16;//R 137 | para[48+6] = 16;//C 138 | para[48+7] = 1;//S 139 | para[48+8] = 1;//P 140 | para[48+9] = 1;//act 141 | para[48+10] = 0;//weight_offset 142 | para[48+11] = 0;//bias_offset 143 | para[48+12] = 512;//in_offset 144 | para[48+13] = 0;//out_offset 145 | para[48+14] = 1;//inport 146 | para[48+15] = 0; 147 | 148 | 149 | 150 | 151 | 152 | //1-1.layer_num 153 | para[256+0] = 1; 154 | //1-2.conv_para 155 | para[256+16+0] = 1; //N 156 | para[256+16+1] = 3; //K 157 | para[256+16+2] = 1; //M 158 | para[256+16+3] = 16; //Rin 159 | para[256+16+4] = 16; //Cin 160 | para[256+16+5] = 16; //R 161 | para[256+16+6] = 16; //C 162 | para[256+16+7] = 1; //S 163 | para[256+16+8] = 1; //P 164 | para[256+16+9] = 1; //act 165 | para[256+16+10] = 0; //weight_offset 166 | para[256+16+11] = 0; //bias_offset 167 | para[256+16+12] = 0; //in_offset 168 | para[256+16+13] = 0; //out_offset 169 | para[256+16+14] = 0; //inport 170 | para[256+16+15] = 0; 171 | 172 | //1-3.conv_para 173 | para[256+32+0] = 1; //N 174 | para[256+32+1] = 3; //K 175 | para[256+32+2] = 1; //M 176 | para[256+32+3] = 16; //Rin 177 | para[256+32+4] = 16; //Cin 178 | para[256+32+5] = 16; //R 179 | para[256+32+6] = 16; //C 180 | para[256+32+7] = 1; //S 181 | para[256+32+8] = 1; //P 182 | para[256+32+9] = 1; //act 183 | 
para[256+32+10] = 0; //weight_offset 184 | para[256+32+11] = 0; //bias_offset 185 | para[256+32+12] = 0; //in_offset 186 | para[256+32+13] = 512; //out_offset 187 | para[256+32+14] = 1; //inport 188 | para[256+32+15] = 0; 189 | 190 | //1-4.conv_para 191 | para[256+48+0] = 1; //N 192 | para[256+48+1] = 3; //K 193 | para[256+48+2] = 1; //M 194 | para[256+48+3] = 16; //Rin 195 | para[256+48+4] = 16; //Cin 196 | para[256+48+5] = 16; //R 197 | para[256+48+6] = 16; //C 198 | para[256+48+7] = 1; //S 199 | para[256+48+8] = 1; //P 200 | para[256+48+9] = 1; //act 201 | para[256+48+10] = 0; //weight_offset 202 | para[256+48+11] = 0; //bias_offset 203 | para[256+48+12] = 512; //in_offset 204 | para[256+48+13] = 0; //out_offset 205 | para[256+48+14] = 1; //inport 206 | para[256+48+15] = 0; 207 | 208 | 209 | 210 | 211 | 212 | 213 | } 214 | void print_para_list(ap_uint<32>* para) 215 | { 216 | int i,j; 217 | 218 | for(i = 0 ; i < 32; i++) 219 | { 220 | for(j = 0 ; j < 16; j++) 221 | cout << para[i*16+j] <<" "; 222 | cout << endl; 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /fpga_cnn/src/max_pool_acc_innerpp.h: -------------------------------------------------------------------------------- 1 | #ifndef _MAX_POOL_ACC_H_ 2 | #define _MAX_POOL_ACC_H_ 3 | 4 | #include 5 | #include 6 | #include "activation_functions.h" 7 | using namespace std; 8 | 9 | template 10 | class max_pool_acc { 11 | 12 | private: 13 | int pool_layer_number; 14 | 15 | public: 16 | max_pool_acc() : pool_layer_number(0) {pool_layer_number = 0;}; 17 | 18 | // Tn << 32 && N << 32 19 | void in_buf_load_512( 20 | ap_fixed<16,10> buf[][(pTr - 1) * pS_max + pK_max][(pTc - 1) * pS_max + pK_max], 21 | ap_int<512>* i_data, 22 | Tparam in_offset, 23 | Tparam n, Tparam r, Tparam c, Tparam S, Tparam K, Tparam P, 24 | Tparam R, Tparam C, Tparam N, Tparam R_IN, Tparam C_IN, Tparam TR, Tparam TC) 25 | { 26 | ap_int<512> data_tmp = 0; 27 | // valid data portion 28 | for (int j = r * S - P; j < r * S + TR - P; j++) 29 | { 30 | for (int k = c * S - P; k < c * S + TC - P; k++) 31 | { 32 | #pragma HLS PIPELINE 33 | for (int i = 0; i < pTn; i+=32) 34 | { 35 | #pragma HLS UNROLL 36 | if ((i + n >= N) || j < 0 || j >= (R_IN - 2 * P) || k < 0 || k >= (C_IN - 2 * P)) 37 | { 38 | for (int wr = 0; wr < pTn; wr++) 39 | { 40 | #pragma HLS UNROLL 41 | buf[i + wr][j - r * S + P][k - c * S + P] = pT(0); 42 | } 43 | } 44 | else 45 | { 46 | data_tmp = *(i_data + in_offset + (i + n)/32 * (R_IN - 2 * P) * (C_IN - 2 * P) + j * (R_IN - 2 * P) + k); 47 | for (int wr = 0; wr < pTn; wr++) 48 | { 49 | #pragma HLS UNROLL 50 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range((wr + 1) * 16 - 1, (wr) * 16); 51 | } 52 | } 53 | } 54 | } 55 | } 56 | } 57 | 58 | // Max pooling computation kernel 59 | void pool_engine(pT in_buf[][(pTr-1)*pS_max + pK_max][(pTc-1)*pS_max + pK_max], 60 | pG out_buf[][pTr][pTc], 61 | Tparam S, Tparam n, Tparam r, Tparam c, Tparam K, Tparam R, Tparam C, Tparam TR, Tparam TC) { 62 | if (n >= 0) { 63 | for (int i = 0; i < K; i++) { 64 | for (int j = 0; j < K; j++) { 65 | for (int tr = 0; tr < pTr && tr + r < R && (S * tr + i) < TR; tr++) { 66 | for (int tc = 0; tc < pTc && tc + c < C && (S * tc + j) < TC; tc++) { 67 | #pragma HLS PIPELINE 68 | for (int tn = 0; tn < pTn; tn++) { 69 | #pragma HLS UNROLL 70 | out_buf[tn][tr][tc] = (i == 0 && j == 0) ? in_buf[tn][S * tr][S * tc] 71 | : ((out_buf[tn][tr][tc] 72 | > in_buf[tn][S * tr + i][S * tc + j]) 73 | ? 
out_buf[tn][tr][tc] 74 | : in_buf[tn][S * tr + i][S * tc + j]); 75 | } 76 | } 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | // Ouput out_buf data to output interface 84 | void output_res_512(pG out_buf[][pTr][pTc], 85 | Itf *out_data, 86 | Tparam out_offset, 87 | Tparam n, Tparam r, Tparam c, Tparam N, Tparam R, Tparam C, bool act) { 88 | 89 | Itf out_tmp = 0; 90 | Itf ex_out_tmp = 0; 91 | pG tmp = 0; 92 | pG tmp_outbuf = 0; 93 | if (n >= 0) { 94 | for (int j = r; j < r + pTr && j < R; j++) 95 | { 96 | for (int k = c; k < c + pTc && k < C; k++) 97 | { 98 | #pragma HLS PIPELINE 99 | for (int i = 0; i < pTn; i += 32) 100 | { 101 | if(i < N - n) 102 | { 103 | for (int wr = 0; wr < (pTn<32?pTn:32); wr++) 104 | { 105 | #pragma HLS UNROLL 106 | tmp_outbuf = RELU(out_buf[i + wr][j - r][k - c]); 107 | out_tmp.range(16 * (wr + 1) - 1, 16 * wr) = tmp_outbuf.range(15,0); 108 | } 109 | *(out_data + out_offset + ((i + n)/32)*R*C + j*C + k) = out_tmp; 110 | } 111 | } 112 | } 113 | } 114 | } 115 | } 116 | ///////////////////////------------------conv accelerator----------------////////////////////////// 117 | void max_pool_layer_mbuf( 118 | Tparam R_IN, // input Row 119 | Tparam C_IN, // input column 120 | Tparam N, //input feature number 121 | Tparam K, //input kernel size 122 | Tparam R, // output Row 123 | Tparam C, // output column 124 | Tparam S, // stride size 125 | Tparam P, // padding size 126 | Tparam act, // activation function bit (1-- with act, 0--without act) 127 | Tparam i_offset, 128 | Tparam o_offset, 129 | Itf *i_data, 130 | Itf *o_data) 131 | { 132 | 133 | Tparam TR=0; 134 | Tparam TC=0; 135 | ap_fixed<16,10> in_buf_0[pTn][(pTr-1)*pS_max + pK_max][(pTc-1)*pS_max + pK_max]; 136 | ap_fixed<16,10> out_buf_0[pTn][pTr][pTc]; 137 | 138 | #pragma HLS ARRAY_PARTITION variable=in_buf_0 complete dim=1 139 | #pragma HLS ARRAY_PARTITION variable=out_buf_0 complete dim=1 140 | 141 | 142 | 143 | for(int r = 0; r < R; r += pTr) 144 | { 145 | for (int c = 0; c < C; c += pTc) 146 | { 147 | TR = ((r * S + (pTr - 1) * S + K) > R_IN ? (R_IN - r * S) : ((pTr - 1) * S + K)); 148 | TC = ((c * S + (pTc - 1) * S + K) > C_IN ? 
(C_IN - c * S) : ((pTc - 1) * S + K)); 149 | for (int n = 0; n < N ; n += pTn) 150 | { 151 | in_buf_load_512(in_buf_0, i_data, i_offset, n, r, c, S, K, P, R, C, N, R_IN, C_IN, TR, TC); 152 | pool_engine(in_buf_0, out_buf_0, S, n, r, c, K, R, C, TR, TC); 153 | output_res_512(out_buf_0, o_data, o_offset, n, r, c, N, R, C, act); 154 | } 155 | } 156 | } 157 | } 158 | }; 159 | #endif 160 | -------------------------------------------------------------------------------- /netGenerator/dse/tm_tn_multiAcc.py: -------------------------------------------------------------------------------- 1 | import helping_functions 2 | import sys 3 | import math 4 | from model_extract import model_extract 5 | from model_split import model_partition_ordered 6 | from model_split import model_split_unordered 7 | from model_split import gop_calculate 8 | from model_split import max_layer_dataout 9 | from model_split import model_split_by_label 10 | from model_split import model_split_by_list 11 | from model_split import model_partition_ordered 12 | # from cluster import clusters_layers_kmeans 13 | from model_partition import partition 14 | from model_partition import partition_to_k 15 | from global_search import global_search 16 | from local_search import single_item_search 17 | from local_search import model_partition_by_gop 18 | from local_search import local_search 19 | from local_search import per_die_config_dse 20 | from local_search import per_die_config_dse_multiAcc 21 | from local_search import per_die_config_dse_multiAcc_flex 22 | from local_search import conv_net_perf 23 | from param_write import generate_param_file 24 | from local_search import flatten 25 | from task_analysis import acc_task_analysis 26 | from task_analysis import subnet_task_analysis 27 | import time 28 | 29 | 30 | def print_line(stage_name): 31 | if stage_name == "line": 32 | print("-" * int(math.ceil((int(80))))) 33 | else: 34 | print("\n") 35 | print("-" * int(math.ceil((int(80) - len(stage_name))/2)), stage_name, "-" * int(math.ceil((int(80) - len(stage_name))/2))) 36 | 37 | def multiAcc_dse(): 38 | # define the network parameter containers 39 | conv_N = [] 40 | conv_M = [] 41 | conv_r = [] 42 | conv_R = [] 43 | conv_K = [] 44 | conv_S = [] 45 | flag = [] 46 | cut_flag = [] 47 | pool_N = [] 48 | 49 | sub_conv_N = [] 50 | sub_conv_M = [] 51 | sub_conv_r = [] 52 | sub_conv_R = [] 53 | sub_conv_K = [] 54 | sub_conv_S = [] 55 | sub_flag = [] 56 | 57 | pair_1 = [] 58 | pair_2 = [] 59 | pair_3 = [] 60 | lat_1 = 0 61 | lat_2 = 0 62 | lat_3 = 0 63 | sub_lat_list = [] 64 | lat_list = [] 65 | 66 | util_1 = 0 67 | util_2 = 0 68 | util_3 = 0 69 | sub_util_list = [] 70 | util_list = [] 71 | 72 | OPs = 0 73 | sub_pair_list = [] 74 | item_list = [] 75 | pair_list = [] 76 | overall_lat = 60551400 77 | layer_list = [] 78 | gop_list = [] 79 | 80 | """ 81 | step 1: extract model from txt file with parameter no_include_fc / include_fc 82 | """ 83 | conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag, pool_N = model_extract('no_include_fc') 84 | # print("Extracted cut flag: ", cut_flag) 85 | # print("Extracted pool flag:", flag) 86 | OPs = gop_calculate(conv_N, conv_M, conv_R, conv_K) 87 | max_layerout = max_layer_dataout(conv_N, conv_M, conv_R, conv_K) 88 | 89 | print_line("Model extract phase") 90 | print("1: ", "Model extracted") 91 | print("1: ", "Overall convolution operation required: ", OPs) 92 | print("1: ", "Max layer output data: ", max_layerout) 93 | # print_line("Model split finish") 94 | 95 | """ 96 | step 2: randomly 
cluster, param k=4, layer label results are in item 97 | """ 98 | print_line("Model partition phase") 99 | for i in range(0, len(conv_N)): 100 | layer_list.append(i) 101 | # kmeans=clusters_layers_kmeans(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, 2) 102 | # print kmeans 103 | partition_location, diff_ratio = model_partition_by_gop(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag) 104 | print("2: layers extracted", conv_N) 105 | print("2: layers cutable ", cut_flag) 106 | print("2: partition location", partition_location) 107 | print("2: diff_ratio: ", diff_ratio) 108 | 109 | sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ 110 | =model_partition_ordered(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, partition_location[0]+1, partition_location[1]+1) 111 | # print "2: Best partition output: ", partition_location, diff_ratio 112 | print("2:", sub_conv_N) 113 | 114 | sub_gop_list = [] 115 | for i in range(0, len(sub_conv_N)): 116 | sub_gop_list.append(gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i], sub_conv_K[i])) 117 | 118 | 119 | print("2: gop of sub_nets", sub_gop_list) 120 | print("2: length of sub_conv_N", len(sub_conv_N[0]), len(sub_conv_N[1]), len(sub_conv_N[2])) 121 | print("2", sub_flag) 122 | print("2: length of sub_flag", len(sub_flag[0]), len(sub_flag[1]), len(sub_flag[2])) 123 | sub_pair_list = [] 124 | sub_lat_list = [] 125 | sub_util_list = [] 126 | 127 | print_line("Best Configuration Search") 128 | overall_start = time.time() 129 | # acc_cluster_num = 3 130 | # pair_list, item_list, gop_list, util_list = global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat) 131 | # pair_list, gop_list, util_list = per_die_config_dse_multiAcc(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, 132 | # sub_conv_S, sub_flag) 133 | pair_list = per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 134 | 135 | overall_end = time.time() 136 | 137 | print_line("DSE Output") 138 | print("Best Configuration Search Results for layer accelerators: ") 139 | for i in range(0, len(pair_list)): 140 | print(pair_list[i]) 141 | 142 | acc_task_list, total_acc_num = acc_task_analysis(pair_list, sub_conv_N, sub_conv_M, sub_conv_r, \ 143 | sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 144 | 145 | print("Accelerator task list: ") 146 | for acc_num in range(0, len(acc_task_list)): 147 | print("acc core", acc_num, " task list: ", acc_task_list[acc_num]) 148 | 149 | print_line("Subnet Task Out") 150 | subnet_task_list = subnet_task_analysis(pair_list, acc_task_list, sub_conv_N, sub_conv_M, sub_conv_r, \ 151 | sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 152 | print("sub net interface list:") 153 | for i in range(0, len(subnet_task_list)): 154 | print(subnet_task_list[i]) 155 | 156 | 157 | print_line("Write out configurations") 158 | print(len(pair_list), "sub-nets are generated") 159 | print(total_acc_num, "accelerators are written into the cofig file") 160 | generate_param_file(pair_list, pool_N, acc_task_list, subnet_task_list, "acc_ins_params.txt") 161 | 162 | print_line("netGen run time system info") 163 | print("Overall time cost:", overall_end - overall_start, "s") 164 | print_line("line") 165 | 166 | 167 | print_line("test") 168 | print(conv_net_perf(sub_conv_N[2], sub_conv_M[2], sub_conv_R[2], sub_conv_S[2], sub_conv_K[2], sub_flag[2], 8, 274, 37, 4, 4)) 169 | 170 | 171 | if __name__ == "__main__": 172 
| conv_N = multiAcc_dse() 173 | -------------------------------------------------------------------------------- /fpga_cnn/src/fc_acc_innerpp.h: -------------------------------------------------------------------------------- 1 | #ifndef _FC_ACC_H_ 2 | #define _FC_ACC_H_ 3 | 4 | #include 5 | #include 6 | #include "config.h" 7 | #include "activation_functions.h" 8 | 9 | #if _C_DEBUG_MODE_ 10 | #include 11 | #endif 12 | 13 | using namespace std; 14 | 15 | template 16 | class fc_acc 17 | { 18 | 19 | private: 20 | int fc_layer_number; 21 | 22 | public: 23 | fc_acc() : fc_layer_number(0) { // construction function with parameter checking 24 | if (iTm < 32 || iTn < 32){ 25 | if(iTm < 32) cout << "FC ACC: iTm is invalid, please check the iTm value to make sure it is >= 32 !!!" << endl; 26 | if(iTn < 32) cout << "FC ACC: iTn is invalid, please check the iTm value to make sure it is >= 32 !!!" << endl; 27 | } else { 28 | cout << "FC ACC: fc_acc is valid!" << endl; 29 | } 30 | } 31 | 32 | ////------------------------------C++ debugging functions---------------------------------------//// 33 | 34 | 35 | ////-----------------------------Accelerator Functions---------------------------------------//// 36 | // Load bias data 37 | void b_buf_load_512(W buf[], Itf *fc_layer_bias, int fc_b_offset, int m, int M) { 38 | Itf data_temp = 0; 39 | for (int i = 0; i < iTm; i+=32){ 40 | #pragma HLS PIPELINE 41 | data_temp = *(fc_layer_bias + fc_b_offset + (m+i)/32); 42 | cout << "index of bias memory : " << fc_b_offset + (m+i)/32 << endl; 43 | for (int wr = 0; wr < 32; wr++){ 44 | #pragma HLS UNROLL 45 | if (i+wr < iTm) { 46 | if (i+wr < M){ 47 | buf[i+wr].range(15,0) = data_temp.range((wr+1)*16-1, wr*16); 48 | // cout << "bias buffer[" << i + wr <<"] = "<< buf[i+wr] << endl; 49 | } else { 50 | buf[i+wr].range(15,0) = 0; 51 | } 52 | } 53 | } 54 | } 55 | } 56 | 57 | // Load input data 58 | void in_buf_load_512(T buf[iTn], 59 | Itf *fc_in_data, 60 | int fc_i_offset, int n, int N) { 61 | Itf data_temp = 0; 62 | for (int i = 0; i < iTn; i+=32) { 63 | #pragma HLS PIPELINE 64 | data_temp = *(fc_in_data + fc_i_offset + (n+i)/32); 65 | // cout << "index of in data memory : " << fc_i_offset + (n+i)/32 << endl; 66 | for (int wr = 0; wr < 32; wr++) { 67 | #pragma HLS UNROLL 68 | if(i + wr < iTn){ 69 | if(i+wr < N ){ 70 | buf[i+wr].range(15,0) = data_temp.range((wr+1)*16-1, wr*16); 71 | // cout << "data_buffer["<= N - iTn) { 132 | for (int i = 0; i < iTm && i < M; i += 32) { // iTm should always greater than 32, otherwise this will not work 133 | for (int wr = 0; wr < 32; wr++){ 134 | #pragma HLS PIPELINE 135 | if(i + wr < M && i + wr < iTm) { 136 | // tmp_outbuf = RELU(buf[i + wr]); 137 | tmp_outbuf = buf[i + wr]; 138 | tmp.range(15, 0) = tmp_outbuf.range(15, 0); 139 | } else { 140 | tmp.range(15,0) = 0; 141 | } 142 | out_tmp.range(16 * (wr + 1) - 1, 16 * (wr)) = tmp.range(15,0); 143 | // cout << "out_buffer[" << wr << "] = " << buf[wr] << endl; 144 | } 145 | *(fc_o_data + fc_o_offset + (m + i)/32) = out_tmp; 146 | // cout << "index of out memory : " << fc_o_offset + ((m+i)/32) << endl; 147 | } 148 | } 149 | } 150 | 151 | 152 | #if _LAYER_MODE_ // layer function with manual double buffer -- worked 153 | void fc_layer_acc_mbuf( 154 | int N, //input feature number 155 | int M, // output feature number 156 | int R_IN, 157 | int C_IN, 158 | int K, 159 | bool act, // activation function bit (1-- with act, 0--without act) 160 | Itf *fc_layer_weights, //w[M][N][K][K] 161 | Itf *fc_layer_bias, // b[M] 162 | int fc_w_offset, 163 | 
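        // note: the *_offset parameters below index 512-bit interface words (Itf elements), not bytes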
int fc_b_offset, 164 | int fc_i_offset, 165 | int fc_o_offset, 166 | Itf *fc_i_data, // in_data[N][(R-1)*S + K][(C-1)*S + K] --> [N][(R-1)*S + K - 2*P][(C-1)*S + K - 2*P] 167 | Itf *fc_o_data) 168 | { // out[M][R][C] 169 | 170 | /***************local data buffer groups******************************/ 171 | T in_buf_0[iTn]; 172 | W w_buf_0[iTn]; 173 | W b_buf_0[iTm]; 174 | G out_buf_0[iTm]; 175 | 176 | // T in_buf_1[iTn]; 177 | // W w_buf_1[iTn]; 178 | // W b_buf_1[iTm]; 179 | // G out_buf_1[iTm]; 180 | 181 | #if _HLS_MODE_ 182 | #pragma HLS ARRAY_PARTITION variable = in_buf_0 complete dim = 1 183 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 1 184 | #pragma HLS ARRAY_PARTITION variable = b_buf_0 complete dim = 1 185 | #pragma HLS ARRAY_PARTITION variable = out_buf_0 complete dim = 1 186 | #endif 187 | 188 | #if _C_DEBUG_MODE_ 189 | #if _KERNEL_DEBUG_ 190 | cout << "Starting fc acc manual double buffer test ...." << endl; 191 | out_buf_reset(out_buf_0); 192 | b_buf_reset(b_buf_0); 193 | w_buf_reset(w_buf_0); 194 | cout << "Local buffer reset finised ..." << endl; 195 | #endif 196 | #endif 197 | 198 | for (int m = 0; m < M; m += iTm) 199 | { 200 | 201 | for (int n = 0; n < N; n += iTn) 202 | { 203 | //--------------------------Load input B W D in ping-pong manner-------------------------// 204 | b_buf_load_512(b_buf_0, fc_layer_bias, fc_b_offset, m, M); 205 | w_buf_load_512(w_buf_0, fc_layer_weights, fc_w_offset, n, N); 206 | in_buf_load_512(in_buf_0, fc_i_data, fc_i_offset, n, N); 207 | //------------------------------compute buffered data -----------------------------------// 208 | fc_engine(in_buf_0, w_buf_0, b_buf_0, out_buf_0, n); 209 | //---------------------------transfer output data----------------------------------------// 210 | output_res_512(out_buf_0, fc_o_data, fc_o_offset, n, m, N, M, act); 211 | // output_res_512(b_buf_0, fc_o_data, fc_o_offset, n, m, N, M, act); 212 | } 213 | } 214 | }; 215 | #endif 216 | }; 217 | 218 | #endif 219 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloud-Dnn 2 | 3 | ## Introduction 4 | 5 | Cloud-DNN is an open-source framework that maps DNN (deep neural network) models trained by Caffe to FPGAs in the cloud for inference acceleration. It takes the input *.prototxt DNN description, generates corresponding C++ network description, and then produces the final hardware accelerator IPs through high-level synthesis. The goal of Cloud-DNN is to provide more flexible and user-friendly DNN acceleration on cloud-FPGAs (e.g., AWS F1). 6 | 7 | ### Hardware settings 8 | - Local cluster 9 | -UltraScale+ VU118 board with PCIe connection 10 | - AWS cluster 11 | -AWS F1.2Xlarge instance 12 | 13 | ### OS settings 14 | - Local cluster 15 | -Ubuntu 16.04 16 | - AWS cluster 17 | -FPGA development system image (Centos 7.6) 18 | 19 | ### Software requirement 20 | - Python 3.5 21 | - gcc g++ 22 | - Xilinx vivado_hls 2018.2 (2018.2.op for AWS) 23 | - Xilinx vivado 2018.2 (2018.2.op for AWS) 24 | - Caffe and the required libraries (including Pycaffe) 25 | - aws-fpga repo 26 | - Drivers 27 | -Local: XDMA driver for UltraScale+ VU118 board 28 | -AWS: AWS shell IP support, EDMA(XDMA for the latest version of AWS shell) driver 29 | 30 | 31 |
### GitHub Repository Structure

```sh
Open-Dnn/
|
|-- LICENSE
|-- README.md
|-- netGenerator
|   |-- paramExtractor
|   |-- dse
|   `-- netGen
|-- scripts
|   |-- compile
|   |-- hls_impl
|   `-- sys_gen
|-- acc_runtime
|   |-- local_acc
|   `-- aws_acc
|-- fpga_cnn
|   |-- src
|   `-- testbench
|-- docs
`-- examples
```
## Brief Manual

### Steps briefing

(Figure: overall generation flow; see docs/flow.png.)
Building an accelerator system for either the local cluster or the AWS cluster requires:

1. DNN description analysis

1. C++ accelerator description generation

1. Accelerator IP generation with vivado_hls

1. Accelerator system configuration

1. Host function construction and compilation

The generation process is identical up to step 4; the differences in the remaining steps are explained with the detailed operations below.


### Build accelerator system

Please follow the steps below with the given alex.prototxt file and a trained alex.caffemodel to build your accelerator system. Make sure your environment is properly set up before starting this manual.

1. Generating the C++ accelerator description. After the repo is downloaded (the *.caffemodel file is not needed yet):
```sh
cd Open-Dnn/netGenerator
./run_generator.sh -i alex.prototxt
```
run_generator.sh automatically extracts, analyzes, and generates the C++ code from the given alex.prototxt file. Since alex.prototxt ships with the repo, you only need to download alex.caffemodel before executing the runtime software.

>**:pushpin: TIPS:**
> - run_generator.sh covers all the steps before accelerator IP generation: parameter extraction, parameter analysis, and C++ code generation. If the process does not work with your input model description, inspect and modify the intermediate files that are copied or moved after each stage in run_generator.sh to generate your own design.
> - The steps in run_generator.sh can also be executed one by one with the scripts mentioned for each step and the corresponding input files.
> - The intermediate files for alex.prototxt are provided in the examples/ folder; copy them to the corresponding locations to run the generation step by step if your system lacks some of the software environment support.
> - The parameter extraction script is sensitive to the formatting of the name/type fields in the prototxt file. The current version only supports values with the first letter capitalized and enclosed in double quotes (e.g., type: "Convolution").

2. Generating the accelerator IPs. After run_generator.sh finishes successfully, the generated project is named gen_proj and located at Open-Dnn/gen_proj.
```sh
cd ../gen_proj/hls_proj
./syn.sh
```
syn.sh generates the 3 sub-net IPs from the C++ code and scripts produced in the previous step. You can also modify the accelerator configurations in acc_instance.h and call the testbench classes to verify the correctness of your changes.

>**:pushpin: TIPS:**
> - For co-simulation, uncomment the iteration in hls_script.tcl. The current hls_script.tcl is simplified for IP generation only.
> - The provided ff_test.cpp includes a simple testbench for the first sub-net function, sub_net_0; modify and uncomment the others if you need to run co-simulation for them. (They will be generated automatically in a future version.)

3. Accelerator system construction. The system construction scripts are provided in the generated project folder gen_proj/impl_proj. Before constructing the accelerator system, make sure the environment is properly set and the sub-net IPs are generated and located as expected.
- Local Cluster
```sh
cd ../impl_proj/local_impl/
# (specify the path of the generated IPs in build_system_local.tcl)
# use the Vivado Tcl console to source build_system_local.tcl (either in the Tcl console or in batch/terminal mode)
```

>**:pushpin: TIPS:**
> - You can also manually build your own accelerator system by taking system_overview_local.pdf in the Open-Dnn/docs/ folder as a reference.
> - Remember to specify the interface latency as 3 for the URAMs in the system.
> - Please be aware of the clocks in the system overview.


- AWS F1

Before starting this step, please make sure the IPI design examples in the aws-fpga repo can be executed correctly. Follow the IPI design flow it provides.
```sh
mkdir ~/aws-fpga/hdk/cl/examples/aws_acc_ipi
cp ../impl_proj/aws_impl/* ~/aws-fpga/hdk/cl/examples/aws_acc_ipi
# (specify the path of the generated IPs in build_system_aws.tcl)
# use Vivado to source build_system_aws.tcl (Tcl console or terminal)
```

>**:pushpin: TIPS:**
> - You can also manually build your own accelerator system by taking system_overview_aws.pdf in the Open-Dnn/docs/ folder as a reference.
> - Remember to specify the interface latency as 3 for the URAMs in the system.
> - Please be aware of the clocks in the system overview.


4. Runtime software compilation (a minimal sketch for the local cluster follows below).
   - Local Cluster

     After the bitstream of the accelerator system is generated and downloaded to the UltraScale+ VU118 board, copy the acc_runtime/local_acc/ folder to your preferred execution path. Copy the config.h file from gen_proj/hls_proj/src/ to the local_acc/ folder. Run the compilation to get the executable file.

   - AWS F1

     After the AGFI of the accelerator system is generated (follow the instructions for AWS F1 AGFI generation) and loaded on the AWS F1 instance (follow the aws-fpga development process), copy the acc_runtime/aws_acc/ folder to your preferred execution path. Copy the config.h file from gen_proj/hls_proj/src/ to the aws_acc/ folder. Run the compilation to get the executable file.
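As a minimal sketch of step 4 on the local cluster, assuming the Makefile conventions used by the demos shipped under acc_runtime/local_acc/ (the execution path here is a placeholder; adapt it to wherever you copied the folder):

```sh
# copy the generated accelerator configuration next to the runtime sources
cp gen_proj/hls_proj/src/config.h /path/to/local_acc/

# build the host runtime; the Makefile links the runtime against the
# acc_ctrl control library under api_lib/
cd /path/to/local_acc
make
```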
### Play With Demos


#### Playing with given demos on local cluster

Please follow the steps below to play with a given demo using the prebuilt bitstream and the runtime software.
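The detailed demo steps did not survive in this section; the following is only a hedged sketch. The demo path and the runtime binary name are taken from the convTest demo under acc_runtime/local_acc/demos/, and the driver step assumes the XDMA driver named in the software requirements (the module path is installation-specific):

```sh
# load the XDMA driver for the VU118 PCIe connection (path depends on your driver install)
sudo insmod xdma.ko

# build and run the demo host program
cd acc_runtime/local_acc/demos/convTest
make
./runtime
```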
#### Playing with given demos on AWS F1

Please follow the steps below to play with a given demo using the prebuilt AGFI and the runtime software.
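Again, only a hedged sketch: fpga-load-local-image and fpga-describe-local-image are the standard aws-fpga SDK tools for slot management, while the AGFI ID, demo folder, and executable name below are placeholders you must substitute with your own:

```sh
# load the accelerator AGFI onto FPGA slot 0 (substitute your AGFI ID)
sudo fpga-load-local-image -S 0 -I agfi-xxxxxxxxxxxxxxxxx

# verify that the image is loaded on the slot
sudo fpga-describe-local-image -S 0

# build and run the demo host program
cd acc_runtime/aws_acc/<demo_folder>
make
sudo ./<demo_executable>
```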
168 | 169 | 170 | 171 | ## Additional Resources 172 | 173 | For more details, please refer to the paper below. 174 | 175 | ```sh 176 | @inproceedings{Chen2019fpga, 177 | author = {Chen, Yao and He, Jiong and Zhang, Xiaofan and Hao, Cong and Chen, Deming}, 178 | title = {Cloud-DNN: An Open Framework for Mapping DNN Models to Cloud FPGAs}, 179 | booktitle = {Proceedings of the 2019 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays}, 180 | series = {FPGA '19}, 181 | year = {2019}, 182 | isbn = {978-1-4503-6137-8}, 183 | location = {Seaside, CA, USA}, 184 | pages = {73--82}, 185 | numpages = {10}, 186 | url = {http://doi.acm.org/10.1145/3289602.3293915}, 187 | doi = {10.1145/3289602.3293915}, 188 | acmid = {3293915}, 189 | publisher = {ACM}, 190 | address = {New York, NY, USA}, 191 | keywords = {cloud computing, dnn accelerator, fpga, high-level synthesis, neural network, reconfiguration} 192 | } 193 | ``` 194 | -------------------------------------------------------------------------------- /netGenerator/dse/task_analysis.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | def acc_task_analysis(pair_list, sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag): 4 | 5 | total_acc_num = 0 6 | for i in range(0, len(pair_list)): 7 | total_acc_num += len(pair_list[i][1]) 8 | 9 | acc_task_list = [] 10 | for acc_num in range(0, total_acc_num): 11 | acc_task_list.append([]) 12 | print("acc_task_list: ", acc_task_list) 13 | 14 | acc_num_counter = 0 15 | for sub_net_number in range(0, len(sub_conv_N)): 16 | print("sub_net_", sub_net_number, " layer_number: ", len(sub_conv_N[sub_net_number])) 17 | print("sub_net_", sub_net_number, " layer_acc_number: ", pair_list[sub_net_number][0][0]) 18 | print("sub_net_cutting_point: ", pair_list[sub_net_number][0][1]) 19 | # for sub_net_acc_core in range(0, pair_list[sub_net_number][0][0]): 20 | print("acc core_", acc_num_counter) 21 | if len(pair_list[sub_net_number][0][1]) == 1 and pair_list[sub_net_number][0][1][0] == -1: 22 | for layer_num in range(0, len(sub_conv_N[sub_net_number])): 23 | local_list = [] 24 | local_list.append(sub_net_number) 25 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 26 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 27 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 28 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 29 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 30 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 31 | local_list.append(sub_flag[sub_net_number][layer_num]) 32 | acc_task_list[acc_num_counter].append(local_list) 33 | acc_num_counter += 1 34 | print("sub_net no cut") 35 | elif len(pair_list[sub_net_number][0][1]) == 1 and pair_list[sub_net_number][0][1][0] == 1: 36 | for layer_num in range(0, pair_list[sub_net_number][0][1][0]): 37 | local_list = [] 38 | local_list.append(sub_net_number) 39 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 40 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 41 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 42 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 43 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 44 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 45 | local_list.append(sub_flag[sub_net_number][layer_num]) 46 | acc_task_list[acc_num_counter].append(local_list) 47 | acc_num_counter += 1 48 | for layer_num in range(pair_list[sub_net_number][0][1][0], 
len(sub_conv_N[sub_net_number])): 49 | local_list = [] 50 | local_list.append(sub_net_number) 51 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 52 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 53 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 54 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 55 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 56 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 57 | local_list.append(sub_flag[sub_net_number][layer_num]) 58 | acc_task_list[acc_num_counter].append(local_list) 59 | acc_num_counter += 1 60 | print("sub net cut into 2") 61 | else: 62 | for layer_num in range(0, pair_list[sub_net_number][0][1][0]): 63 | local_list = [] 64 | local_list.append(sub_net_number) 65 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 66 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 67 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 68 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 69 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 70 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 71 | local_list.append(sub_flag[sub_net_number][layer_num]) 72 | acc_task_list[acc_num_counter].append(local_list) 73 | acc_num_counter += 1 74 | for layer_num in range(pair_list[sub_net_number][0][1][0], pair_list[sub_net_number][0][1][1]): 75 | local_list = [] 76 | local_list.append(sub_net_number) 77 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 78 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 79 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 80 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 81 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 82 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 83 | local_list.append(sub_flag[sub_net_number][layer_num]) 84 | acc_task_list[acc_num_counter].append(local_list) 85 | acc_num_counter += 1 86 | for layer_num in range(pair_list[sub_net_number][0][1][1], len(sub_conv_N[sub_net_number])): 87 | local_list = [] 88 | local_list.append(sub_net_number) 89 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 90 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 91 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 92 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 93 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 94 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 95 | local_list.append(sub_flag[sub_net_number][layer_num]) 96 | acc_task_list[acc_num_counter].append(local_list) 97 | acc_num_counter += 1 98 | print("sub net cut into 3") 99 | 100 | return acc_task_list, total_acc_num 101 | 102 | 103 | def subnet_task_analysis(pair_list, acc_task_list, sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag): 104 | # i pair_list[0][0] 1024 105 | #sub_net, 0, 2, 1024, 4096,2048,2048,2048,2048,2048 106 | #sub_net, 1, 2,1024,4096,2048,2048,2048,2048,2048 107 | #sub_net, 2, 2,1024,4096,2048,2048,2048,2048,2048 108 | subnet_task_list = [[], [], []] 109 | acc_num = [] 110 | param_num = [] 111 | bias = [] 112 | weight_num = [] 113 | data_in_0 = [] 114 | data_out_0 = [] 115 | data_in_1 = [] 116 | data_out_1 = [] 117 | data_in_2 = [] 118 | data_out_2 = [] 119 | 120 | for i in range(0, len(sub_conv_N)): 121 | sub_w_num = 0 122 | sub_b_num = 0 123 | max_i_num = 0 124 | max_o_num = 0 125 | for j in range(0, len(sub_conv_N[i])): 126 | layer_b = sub_conv_M[i][j] 127 | # sub_b_num += 
math.ceil(float(layer_b) / 32) 128 | sub_w_num += math.ceil(float(layer_b) / 32) 129 | layer_w = sub_conv_N[i][j] * sub_conv_M[i][j] * sub_conv_K[i][j] * sub_conv_K[i][j] 130 | sub_w_num += math.ceil(float(layer_w)/32) 131 | data_i = [] 132 | acc_max_i = [] 133 | acc_max_o = [] 134 | for k in range(0, len(acc_task_list)): 135 | for l in range(0, len(acc_task_list[k])): 136 | local_max_i = 0 137 | local_max_o = 0 138 | if acc_task_list[k][l][0] == i: 139 | # print("sub net", i, "acc task list", k, len(acc_task_list[k]), l) 140 | i_size = math.ceil(float(acc_task_list[k][l][1] * \ 141 | acc_task_list[k][l][3] * acc_task_list[k][l][3])/32) 142 | o_size = math.ceil(float(acc_task_list[k][l][2] * \ 143 | acc_task_list[k][l][4] * acc_task_list[k][l][4])/32) 144 | if acc_task_list[k][l][7]: 145 | o_size += math.ceil(float(acc_task_list[k][l][2] * \ 146 | acc_task_list[k][l][4] * acc_task_list[k][l][4])/32/4) 147 | if local_max_i < i_size: 148 | local_max_i = i_size 149 | if local_max_o < o_size: 150 | local_max_o = o_size 151 | if acc_task_list[k][l][0] == i: 152 | data_i.append(local_max_i * 2) #twiced for double buffering 153 | data_i.append(local_max_o * 2) 154 | # print(i, pair_list[i][0][0], 1024, sub_b_num, sub_w_num, data_i) 155 | subnet_task_list[i].append(i) 156 | subnet_task_list[i].append(pair_list[i][0][0]) 157 | subnet_task_list[i].append(1024) 158 | subnet_task_list[i].append(sub_w_num) 159 | subnet_task_list[i].append(data_i) 160 | 161 | return subnet_task_list 162 | 163 | 164 | if __name__ == "__main__": 165 | acc_task_analysis() 166 | 167 | -------------------------------------------------------------------------------- /netGenerator/netGen/generate_consNet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def gen_sub_head(): 4 | 5 | strs = ''' 6 | #ifndef _CONSTRUCT_NET_H_ 7 | #define _CONSTRUCT_NET_H_ 8 | 9 | #include 10 | #include 11 | #include "acc_instance.h" 12 | 13 | using namespace std; 14 | ''' 15 | return strs 16 | 17 | def gen_param_port(parameters): 18 | 19 | param_port_str = ''' 20 | Tparam param_port['''+str(parameters[1]) + '''], 21 | data_type_itf weight_in['''+str(parameters[2]) + '''], 22 | ''' 23 | return param_port_str 24 | 25 | def gen_data_port(parameters): 26 | port_str = ''' ''' 27 | print(parameters[0]) 28 | if str(parameters[0]) == '1': 29 | port_str = ''' 30 | data_type_itf data_in_0['''+str(parameters[3])+'''], 31 | data_type_itf data_out_0['''+str(parameters[4])+'''], 32 | int select ) { 33 | ''' 34 | if str(parameters[0]) == '2': 35 | port_str = ''' 36 | data_type_itf data_in_0['''+str(parameters[3])+'''], 37 | data_type_itf data_out_0['''+str(parameters[4])+'''], 38 | data_type_itf data_in_1['''+str(parameters[4])+'''], 39 | data_type_itf data_out_1['''+str(parameters[6])+'''], 40 | int select ) { 41 | ''' 42 | if str(parameters[0]) == '3': 43 | port_str = ''' 44 | data_type_itf data_in_0['''+str(parameters[3])+'''], 45 | data_type_itf data_out_0['''+str(parameters[4])+'''], 46 | data_type_itf data_in_1['''+str(parameters[4])+'''], 47 | data_type_itf data_out_1['''+str(parameters[6])+'''], 48 | data_type_itf data_in_2['''+str(parameters[6])+'''], 49 | data_type_itf data_out_2['''+str(parameters[8])+'''], 50 | int select ) { 51 | ''' 52 | return port_str 53 | 54 | def gen_param_pragma(parameters): 55 | 56 | param_pragma = ''' 57 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 58 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 59 | 60 | #pragma HLS INTERFACE 
s_axilite port=param_port bundle=CRTL_BUS 61 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth='''+str(parameters[1])+''' bundle=PARAM_IN 62 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 63 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave depth='''+str(parameters[2])+''' bundle=BIAS_IN 64 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 65 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth='''+str(parameters[2])+''' bundle=WEIGHT_IN 66 | ''' 67 | return param_pragma 68 | 69 | def gen_data_pragma(parameters): 70 | 71 | data_pragma = ''' 72 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 73 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth='''+str(parameters[3])+''' bundle=DATA_IN 74 | ''' 75 | if str(parameters[0]) == '1': 76 | data_pragma += ''' 77 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 78 | ''' 79 | if str(parameters[0]) == '2': 80 | data_pragma += ''' 81 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 82 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 83 | #pragma HLS INTERFACE bram port=data_out_1 84 | ''' 85 | if str(parameters[0]) == '3': 86 | data_pragma += ''' 87 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 88 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 89 | #pragma HLS INTERFACE ap_memory port=data_out_1 latency=3 90 | #pragma HLS INTERFACE ap_memory port=data_in_2 latency=3 91 | #pragma HLS INTERFACE bram port=data_out_2 92 | ''' 93 | return data_pragma 94 | 95 | def gen_offset(idx, parameters): 96 | 97 | offset_str = ''' ''' 98 | if str(parameters[0]) == '1': 99 | offset_str = ''' 100 | int acc0_mem_inport_offset = 0; 101 | int acc0_mem_outport_offset = 0; 102 | 103 | if (select == 0) 104 | { 105 | acc0_mem_inport_offset = 0; 106 | acc0_mem_outport_offset = 0; 107 | } 108 | else 109 | { 110 | acc0_mem_inport_offset = 0; 111 | acc0_mem_outport_offset = 0; 112 | } 113 | ''' 114 | if str(parameters[0]) == '2': 115 | offset_str = ''' 116 | int acc0_mem_inport_offset = 0; 117 | int acc0_mem_outport_offset = 0; 118 | int acc1_mem_inport_offset = 0; 119 | int acc1_mem_outport_offset = 0; 120 | 121 | if (select == 0) 122 | { 123 | acc0_mem_inport_offset = 0; 124 | acc0_mem_outport_offset = 0; 125 | acc1_mem_inport_offset = '''+str(int(int(parameters[4])/2))+ '''; 126 | acc1_mem_outport_offset = '''+str(int(int(parameters[6])/2))+'''; 127 | } 128 | else 129 | { 130 | acc0_mem_inport_offset = '''+str(int(int(parameters[3])/2))+'''; 131 | acc0_mem_outport_offset = '''+str(int(int(parameters[4])/2))+ '''; 132 | acc1_mem_inport_offset = 0; 133 | acc1_mem_outport_offset = 0; 134 | } 135 | ''' 136 | if str(parameters[0]) == '3': 137 | offset_str = ''' 138 | int acc0_mem_inport_offset = 0; 139 | int acc0_mem_outport_offset = 0; 140 | int acc1_mem_inport_offset = 0; 141 | int acc1_mem_outport_offset = 0; 142 | int acc2_mem_inport_offset = 0; 143 | int acc2_mem_outport_offset = 0; 144 | 145 | if (select == 0) 146 | { 147 | acc0_mem_inport_offset = 0; 148 | acc0_mem_outport_offset = 0; 149 | acc1_mem_inport_offset = ''' + str(int(int(parameters[4]) / 2)) + '''; 150 | acc1_mem_outport_offset = ''' + str(int(int(parameters[6]) / 2)) + '''; 151 | acc2_mem_inport_offset = 0; 152 | acc2_mem_outport_offset = 0; 153 | } 154 | else 155 | { 156 | acc0_mem_inport_offset = ''' + str(int(int(parameters[3]) / 2)) + '''; 157 | acc0_mem_outport_offset = ''' + str(int(int(parameters[4]) / 2)) + '''; 158 | acc1_mem_inport_offset = 0; 159 | 
acc1_mem_outport_offset = 0; 160 | acc2_mem_inport_offset = ''' + str(int(int(parameters[6]) / 2)) + '''; 161 | acc2_mem_outport_offset = ''' + str(int(int(parameters[8]) / 2)) + '''; 162 | } 163 | ''' 164 | 165 | return offset_str 166 | 167 | def gen_convpool_func(parameters): 168 | 169 | func_bd = ''' ''' 170 | for i in range(0, int(parameters[0])): 171 | func_bd += ''' 172 | conv_pool_acc_'''+str(i)+'''(param_port + '''+str(i*512)+''', 173 | //bias_in, 174 | weight_in, 175 | data_in_'''+str(i)+''' + acc'''+str(i)+'''_mem_inport_offset, 176 | data_out_'''+str(i)+''' +acc'''+str(i)+'''_mem_outport_offset); 177 | ''' 178 | return func_bd 179 | 180 | def gen_subnet_func(idx, parameters): 181 | 182 | print(idx) 183 | param_port = gen_param_port(parameters) 184 | data_port = gen_data_port(parameters) 185 | param_pragma = gen_param_pragma(parameters) 186 | data_pragma = gen_data_pragma(parameters) 187 | offset_str = gen_offset(idx, parameters) 188 | convpool_func_bd = gen_convpool_func(parameters) 189 | 190 | subnet_func_bd = '''void sub_net_''' + str(idx) + '''( ''' \ 191 | + param_port \ 192 | + data_port \ 193 | + param_pragma \ 194 | + data_pragma \ 195 | + offset_str \ 196 | + convpool_func_bd \ 197 | + ''' };''' 198 | 199 | return subnet_func_bd 200 | 201 | def construct_function(idx, parameters): 202 | 203 | func_body = ''' ''' 204 | 205 | ''' 206 | data_type_itf data_in_0[2048], 207 | data_type_itf data_out_0[2048], 208 | data_type_itf data_in_1[2048], 209 | data_type_itf data_out_1[2048], 210 | int select 211 | ){ 212 | 213 | ''' 214 | return func_body 215 | 216 | 217 | def load_parameter(filename): 218 | lists = [] 219 | with open(filename) as f: 220 | while 1: 221 | line = f.readline() 222 | if not line: 223 | break 224 | lists.append(line.strip().split(",")) 225 | 226 | ps_list = {} 227 | print("loaded parameters") 228 | for l in lists: 229 | if l[0] not in ps_list: 230 | ps_list[l[0]] = [] 231 | ps_list[l[0]].append(l[1:]) 232 | 233 | print(ps_list) 234 | 235 | return ps_list 236 | 237 | 238 | def generate_consnet(ps_file, store_file): 239 | ps = load_parameter(ps_file) 240 | keys = ["sub_net_0", "sub_net_1", "sub_net_2"] 241 | print(keys) 242 | with open(store_file, "w") as wf: 243 | sub_head = gen_sub_head() 244 | wf.write(sub_head + "\n") 245 | sub_net_counter = 0 246 | for key in keys: 247 | if key not in ps: 248 | continue 249 | lists = ps[key] 250 | print("sub net counter:", sub_net_counter) 251 | print("lists", lists[0], "list len(): ", len(lists)) 252 | # for i in range(len(lists)): 253 | func = gen_subnet_func(sub_net_counter, lists[0]) 254 | wf.write(func + "\n\n") 255 | sub_net_counter += 1 256 | wf.write("#endif\n") 257 | print("ok") 258 | 259 | 260 | if __name__ == "__main__": 261 | parser = argparse.ArgumentParser() 262 | parser.add_argument("--params", help="accelerator param file") 263 | args = parser.parse_args() 264 | generate_consnet(args.params, "construct_net.h") 265 | -------------------------------------------------------------------------------- /fpga_cnn/src/acc_instance.h: -------------------------------------------------------------------------------- 1 | #ifndef _ACC_INSTANCE_H_ 2 | #define _ACC_INSTANCE_H_ 3 | #include "config.h" 4 | #include "conv_acc_2ibuf.h" 5 | #include "fc_acc_innerpp.h" 6 | #include "max_pool_acc_innerpp.h" 7 | 8 | using namespace std; 9 | conv_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 4, 8, 8, 5, 5, 32, 32, 32> convAcc0; 10 | conv_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 4, 
8, 8, 5, 5, 32, 32, 32> convAcc1; 11 | // Tm, Tn,Tr,Tc,S_max,K_max, int IBUF_t, int WBUF_t, int OBUF_t> 12 | 13 | void conv_layer_acc_0( 14 | Tparam N, 15 | Tparam K, 16 | Tparam M, 17 | Tparam R_IN, 18 | Tparam C_IN, 19 | Tparam C_OUT, 20 | Tparam R_OUT, 21 | Tparam S, 22 | Tparam P, 23 | Tparam act, 24 | Tparam inport, 25 | Tparam weight_offset, 26 | Tparam bias_offset, 27 | Tparam in_offset, 28 | Tparam out_offset, 29 | ap_fixed<32,26>* layer_bias, 30 | data_type_itf* i_weight, 31 | data_type_itf* i_data, 32 | data_type_itf* out_data 33 | ) 34 | { 35 | #pragma HLS INTERFACE m_axi port=i_data 36 | //#pragma HLS INTERFACE bram port=out_data 37 | convAcc0.conv_layer_acc_2ibuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset,layer_bias,i_weight,i_data,out_data); 38 | } 39 | 40 | 41 | 42 | void conv_layer_acc_1( 43 | Tparam N, 44 | Tparam K, 45 | Tparam M, 46 | Tparam R_IN, 47 | Tparam C_IN, 48 | Tparam C_OUT, 49 | Tparam R_OUT, 50 | Tparam S, 51 | Tparam P, 52 | Tparam act, 53 | Tparam inport, 54 | Tparam weight_offset, 55 | Tparam bias_offset, 56 | Tparam in_offset, 57 | Tparam out_offset, 58 | ap_fixed<32,26>* layer_bias, 59 | data_type_itf* i_weight, 60 | data_type_itf* i_data, 61 | data_type_itf* out_data 62 | ) 63 | { 64 | //#pragma HLS INTERFACE bram port=i_data 65 | #pragma HLS INTERFACE bram port=out_data 66 | convAcc1.conv_layer_acc_2ibuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset,layer_bias,i_weight,i_data,out_data); 67 | } 68 | 69 | max_pool_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3> maxPoolAcc0; 70 | 71 | void max_pool_layer_acc_0( 72 | Tparam R_in, 73 | Tparam C_in, 74 | Tparam N, 75 | Tparam K, 76 | Tparam R, 77 | Tparam C, 78 | Tparam S, 79 | Tparam P, 80 | Tparam act, 81 | Tparam i_offset, 82 | Tparam o_offset, 83 | data_type_itf* i_data, 84 | data_type_itf* o_data){ 85 | maxPoolAcc0.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 86 | }; 87 | 88 | max_pool_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3> maxPoolAcc1; 89 | 90 | void max_pool_layer_acc_1( 91 | Tparam R_in, 92 | Tparam C_in, 93 | Tparam N, 94 | Tparam K, 95 | Tparam R, 96 | Tparam C, 97 | Tparam S, 98 | Tparam P, 99 | Tparam act, 100 | Tparam i_offset, 101 | Tparam o_offset, 102 | data_type_itf* i_data, 103 | data_type_itf* o_data){ 104 | maxPoolAcc1.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 105 | }; 106 | 107 | fc_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 32, 5, 5> fcAcc0; 108 | 109 | void fc_layer_acc_0( 110 | Tparam N, 111 | Tparam K, 112 | Tparam M, 113 | Tparam R_IN, 114 | Tparam C_IN, 115 | Tparam C_OUT, 116 | Tparam R_OUT, 117 | Tparam S, 118 | Tparam P, 119 | Tparam act, 120 | Tparam weight_offset, 121 | Tparam bias_offset, 122 | Tparam in_offset, 123 | Tparam out_offset, 124 | data_type_itf* layer_bias, 125 | data_type_itf* i_weight, 126 | data_type_itf* i_data, 127 | data_type_itf* out_data 128 | ){ 129 | fcAcc0.fc_layer_acc_mbuf(N, M, R_IN, C_IN, K, act, 130 | i_weight, layer_bias, 131 | weight_offset, bias_offset, in_offset, out_offset, 132 | i_data, out_data); 133 | }; 134 | 135 | 136 | 137 | void conv_pool_acc_0( 138 | Tparam* param_port, 139 | ap_fixed<32,26>* bias_in, 140 | data_type_itf* weight_in, 141 | data_type_itf* data_in, 142 | data_type_itf* data_out 143 | ) 144 | { 145 | Tparam 
layer_num_local[16]; 146 | Tparam param_conv_local[16]; 147 | Tparam param_pool_local[16]; 148 | 149 | for (unsigned int ll = 0; ll < 16; ll++) 150 | { 151 | #pragma HLS PIPELINE 152 | layer_num_local[ll] = param_port[ll]; 153 | } 154 | 155 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 156 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 212 | void conv_pool_acc_1( 213 | Tparam* param_port, 214 | ap_fixed<32,26>* bias_in, 215 | data_type_itf* weight_in, 216 | data_type_itf* data_in, 217 | data_type_itf* data_out 218 | ) 219 | { 220 | Tparam layer_num_local[16]; 221 | Tparam param_conv_local[16]; 222 | Tparam param_pool_local[16]; 223 | 224 | for (unsigned int ll = 0; ll < 16; ll++) 225 | { 226 | #pragma HLS PIPELINE 227 | layer_num_local[ll] = param_port[ll]; 228 | } 229 | 230 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 231 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 96 | for top_key, top_value in tops.items(): 97 | if len(top_value) > 1: 98 | top_flag.append(0) 99 | print(top_key, top_value) 100 | else: 101 | top_flag.append(1) 102 | bot_num = 0 103 | print("traverse through bottoms----------------------------------") 104 | for bot_key, bot_value in bots.items(): 105 | if len(bot_value) > 1: 106 | bot_flag.append(0) 107 | print(bot_key, bot_value) 108 | else: 109 | bot_flag.append(1) 110 | bot_num = bot_num + 1 111 | 112 | print("top_flag:", top_flag) 113 | print("bot_flag:", bot_flag) 114 | unit_flag = 1 115 | for i in range(0, bot_num): 116 | if top_flag[i] == 0 and bot_flag[i] == 1: 117 | unit_flag = 0 118 | elif top_flag[i] == 1 and bot_flag[i] == 0: 119 | unit_flag = 1 120 | else: 121 | unit_flag = unit_flag 122 | print(unit_flag, end=' ') 123 | cutable.append(unit_flag) 124 | print() 125 | print("cutable flag:", cutable) 126 | 127 | print("final cutable points: ", cutable) 128 | layer_num = 0 129 | param_num = 0 130 | 131 | for layer in net.layers: 132 | # print(layer.type) 133 | if layer.type == 'Convolution' or layer.type == 'Pooling' or layer.type == 'Concat' or layer.type == 'InnerProduct': 134 | print(layer_param_list[param_num], layer.type, cutable[layer_num]) 135 | # f.write(layer.type + " ") 136 | nn_layer_type.append(layer.type) 137 | if layer.type == 'Convolution': 138 | nn_in_data_size_conv.append(layer_param_list[param_num][1]) 139 | nn_conv_cutable.append(cutable[layer_num]) 140 | if layer.type == 'Pooling': 141 | nn_in_data_size_pooling.append(layer_param_list[param_num][1]) 142 | nn_pool_cutable.append(cutable[layer_num]) 143 | if layer.type == 'Concat' or layer.type == 'InnerProduct': 144 | nn_fc_cutable.append(cutable[layer_num]) 145 | 146 | if layer.type == 'Convolution' or layer.type == 'Pooling' or layer.type == 'Concat' or layer.type == 'InnerProduct' or layer.type == 'LRN': 147 | param_num = param_num + 1 148 | layer_num = layer_num + 1 149 | 150 | # for x, y in layer_dic.items(): 151 | # print(x, y) 152 | layer_count = 0 153 | temp_layer_list = [] 154 | 155 | # for layer in parsible_net.layer: 156 | # temp_layer_list.append(layer) 157 | 158 | 159 | count = 0 160 | for layer in parsible_net.layer: 161 | if layer.type == "Convolution": 162 | kernel = layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1 163 | stride = layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1 164 | pad = layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0 165 | print(layer.name) 166 | tmp_count = 0 167 | tmp_dim_list = [] 168 | for layer_name, dim in net.blobs.items(): 169 |
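# (Note on the loop below, inferred from the visible code: net.blobs is walked
# in network order and each blob's channel count is appended, so when
# layer_name finally matches the current layer, tmp_dim_list[-2] holds the
# channel count of the previous blob -- that is, this conv layer's input
# channel number.)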
tmp_dim_list.append(dim.data.shape[1]) 170 | tmp_count = tmp_count + 1 171 | if tmp_count > 1: 172 | if layer_name == layer.name: 173 | inchannel = tmp_dim_list[-2] 174 | outchannel = layer.convolution_param.num_output 175 | group = layer.convolution_param.group 176 | nn_channel_size_conv.append(kernel) 177 | nn_stride_conv.append(stride) 178 | nn_padding_conv.append(pad) 179 | nn_in_number_conv.append(inchannel) 180 | nn_out_number_conv.append(outchannel) 181 | nn_bias_conv.append(outchannel) 182 | nn_group_conv.append(group) 183 | if layer.type == "Pooling": 184 | kernel = layer.pooling_param.kernel_size 185 | stride = layer.pooling_param.stride 186 | pad = layer.pooling_param.pad 187 | tmp_count = 0 188 | tmp_dim_list = [] 189 | print(layer.name) 190 | for layer_name, dim in net.blobs.items(): 191 | tmp_dim_list.append(dim.data.shape[1]) 192 | tmp_count = tmp_count + 1 193 | if tmp_count > 1: 194 | # print("previous layer info: ", layer_name, dim.data.shape[1], tmp_dim_list[-2]) 195 | if layer_name == layer.name: 196 | in_num = tmp_dim_list[-2] 197 | nn_channel_size_pooling.append(kernel) 198 | nn_stride_pooling.append(stride) 199 | nn_padding_pooling.append(pad) 200 | nn_in_number_pooling.append(in_num) 201 | if layer.type == "InnerProduct": 202 | output = layer.inner_product_param.num_output 203 | tmp_count = 0 204 | tmp_dim_list = [] 205 | print(layer.name) 206 | for layer_name, dim in net.blobs.items(): 207 | tmp_dim_list.append(dim.data.shape) 208 | tmp_count = tmp_count + 1 209 | # print("--------------------") 210 | # print(tmp_dim_list) 211 | if tmp_count > 1: 212 | # print("previous layer info: ", layer_name, dim.data.shape) 213 | if layer_name == layer.name: 214 | if len(tmp_dim_list[-2]) == 2: 215 | in_num = tmp_dim_list[-2][-1] 216 | if len(tmp_dim_list[-2]) == 4: 217 | in_num = tmp_dim_list[-2][-1] * tmp_dim_list[-2][-2] * tmp_dim_list[-2][-3] 218 | #TODO: here is an error with fc in data size, need to be modified 219 | nn_channel_size_fc.append(1) 220 | #nn_in_data_size_fc.append(1) 221 | nn_out_number_fc.append(output) 222 | nn_in_number_fc.append(in_num) 223 | 224 | count = count + 1 225 | 226 | print("Start writing param to file") 227 | 228 | # Writing the extracted params to an intermediate file 229 | write_param_inline("Network Structure: ", nn_layer_type, storefile) 230 | write_param_inline("nn_in_data_size_conv: ", nn_in_data_size_conv, storefile) 231 | write_param_inline("nn_channel_size_conv: ", nn_channel_size_conv, storefile) 232 | write_param_inline("nn_padding_conv: ", nn_padding_conv, storefile) 233 | write_param_inline("nn_stride_conv: ", nn_stride_conv, storefile) 234 | write_param_inline("nn_in_number_conv: ", nn_in_number_conv, storefile) 235 | write_param_inline("nn_out_number_conv: ", nn_out_number_conv, storefile) 236 | write_param_inline("nn_group_conv: ", nn_group_conv, storefile) 237 | write_param_inline("nn_bias_conv: ", nn_bias_conv, storefile) 238 | write_param_inline("nn_in_data_size_pooling: ", nn_in_data_size_pooling, storefile) 239 | write_param_inline("nn_channel_size_pooling: ", nn_channel_size_pooling, storefile) 240 | write_param_inline("nn_padding_pooling: ", nn_padding_pooling, storefile) 241 | write_param_inline("nn_stride_pooling: ", nn_stride_pooling, storefile) 242 | write_param_inline("nn_in_number_pooling: ", nn_in_number_pooling, storefile) 243 | write_param_inline("nn_in_data_size_fc: ", nn_in_data_size_fc, storefile) 244 | write_param_inline("nn_in_number_fc: ", nn_in_number_fc, storefile) 245 | 
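# (Format sketch, assuming write_param_inline emits one "name: v1 v2 ..." line
# per call: for AlexNet the generated net_config_params.txt would contain lines
# such as
#     nn_channel_size_conv: 11 5 3 3 3
#     nn_stride_conv: 4 1 1 1 1
# which read_params()/params_values() in dse/helping_functions.py split back
# apart on ':' and on whitespace.)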
write_param_inline("nn_out_number_fc: ", nn_out_number_fc, storefile) 246 | write_param_inline("nn_channel_size_fc: ", nn_channel_size_fc, storefile) 247 | write_param_inline("conv_cut_flag: ", nn_conv_cutable, storefile) 248 | write_param_inline("pool_cut_flag: ", nn_pool_cutable, storefile) 249 | write_param_inline("fc_cut_flag: ", nn_fc_cutable, storefile) 250 | 251 | 252 | 253 | if __name__ == '__main__': 254 | parser = argparse.ArgumentParser() 255 | parser.add_argument("--model", help="model prototxt path .prototxt") 256 | parser.add_argument("--weights", help="caffe model weights path .caffemodel") 257 | parser.add_argument("--output", help="output path") 258 | args = parser.parse_args() 259 | extract_caffe_model(args.model, args.weights, args.output, "net_config_params.txt") 260 | # gen_net_config_params("net_config_params.txt") 261 | -------------------------------------------------------------------------------- /netGenerator/dse/helping_functions.py: -------------------------------------------------------------------------------- 1 | EOL = "\n" 2 | SEPARATER = " " 3 | SPACE = " " 4 | PARAMETER_BEGIN = "(" 5 | PARAMETER_END = ")" 6 | BODY_BEGIN = "{" 7 | BODY_END = "}" 8 | ARRAY_BEGIN = "[" 9 | ARRAY_END = "]" 10 | CLASS_BEGIN = "<" 11 | CLASS_END = ">" 12 | COMMA = "," 13 | COMMA_SPACE = ", " 14 | EOS = ";" 15 | CALL_SYMBOL = "." 16 | FOR = "for" 17 | EQUAL = " = " 18 | INCREMENT = "++" 19 | LESS = " < " 20 | 21 | 22 | def read_params(file_name): 23 | arr = [] 24 | with open(file_name) as f: 25 | lines = f.readlines() 26 | for line in lines: 27 | l = line.strip().split(':') 28 | arr.extend(l) 29 | return arr 30 | 31 | 32 | def prompt(s): 33 | var = raw_input(s) 34 | return var 35 | 36 | 37 | def generate_for_loop(counter, counter_type, begin, end, for_body, lc, inc, prefix=SEPARATER): 38 | 39 | for_l = FOR + SPACE + PARAMETER_BEGIN + counter_type + SPACE + counter + EQUAL + str(begin) +\ 40 | EOS + SPACE + counter + LESS + str(end) + EOS + SPACE + counter 41 | if inc == 1: 42 | for_l += INCREMENT 43 | else: 44 | for_l += " += " + str(inc) 45 | 46 | for_l += PARAMETER_END + SPACE + BODY_BEGIN + EOL 47 | 48 | for b in for_body: 49 | for_l += prefix*(lc+1) + b + EOL 50 | 51 | for_l += prefix*lc + BODY_END + EOL 52 | 53 | return for_l 54 | 55 | 56 | def generate_for_loop1(counter, counter_type, begin, end, for_body, prefix=SEPARATER): 57 | 58 | for_l = FOR + SPACE + PARAMETER_BEGIN + counter_type + SPACE + counter + EQUAL + str(begin) +\ 59 | EOS + SPACE + counter + LESS + str(end) + EOS + SPACE + counter + INCREMENT 60 | for_l += PARAMETER_END + SPACE + BODY_BEGIN + prefix 61 | for_l += for_body 62 | for_l += prefix + BODY_END + EOL 63 | return for_l 64 | 65 | 66 | def generate_while(cond, body, k, prefix=SEPARATER): 67 | w_str = "while " 68 | w_str += PARAMETER_BEGIN + cond + PARAMETER_END + SPACE + BODY_BEGIN + EOL 69 | for b in body: 70 | w_str += prefix*(k+1) + b + EOL 71 | w_str += prefix*k + BODY_END + EOL*2 72 | return w_str 73 | 74 | 75 | def generate_if(condition, body, else_body, k, prefix=SEPARATER): 76 | if_str = "if " 77 | if_str += PARAMETER_BEGIN + condition + PARAMETER_END + SPACE + BODY_BEGIN +\ 78 | EOL 79 | for b in body: 80 | if_str += prefix*(k+1) + b + EOL 81 | if_str += prefix*k + BODY_END + EOL 82 | 83 | if else_body != "": 84 | if_str += prefix*k + "else " + BODY_BEGIN + EOL 85 | for e_b in else_body: 86 | if_str += prefix*(k+1) +\ 87 | e_b + EOL 88 | if_str += prefix*k + BODY_END + EOL 89 | 90 | return if_str 91 | 92 | 93 | def params_values(s, arr2): 94 | 
if s in arr2: 95 | data = arr2[arr2.index(s)+1] 96 | values = data.split() 97 | return values 98 | else: 99 | return 0 100 | 101 | 102 | def extraction(arr): 103 | arr1 = [] 104 | arr2 = [] 105 | layers_order = params_values("Network Structure", arr) 106 | nn_in_data_size_conv_values = params_values("nn_in_data_size_conv", arr) 107 | nn_in_number_conv_values = params_values("nn_in_number_conv", arr) 108 | nn_group_conv_values = params_values("nn_group_conv", arr) 109 | nn_channel_size_conv_values = params_values("nn_channel_size_conv", arr) 110 | nn_out_number_conv_values = params_values("nn_out_number_conv", arr) 111 | nn_padding_conv_values = params_values("nn_padding_conv", arr) 112 | nn_stride_conv_values = params_values("nn_stride_conv", arr) 113 | nn_bias_conv_values = params_values("nn_bias_conv", arr) 114 | 115 | nn_in_data_size_pooling_values = params_values("nn_in_data_size_pooling", arr) 116 | nn_channel_size_pooling_values = params_values("nn_channel_size_pooling", arr) 117 | nn_padding_pooling_values = params_values("nn_padding_pooling", arr) 118 | nn_stride_pooling_values = params_values("nn_stride_pooling", arr) 119 | nn_in_number_pooling_values = params_values("nn_in_number_pooling", arr) 120 | 121 | nn_in_number_fc_values = params_values("nn_in_number_fc", arr) 122 | nn_in_data_size_fc_values = params_values("nn_in_data_size_fc", arr) 123 | nn_channel_size_fc_values = params_values("nn_channel_size_fc", arr) 124 | nn_out_number_fc_values = params_values("nn_out_number_fc", arr) 125 | 126 | nn_local_size_lrn_values = params_values("nn_local_size_lrn", arr) 127 | 128 | nn_in_number_batch_norm_values = params_values("nn_in_number_batch_norm", arr) 129 | nn_in_number_scale_values = params_values("nn_in_number_scale", arr) 130 | 131 | nn_in_number_eltwise_values = params_values("nn_in_number_eltwise", arr) 132 | nn_input_size_eltwise_values = params_values("nn_input_size_eltwise", arr) 133 | 134 | nn_in_number_concat_values = params_values("nn_in_number_concat", arr) 135 | nn_input_size_concat_values = params_values("nn_input_size_concat", arr) 136 | 137 | conv_cut_flag_values = params_values("conv_cut_flag", arr) 138 | pool_cut_flag_values = params_values("pool_cut_flag", arr) 139 | fc_cut_flag_values = params_values("fc_cut_flag", arr) 140 | 141 | arr1.append(layers_order) 142 | arr2.append("layers_order") 143 | 144 | n = len(nn_in_data_size_conv_values) 145 | if nn_in_number_fc_values != 0: 146 | n = n + len(nn_in_number_fc_values) 147 | arr1.append(str(n)) 148 | arr2.append("n") 149 | 150 | arr1.append(nn_in_number_conv_values) 151 | arr2.append("nn_in_number_conv") 152 | arr1.append(nn_in_data_size_conv_values) 153 | arr2.append("nn_in_data_size_conv") 154 | arr1.append(nn_channel_size_conv_values) 155 | arr2.append("nn_channel_size_conv") 156 | arr1.append(nn_padding_conv_values) 157 | arr2.append("nn_padding_conv") 158 | arr1.append(nn_stride_conv_values) 159 | arr2.append("nn_stride_conv") 160 | arr1.append(nn_out_number_conv_values) 161 | arr2.append("nn_out_number_conv") 162 | arr1.append(nn_group_conv_values) 163 | arr2.append("nn_group_conv") 164 | arr1.append(nn_local_size_lrn_values) 165 | arr2.append("nn_local_size_lrn") 166 | arr1.append(nn_in_data_size_pooling_values) 167 | arr2.append("nn_in_data_size_pooling") 168 | arr1.append(nn_channel_size_pooling_values) 169 | arr2.append("nn_channel_size_pooling") 170 | arr1.append(nn_padding_pooling_values) 171 | arr2.append("nn_padding_pooling") 172 | arr1.append(nn_stride_pooling_values) 173 | 
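# (arr1 and arr2 are parallel lists built in lockstep: arr1 holds each
# parameter's values and arr2 the matching name, so index i in one corresponds
# to the same parameter in the other when the pair is consumed downstream.)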
arr2.append("nn_stride_pooling") 174 | arr1.append(nn_in_number_pooling_values) 175 | arr2.append("nn_in_number_pooling") 176 | arr1.append(nn_in_number_fc_values) 177 | arr2.append("nn_in_number_fc") 178 | arr1.append(nn_in_data_size_fc_values) 179 | arr2.append("nn_in_data_size_fc") 180 | arr1.append(nn_channel_size_fc_values) 181 | arr2.append("nn_channel_size_fc") 182 | arr1.append(nn_out_number_fc_values) 183 | arr2.append("nn_out_number_fc") 184 | arr1.append(nn_in_number_batch_norm_values) 185 | arr2.append("nn_in_number_batch_norm") 186 | arr1.append(nn_in_number_scale_values) 187 | arr2.append("nn_in_number_scale") 188 | arr1.append(nn_in_number_eltwise_values) 189 | arr2.append("nn_in_number_eltwise") 190 | arr1.append(nn_input_size_eltwise_values) 191 | arr2.append("nn_input_size_eltwise") 192 | arr1.append(nn_in_number_concat_values) 193 | arr2.append("nn_in_number_concat") 194 | arr1.append(nn_input_size_concat_values) 195 | arr2.append("nn_input_size_concat") 196 | arr1.append(conv_cut_flag_values) 197 | arr2.append("conv_cut_flag") 198 | arr1.append(pool_cut_flag_values) 199 | arr2.append("pool_cut_flag") 200 | arr1.append(fc_cut_flag_values) 201 | arr2.append("fc_cut_flag") 202 | 203 | val = str(int(nn_in_number_conv_values[0])) + " * " + \ 204 | str(int(nn_in_data_size_conv_values[0])) + " * " + \ 205 | str(int(nn_in_data_size_conv_values[0])) 206 | arr1.append(val) 207 | arr2.append("in_data_mem_size") 208 | 209 | val = "" 210 | for i in range(len(nn_in_number_conv_values)): 211 | val += str(int(nn_in_number_conv_values[i]) * int(nn_out_number_conv_values[i]) /\ 212 | int(nn_group_conv_values[i])*int(nn_channel_size_conv_values[i])*\ 213 | int(nn_channel_size_conv_values[i])) 214 | if (i+1) != len(nn_in_number_conv_values): 215 | val += " + " 216 | arr1.append(val) 217 | arr2.append("conv_weight_size") 218 | 219 | val = "" 220 | if nn_bias_conv_values != 0: 221 | for i, v in enumerate(nn_bias_conv_values): 222 | val += v 223 | if (i+1) != len(nn_bias_conv_values): 224 | val += " + " 225 | arr1.append(val) 226 | arr2.append("conv_bias_size") 227 | 228 | val = "" 229 | if nn_in_number_fc_values != 0: 230 | for j in range(len(nn_in_number_fc_values)): 231 | val += str(int(nn_in_number_fc_values[j])*int(nn_out_number_fc_values[j])*int(nn_channel_size_fc_values[j])*int(nn_channel_size_fc_values[j])) 232 | if (j+1) != len(nn_in_number_fc_values): 233 | val += " + " 234 | arr1.append(val) 235 | arr2.append("fc_weight_size") 236 | 237 | val = "" 238 | if nn_out_number_fc_values != 0: 239 | for i, out in enumerate(nn_out_number_fc_values): 240 | val += out 241 | if (i+1) != len(nn_out_number_fc_values): 242 | val += " + " 243 | arr1.append(val) 244 | arr2.append("fc_bias_size") 245 | 246 | val = "" 247 | if nn_out_number_fc_values != 0: 248 | arr1.append(str(nn_out_number_fc_values[len(nn_out_number_fc_values)-1])) 249 | arr2.append("fc_out_size") 250 | else: 251 | arr1.append(str(nn_in_number_pooling_values[len(nn_in_number_pooling_values)-1])) 252 | arr2.append("out_size") 253 | 254 | val = "" 255 | if nn_in_number_batch_norm_values != 0: 256 | for i, v in enumerate(nn_in_number_batch_norm_values): 257 | val += v 258 | if (i+1) != len(nn_in_number_batch_norm_values): 259 | val += " + " 260 | arr1.append(val) 261 | arr2.append("nn_batch_norm_size") 262 | 263 | val = "" 264 | if nn_in_number_scale_values != 0: 265 | for i, v in enumerate(nn_in_number_scale_values): 266 | val += v 267 | if (i+1) != len(nn_in_number_scale_values): 268 | val += " + " 269 | arr1.append(val) 270 | 
arr2.append("nn_scale_size") 271 | 272 | val = "" 273 | if nn_in_number_eltwise_values != 0: 274 | arr1.append(val) 275 | arr2.append("nn_in_number_eltwise_size") 276 | 277 | val = "" 278 | if nn_input_size_eltwise_values != 0: 279 | arr1.append(val) 280 | arr2.append("nn_input_size_eltwise_size") 281 | 282 | val = "" 283 | if nn_in_number_concat_values != 0: 284 | arr1.append(val) 285 | arr2.append("nn_in_number_concat_size") 286 | 287 | val = "" 288 | if nn_input_size_concat_values != 0: 289 | arr1.append(val) 290 | arr2.append("nn_input_size_concat_size") 291 | 292 | maximum = [] 293 | for l in range(len(nn_in_data_size_conv_values)): 294 | out = (int(nn_in_data_size_conv_values[l]) + int(nn_padding_conv_values[l]) * 2 -\ 295 | int(nn_channel_size_conv_values[l]))/int(nn_stride_conv_values[l]) + 1; 296 | val = out * out * int(nn_out_number_conv_values[l]) 297 | maximum.append(val) 298 | for l1 in range(len(nn_in_data_size_pooling_values)): 299 | out = (int(nn_in_data_size_pooling_values[l1]) + int(nn_padding_pooling_values[l1])*2 -\ 300 | int(nn_channel_size_pooling_values[l1]))/int(nn_stride_pooling_values[l1]) + 1 301 | val = int(out) * int(out) * int(nn_in_number_pooling_values[l1]) 302 | maximum.append(val) 303 | if nn_in_number_fc_values != 0: 304 | for l2 in range(len(nn_in_number_fc_values)): 305 | val = int(nn_in_number_fc_values[l2]) * int(nn_channel_size_fc_values[l2]) *\ 306 | int(nn_channel_size_fc_values[l2]) 307 | maximum.append(val) 308 | maxim = max(maximum) 309 | arr1.append(str(maxim)) 310 | arr2.append("maximum") 311 | 312 | return arr1, arr2 313 | 314 | 315 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/runtime.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "acc_config.h" 10 | #include "acc_ctrl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | void fc_software(float* feature_in,float* feature_out,float* weight,float* bias,int num_input,int num_output); 21 | 22 | void software_validate_conv(int num_input, 23 | int num_output, 24 | int kernel_size, 25 | int stride, 26 | int padding, 27 | int feature_in_size, 28 | int feature_out_size, 29 | short int feature_in[][32], 30 | short int feature_out[][32], 31 | short int weight[][32], 32 | int* bias 33 | ); 34 | void software_validate_pooling(int num_input, 35 | int feature_in_size, 36 | int feature_out_size, 37 | short int feature_in[][32], 38 | short int feature_out[][32]); 39 | 40 | void input_feature_ready(short int input_feature[][32]); 41 | void weight_ready(short int weight[][32]); 42 | void para_ready(int para[][512]); 43 | 44 | short int temp_array[20480][32] = {0}; 45 | int main() 46 | { 47 | short int input_feature[16*16][32] = {0}; 48 | short int output_feature[16*16][32] = {0}; 49 | short int weight[200][32] = {0}; 50 | int bias[512] = {0}; 51 | int para[3][512] = {0}; 52 | 53 | //1.input feature ready 54 | input_feature_ready(input_feature); 55 | //2.weight ready 56 | weight_ready(weight); 57 | //3.para ready 58 | para_ready(para); 59 | 60 | acc_ctrl sub_net0(ACC0_PARA_OFFSET, //para_offset_addr 61 | ACC0_WEIGHT_OFFSET, //weight_offset_addr 62 | ACC0_DATA_IN_OFFSET, //data_in_offset_addr 63 | ACC0_CTRL_OFFSET //ctrl_addr 64 | ); 65 | 66 | acc_ctrl sub_net1(ACC1_PARA_OFFSET, //para_offset_addr 67 | ACC1_WEIGHT_OFFSET, 
//weight_offset_addr 68 | ACC1_DATA_IN_OFFSET, //data_in_offset_addr 69 | ACC1_CTRL_OFFSET //ctrl_addr 70 | ); 71 | 72 | acc_ctrl sub_net2(ACC2_PARA_OFFSET, //para_offset_addr 73 | ACC2_WEIGHT_OFFSET, //weight_offset_addr 74 | ACC2_DATA_IN_OFFSET, //data_in_offset_addr 75 | ACC2_CTRL_OFFSET //ctrl_addr 76 | ); 77 | 78 | sub_net0.write_para(para[0],512*4); 79 | sub_net1.write_para(para[1],512*4); 80 | sub_net2.write_para(para[2],512*4); 81 | 82 | sub_net0.write_weight(weight,200*64); 83 | sub_net1.write_weight(weight,200*64); 84 | sub_net2.write_weight(weight,200*64); 85 | 86 | sub_net0.write_data(input_feature,16*16*64); 87 | 88 | sub_net0.start_process(0); 89 | sub_net0.start_process(1); 90 | sub_net1.start_process(0); 91 | sub_net1.start_process(1); 92 | sub_net2.start_process(0); 93 | sub_net2.start_process(1); 94 | 95 | sub_net2.read_data(output_feature,16*16*64); 96 | 97 | int i,j,k; 98 | for(i=0;i<16;i++) 99 | { 100 | for(j=0;j<16;j++) 101 | cout << setw(10)<< output_feature[i*16+j][0]/64.0 <<" "; 102 | cout << endl; 103 | } 104 | cout <<"test finish"<< endl; 105 | return 0; 106 | } 107 | 108 | void input_feature_ready(short int input_feature[][32]) 109 | { 110 | int i,j; 111 | for(i=0;i<16*16;i++) 112 | { 113 | for(j=0;j<1;j++) 114 | input_feature[i][j] = 1<<6; 115 | for(j=1;j<32;j++) 116 | input_feature[i][j] = 0<<6; 117 | } 118 | } 119 | 120 | void weight_ready(short int weight[][32]) 121 | { 122 | int i; 123 | for(i=0;i<20;i++) 124 | { 125 | weight[i][0] = 16; 126 | } 127 | weight[20][0] = 0; 128 | weight[21][0] = 0; 129 | } 130 | 131 | void para_ready(int para[][512]) 132 | { 133 | int i; 134 | 135 | //===========================subnet0 136 | //0-1.layer_num 137 | para[0][0] = 1; 138 | //0-2.conv para 139 | para[0][16+0] = 1;//N 140 | para[0][16+1] = 3;//K 141 | para[0][16+2] = 1;//M 142 | para[0][16+3] = 16;//Rin 143 | para[0][16+4] = 16;//Cin 144 | para[0][16+5] = 16;//R 145 | para[0][16+6] = 16;//C 146 | para[0][16+7] = 1;//S 147 | para[0][16+8] = 1;//P 148 | para[0][16+9] = 1;//act 149 | para[0][16+10] = 0;//weight_offset 150 | para[0][16+11] = 20;//bias_offset 151 | para[0][16+12] = 0;//in_offset 152 | para[0][16+13] = 0;//out_offset 153 | para[0][16+14] = 0;//inport 154 | para[0][16+15] = 0; 155 | 156 | //1-1.layer_num 157 | para[0][256+0] = 1; 158 | //1-2.conv_para 159 | para[0][256+16+0] = 1; //N 160 | para[0][256+16+1] = 3; //K 161 | para[0][256+16+2] = 1; //M 162 | para[0][256+16+3] = 16; //Rin 163 | para[0][256+16+4] = 16; //Cin 164 | para[0][256+16+5] = 16; //R 165 | para[0][256+16+6] = 16; //C 166 | para[0][256+16+7] = 1; //S 167 | para[0][256+16+8] = 1; //P 168 | para[0][256+16+9] = 1; //act 169 | para[0][256+16+10] = 0; //weight_offset 170 | para[0][256+16+11] = 20; //bias_offset 171 | para[0][256+16+12] = 0; //in_offset 172 | para[0][256+16+13] = 0; //out_offset 173 | para[0][256+16+14] = 0; //inport 174 | para[0][256+16+15] = 0; 175 | 176 | 177 | //===========================subnet1 178 | //0-1.layer_num 179 | para[1][0] = 1; 180 | //0-2.conv para 181 | para[1][16+0] = 1;//N 182 | para[1][16+1] = 3;//K 183 | para[1][16+2] = 1;//M 184 | para[1][16+3] = 16;//Rin 185 | para[1][16+4] = 16;//Cin 186 | para[1][16+5] = 16;//R 187 | para[1][16+6] = 16;//C 188 | para[1][16+7] = 1;//S 189 | para[1][16+8] = 1;//P 190 | para[1][16+9] = 1;//act 191 | para[1][16+10] = 0;//weight_offset 192 | para[1][16+11] = 20;//bias_offset 193 | para[1][16+12] = 0;//in_offset 194 | para[1][16+13] = 0;//out_offset 195 | para[1][16+14] = 0;//inport 196 | para[1][16+15] = 0; 197 | 198 | 
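// (Layout note, inferred from the writes above and below: each accelerator
//  consumes one 512-int parameter page. Word 0 carries the layer count, words
//  16..31 carry the 16 conv parameters of the first layer (N, K, M, Rin, Cin,
//  R, C, S, P, act, weight/bias/in/out offsets, inport), and a second copy of
//  the same structure starts at word 256 -- apparently one block per
//  start_process() call issued in main().)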
//1-1.layer_num 199 | para[1][256+0] = 1; 200 | //1-2.conv_para 201 | para[1][256+16+0] = 1; //N 202 | para[1][256+16+1] = 3; //K 203 | para[1][256+16+2] = 1; //M 204 | para[1][256+16+3] = 16; //Rin 205 | para[1][256+16+4] = 16; //Cin 206 | para[1][256+16+5] = 16; //R 207 | para[1][256+16+6] = 16; //C 208 | para[1][256+16+7] = 1; //S 209 | para[1][256+16+8] = 1; //P 210 | para[1][256+16+9] = 1; //act 211 | para[1][256+16+10] = 0; //weight_offset 212 | para[1][256+16+11] = 20; //bias_offset 213 | para[1][256+16+12] = 0; //in_offset 214 | para[1][256+16+13] = 0; //out_offset 215 | para[1][256+16+14] = 0; //inport 216 | para[1][256+16+15] = 0; 217 | 218 | //===========================subnet2 219 | //0-1.layer_num 220 | para[2][0] = 1; 221 | //0-2.conv para 222 | para[2][16+0] = 1;//N 223 | para[2][16+1] = 3;//K 224 | para[2][16+2] = 1;//M 225 | para[2][16+3] = 16;//Rin 226 | para[2][16+4] = 16;//Cin 227 | para[2][16+5] = 16;//R 228 | para[2][16+6] = 16;//C 229 | para[2][16+7] = 1;//S 230 | para[2][16+8] = 1;//P 231 | para[2][16+9] = 1;//act 232 | para[2][16+10] = 0;//weight_offset 233 | para[2][16+11] = 20;//bias_offset 234 | para[2][16+12] = 0;//in_offset 235 | para[2][16+13] = 0;//out_offset 236 | para[2][16+14] = 0;//inport 237 | para[2][16+15] = 0; 238 | 239 | //1-1.layer_num 240 | para[2][256+0] = 1; 241 | //1-2.conv_para 242 | para[2][256+16+0] = 1; //N 243 | para[2][256+16+1] = 3; //K 244 | para[2][256+16+2] = 1; //M 245 | para[2][256+16+3] = 16; //Rin 246 | para[2][256+16+4] = 16; //Cin 247 | para[2][256+16+5] = 16; //R 248 | para[2][256+16+6] = 16; //C 249 | para[2][256+16+7] = 1; //S 250 | para[2][256+16+8] = 1; //P 251 | para[2][256+16+9] = 1; //act 252 | para[2][256+16+10] = 0; //weight_offset 253 | para[2][256+16+11] = 20; //bias_offset 254 | para[2][256+16+12] = 0; //in_offset 255 | para[2][256+16+13] = 0; //out_offset 256 | para[2][256+16+14] = 0; //inport 257 | para[2][256+16+15] = 0; 258 | } 259 | 260 | void software_validate_conv(int num_input, 261 | int num_output, 262 | int kernel_size, 263 | int stride, 264 | int padding, 265 | int feature_in_size, 266 | int feature_out_size, 267 | short int feature_in[][32], 268 | short int feature_out[][32], 269 | short int weight[][32], 270 | int* bias 271 | ) 272 | { 273 | int i,j,k,x,y,z; 274 | int temp; 275 | short int* temp_array = new short int[(feature_in_size+2*padding) * (feature_in_size+2*padding) * num_input]; 276 | 277 | for(k = 0 ; k < num_input; k++) 278 | for(i = 0 ; i < feature_in_size+2*padding; i++) 279 | for(j = 0 ; j < feature_in_size+2*padding ; j++) 280 | temp_array[ k * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + i * (feature_in_size+2*padding ) + j] = 0; 281 | 282 | for(j = 0 ; j < num_input; j++) 283 | for(x = padding; x < feature_in_size + padding; x++) 284 | for(y = padding; y < feature_in_size + padding; y++) 285 | temp_array[ j * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + x * (feature_in_size+2*padding ) + y] 286 | = feature_in[(j/32) * feature_in_size * feature_in_size +(x-padding)*feature_in_size + (y-padding)][j%32]; 287 | //input_feature[(j/32) * feature_in_size * feature_in_size +(x-padding)*feature_in_size + (y-padding)].range(16*(j%32)+15,16*(j%32)); 288 | 289 | cout <<"software processing..." 
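// (Annotation for this reference model: features are packed as
//  [spatial_index][32] with channel j stored at row j/32, lane j%32, and each
//  weight product is divided by 64 to undo the 1<<6 fixed-point scaling that
//  input_feature_ready() applies.)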
<< endl; 290 | for(i = 0 ; i < num_output; i++) 291 | { 292 | for(x = 0 ; x < feature_in_size-kernel_size+1+2*padding; x+=stride) 293 | for(y = 0; y < feature_in_size-kernel_size+1+2*padding;y+=stride) 294 | { 295 | //temp = bias[i].range(15,0); 296 | //temp = 0; 297 | temp = bias[i]; 298 | for(j = 0 ; j < num_input; j++) 299 | { 300 | for(k = 0 ; k < kernel_size;k++) 301 | { 302 | for(z = 0 ; z < kernel_size; z++) 303 | { 304 | 305 | temp += (int)(temp_array[j * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + x * (feature_in_size+2*padding) + y + (k*(feature_in_size+2*padding) + z)]) 306 | *(int)(weight[(i/32)*num_input*kernel_size*kernel_size +j*kernel_size*kernel_size + k*kernel_size + z][i%32])/64; 307 | } 308 | } 309 | } 310 | feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32]= (temp < 0) ? 0 : temp; 311 | // if( feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32] <0) 312 | // { 313 | // feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32] = 0; 314 | // //cout <<"!!"<< i << ":" < temp_max) 346 | temp_max = temp_array[z]; 347 | 348 | feature_out[i/32*feature_out_size*feature_out_size + (j/2) * feature_out_size + k/2][i%32] = temp_max; 349 | } 350 | } 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /netGenerator/netGen/generate_accInst.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def conv_default_function(idx, parameters): 4 | 5 | creator = "conv_acc<"+",".join(parameters)+"> convAcc" + str(idx) +";" 6 | func = '''void conv_layer_acc_'''+str(idx)+'''( 7 | Tparam N, 8 | Tparam K, 9 | Tparam M, 10 | Tparam R_IN, 11 | Tparam C_IN, 12 | Tparam C_OUT, 13 | Tparam R_OUT, 14 | Tparam S, 15 | Tparam P, 16 | Tparam act, 17 | Tparam inport, 18 | Tparam weight_offset, 19 | Tparam bias_offset, 20 | Tparam in_offset, 21 | Tparam out_offset, 22 | //ap_fixed<32,26>* layer_bias, 23 | data_type_itf* i_weight, 24 | data_type_itf* i_data, 25 | data_type_itf* out_data 26 | ){ 27 | convAcc'''+str(idx)+'''.conv_layer_acc_mbuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset, i_weight, i_data, out_data); 28 | };''' 29 | return creator, func 30 | 31 | def max_pool_default_function(idx, parameters): 32 | creator = "max_pool_acc<"+",".join(parameters)+"> maxPoolAcc"+str(idx)+";" 33 | func = '''void max_pool_layer_acc_'''+str(idx)+'''( 34 | Tparam R_in, 35 | Tparam C_in, 36 | Tparam N, 37 | Tparam K, 38 | Tparam R, 39 | Tparam C, 40 | Tparam S, 41 | Tparam P, 42 | Tparam act, 43 | Tparam i_offset, 44 | Tparam o_offset, 45 | data_type_itf* i_data, 46 | data_type_itf* o_data){ 47 | maxPoolAcc'''+str(idx)+'''.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 48 | };''' 49 | return creator, func 50 | 51 | def fc_default_function(idx, parameters): 52 | creator = "fc_acc<"+",".join(parameters)+"> fcAcc"+str(idx)+";" 53 | 54 | func = '''void fc_layer_acc_'''+str(idx)+'''( 55 | Tparam N, 56 | Tparam K, 57 | Tparam M, 58 | Tparam R_IN, 59 | Tparam C_IN, 60 | Tparam C_OUT, 61 | Tparam R_OUT, 62 | Tparam S, 63 | Tparam P, 64 | Tparam act, 65 | Tparam weight_offset, 66 | Tparam bias_offset, 67 | Tparam in_offset, 68 | Tparam out_offset, 69 | data_type_itf* layer_bias, 70 | data_type_itf* i_weight, 71 | data_type_itf* i_data, 72 | 
data_type_itf* out_data 73 | ){ 74 | fcAcc'''+str(idx)+'''.fc_layer_acc_mbuf(N, M, R_IN, C_IN, K, act, 75 | i_weight, layer_bias, 76 | weight_offset, bias_offset, in_offset, out_offset, 77 | i_data, out_data); 78 | };''' 79 | return creator, func 80 | 81 | 82 | def conv_pool_default_func(parameters): 83 | if len(parameters) == 3: 84 | idx1, idx2, idx3 = parameters 85 | idx1 = idx1.strip() 86 | idx2 = idx2.strip() 87 | idx3 = idx3.strip() 88 | print("conv pool generation parameter list len(): ", len(parameters)) 89 | creator, func = conv_pool(idx1, idx2, idx3) 90 | elif len(parameters) == 2: 91 | idx1, idx2 = parameters 92 | idx1 = idx1.strip() 93 | idx2 = idx2.strip() 94 | print("conv pool generation parameter list len(): ", len(parameters)) 95 | creator, func = single_conv(idx1, idx2) 96 | 97 | return creator, func 98 | 99 | 100 | def conv_pool(idx1, idx2, idx3): 101 | func = '''void conv_pool_acc_'''+str(idx1)+'''( 102 | Tparam* param_port, 103 | //ap_fixed<32,26>* bias_in, 104 | data_type_itf* weight_in, 105 | data_type_itf* data_in, 106 | data_type_itf* data_out 107 | ) 108 | { 109 | Tparam layer_num_local[16]; 110 | Tparam param_conv_local[16]; 111 | Tparam param_pool_local[16]; 112 | 113 | for (unsigned int ll = 0; ll < 16; ll++) 114 | { 115 | layer_num_local[ll] = param_port[ll]; 116 | } 117 | 118 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 119 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 176 | //ap_fixed<32,26>* bias_in, 177 | data_type_itf* weight_in, 178 | data_type_itf* data_in, 179 | data_type_itf* data_out 180 | ) 181 | { 182 | Tparam layer_num_local[16]; 183 | Tparam param_conv_local[16]; 184 | Tparam param_pool_local[16]; 185 | 186 | for (unsigned int ll = 0; ll < 16; ll++) 187 | { 188 | layer_num_local[ll] = param_port[ll]; 189 | } 190 | 191 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 192 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 237 | //ap_fixed<32,26>* bias_in, 238 | data_type_itf* weight_in, 239 | data_type_itf* data_in, 240 | data_type_itf* data_out 241 | ) 242 | { 243 | Tparam layer_num_local[16]; 244 | Tparam param_conv_local[16]; 245 | Tparam param_pool_local[16]; 246 | 247 | for (unsigned int ll = 0; ll < 16; ll++) 248 | { 249 | layer_num_local[ll] = param_port[ll]; 250 | } 251 | 252 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..."
<< endl; 253 | cout << "LAYRE ACC: CONV will process "<< layer_num_local[0] << " layers" < 5 | #include 6 | #include "hls_stream.h" 7 | #include "activation_functions.h" 8 | 9 | using namespace std; 10 | 11 | template 12 | class conv_acc { 13 | 14 | private: 15 | int conv_layer_number; 16 | 17 | public: 18 | conv_acc() : conv_layer_number(0) { conv_layer_number = 0; }; 19 | 20 | ////------------------------------C++ debugging functions---------------------------------------//// 21 | // Reset output buffer 22 | void out_buf_reset(G buf[][Tr][Tc]) { 23 | for (int i = 0; i < Tm; i++) { 24 | for (int j = 0; j < Tr; j++) { 25 | for (int k = 0; k < Tc; k++) { 26 | buf[i][j][k] = G(0); 27 | } 28 | } 29 | } 30 | } 31 | // Reset weight buffer 32 | void w_buf_reset(int K, W buf[][Tm][K_max][K_max]) { 33 | for (int i = 0; i < Tn; i++) { 34 | for (int j = 0; j < Tm; j++) { 35 | for (int k = 0; k < K; k++) { 36 | for (int l = 0; l < K; l++) { 37 | buf[i][j][k][l] = W(0); 38 | } 39 | } 40 | } 41 | } 42 | } 43 | // Reset bias buffer 44 | void b_buf_reset(W buf[]) { 45 | for (int i = 0; i < Tm; i++) { 46 | buf[i] = W(0); 47 | } 48 | } 49 | 50 | ////-----------------------------Accelerator Functions---------------------------------------//// 51 | // Load bias data 52 | void b_buf_load(W buf[], ap_fixed<32,26> *layer_bias, int bias_offset, int m) { 53 | for (int i = 0; i < Tm; i++) { 54 | buf[i].range(15,0) = (*(layer_bias + bias_offset + i + m)).range(15,0); 55 | // cout << "Read bias location: " << bias_offset + i + m << " Read bias data: " << buf[i] << endl; 56 | } 57 | } 58 | // Tn << 32 && N << 32 59 | void in_buf_load_axi( 60 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 61 | Itf* i_data, 62 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 63 | Itf data_tmp = 0; 64 | // valid data portion 65 | for (int j = r * S - P; j < (r + Tr - 1) * S + K - P ; j++) {// 66 | for (int k = c * S - P; k < (c + Tc -1) * S + K - P; k++) { 67 | #pragma HLS PIPELINE 68 | for (int i = 0; i < Tn; i += Tn) { 69 | #pragma HLS UNROLL 70 | if ((i + n >= N) || j < 0 || j >= R_IN || k < 0 || k >= C_IN) { 71 | for (int wr = 0; wr < Tn; wr++) { 72 | #pragma HLS UNROLL 73 | buf[wr][j - r * S + P][k - c * S + P] = T(0); 74 | } 75 | } else { 76 | data_tmp = *(i_data + in_offset + (i + n)/32 * R_IN * C_IN + j * R_IN + k); 77 | for (int wr = 0; wr < Tn; wr++) { 78 | #pragma HLS UNROLL 79 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n + wr)%32) * 16); 80 | } 81 | } 82 | } 83 | } 84 | } 85 | } 86 | void in_buf_load_bram( 87 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 88 | Itf* i_data, 89 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 90 | Itf data_tmp = 0; 91 | // valid data portion 92 | for (int j = r * S - P; j < (r + Tr - 1) * S + K - P ; j++) {// 93 | for (int k = c * S - P; k < (c + Tc -1) * S + K - P; k++) { 94 | #pragma HLS PIPELINE 95 | for (int i = 0; i < Tn; i += Tn) { 96 | #pragma HLS UNROLL 97 | if ((i + n >= N) || j < 0 || j >= R_IN || k < 0 || k >= C_IN) { 98 | for (int wr = 0; wr < Tn; wr++) { 99 | #pragma HLS UNROLL 100 | buf[wr][j - r * S + P][k - c * S + P] = T(0); 101 | } 102 | } else { 103 | data_tmp = *(i_data + in_offset + (i + n)/32 * R_IN * C_IN + j * R_IN + k); 104 | for (int wr = 0; wr < Tn; wr++) { 105 | #pragma HLS UNROLL 106 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n + wr)%32) * 16); 
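// (Annotation: each Itf word packs 32 16-bit fixed-point channels, so the
//  .range() slice above extracts the 16 bits belonging to channel (n+wr)%32
//  from data_tmp -- the same unpacking in_buf_load_axi performs, only reading
//  from the BRAM-side out_data pointer instead of the AXI input.)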
107 | } 108 | } 109 | } 110 | } 111 | } 112 | } 113 | 114 | void in_buf_load( 115 | bool inport, 116 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 117 | Itf* i_data, 118 | Itf* out_data, 119 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 120 | 121 | if(inport == 0) { 122 | in_buf_load_axi(buf, i_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 123 | cout << "input data with i_data!" << endl; 124 | } else { 125 | in_buf_load_bram(buf, out_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 126 | cout << "input data with out_data!" << endl; 127 | } 128 | } 129 | 130 | // Load weight squeezed in the N dimension 131 | void w_buf_load_512(W buf[][Tm][K_max][K_max], 132 | Itf *layer_weights, 133 | int weight_offset, 134 | int n, int m, int K, int N, int M) 135 | { 136 | Itf w_tmp = 0; 137 | for (int k1 = 0; k1 < K; k1++) { 138 | for (int k2 = 0; k2 < K; k2++) { 139 | #pragma HLS PIPELINE 140 | for (int i = 0; i < Tm; i++) { // Tm greater than 32 141 | for (int j = 0; j < Tn; j += Tn) { // Tn smaller than 32 142 | #pragma HLS UNROLL 143 | w_tmp = *(layer_weights + weight_offset + ((j + n)/32)* M * K * K + (i + m) * K * K + k1*K + k2); 144 | for (int wr = 0; wr < Tn; wr++) { 145 | #pragma HLS UNROLL 146 | buf[wr][i][k1][k2].range(15,0) = w_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n+wr)%32) * 16); 147 | } 148 | } 149 | } 150 | } 151 | } 152 | } 153 | 154 | // Load weight squeezed in the M dimension 155 | void w_buf_load_512_tm(W buf[][Tm][K_max][K_max], 156 | Itf *layer_weights, 157 | int weight_offset, 158 | int n, int m, int K, int N, int M) 159 | { 160 | Itf w_tmp = 0; 161 | for (int k1 = 0; k1 < K; k1++) { 162 | for (int k2 = 0; k2 < K; k2++) { 163 | for (int j = 0; j < Tn; j++) { // Tn smaller than 32 164 | #pragma HLS PIPELINE 165 | for (int i = 0; i < Tm; i+=32) { // Tm greater than 32 166 | w_tmp = *(layer_weights + weight_offset + ((j + n)/32)* M * K * K + (i + m) * K * K + k1*K + k2); 167 | for (int wr = 0; wr < 32; wr++) { 168 | #pragma HLS UNROLL 169 | buf[j][i+wr][k1][k2].range(15,0) = w_tmp.range((wr%32 + 1) * 16 - 1, ((wr)%32) * 16); 170 | } 171 | } 172 | } 173 | } 174 | } 175 | } 176 | 177 | // Convolution computation kernel Tm, Tn based 178 | void conv_engine(T in_buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], W w_buf[][Tm][K_max][K_max], 179 | W b_buf[], G out_buf[][Tr][Tc], int S, int n, int N, int r, int c, int K, int R_OUT, int C_OUT, 180 | int w_offset, int i_offset) { 181 | if (n >= 0 && n - Tn < N) { 182 | for (int i = 0; i < K; i++) { 183 | for (int j = 0; j < K; j++) { 184 | for (int tr = 0; tr < Tr && tr < R_OUT - r; tr++) { 185 | for (int tc = 0; tc < Tc; tc++) { 186 | #pragma HLS PIPELINE 187 | for (int tm = 0; tm < Tm; tm++) { 188 | #pragma HLS UNROLL 189 | for (int tn = 0; tn < Tn; tn++) { 190 | #pragma HLS UNROLL 191 | if (i == 0 && j == 0 && tn == 0 && n == 0) 192 | out_buf[tm][tr][tc] = b_buf[tm] + w_buf[tn][tm][i + w_offset][j] * 193 | in_buf[tn][S * (tr) + i + i_offset][S * (tc) + 194 | j]; 195 | else 196 | out_buf[tm][tr][tc] = out_buf[tm][tr][tc] + w_buf[tn][tm][i + w_offset][j] * 197 | in_buf[tn][S * (tr) + i + i_offset][ 198 | S * (tc) + j]; 199 | } 200 | } 201 | } 202 | } 203 | } 204 | } 205 | } 206 | } 207 | 208 | // Ouput out_buf data to output interface 209 | void output_res_512(ap_fixed<16,10> out_buf[][Tr][Tc], 210 | ap_int<512>* out_data, 211 | int out_offset, 212 | int n, int m, int r, int c, int N, int M, 213 | int R_OUT, int C_OUT, bool act) 214 | { 215 | ap_int<512> out_tmp = 0; 
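// (Note on this routine: results are repacked 32 lanes at a time into one
//  512-bit word before the single wide store. RELU() is applied to every lane
//  in the visible code, so the 'act' argument appears to be accepted here but
//  never consulted.)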
216 | ap_fixed<16,10> tmp = 0; 217 | ap_fixed<16,10> tmp_outbuf = 0; 218 | if (n >= N - Tn) 219 | { 220 | for (int j = r; (j < r + Tr) && (j < R_OUT); j++) 221 | { 222 | for (int k = c; (k < c + Tc) && (k < C_OUT); k++) 223 | { 224 | for (int wr = 0; wr < Tm && wr < M; wr += 32) // Tm should always greater than 32, otherwise this will not work 225 | { 226 | #pragma HLS PIPELINE 227 | for (int wr_d = 0; wr_d < 32; wr_d++) 228 | { 229 | #pragma HLS UNROLL 230 | if(m + wr + wr_d < M) 231 | { 232 | tmp_outbuf = RELU(out_buf[wr + wr_d][j - r][k - c]); 233 | tmp.range(15, 0) = tmp_outbuf.range(15, 0); 234 | } 235 | else 236 | { 237 | tmp.range(15,0) = 0; 238 | } 239 | out_tmp.range(16 * (wr_d + 1) - 1, 16 * (wr_d)) = tmp.range(15,0); 240 | } 241 | *(out_data + out_offset + ((m / Tm) + (wr / 32)) * R_OUT * C_OUT + j * C_OUT + k) = out_tmp; 242 | } 243 | } 244 | } 245 | } 246 | } 247 | // + int(wr / 32) 248 | ///////////////////////------------------conv accelerator----------------////////////////////////// 249 | #if _LAYER_MODE_ // layer function with cast port 250 | void conv_layer_acc_mbuf( 251 | int N, //input feature number 252 | int K, //input kernel size 253 | int M, // output feature number 254 | int R_IN, // input Row 255 | int C_IN, // input column 256 | int R_OUT, // output Row 257 | int C_OUT, // output column 258 | int S, // stride size 259 | int P, // padding size 260 | bool act, // activation function bit (1-- with act, 0--without act) 261 | bool inport, 262 | int weight_offset, 263 | int bias_offset, 264 | int in_offset, 265 | int out_offset, 266 | ap_fixed<32,26> *layer_bias, 267 | Itf *i_weight, 268 | Itf *i_data, 269 | Itf *out_data ) { // out[M][R][C] 270 | 271 | /***************local data buffer groups******************************/ 272 | T in_buf_0[Tn][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max]; 273 | W w_buf_0[Tn][Tm][K_max][K_max]; 274 | W b_buf_0[Tm]; 275 | G out_buf_0[Tm][Tr][Tc]; 276 | 277 | #pragma HLS ARRAY_PARTITION variable = in_buf_0 complete dim = 1 278 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 1 279 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 2 280 | #pragma HLS ARRAY_PARTITION variable = b_buf_0 complete 281 | #pragma HLS ARRAY_PARTITION variable = out_buf_0 complete dim = 1 282 | 283 | //--------------------------Initial data load ---------------------------------------------// 284 | for (int r = 0; r < R_OUT; r += Tr) 285 | { 286 | for (int c = 0; c < C_OUT; c += Tc) 287 | { 288 | for (int m = 0; m < M; m += Tm) 289 | { 290 | for (int n = 0; n < N; n += Tn) 291 | { 292 | 293 | //--------------------------Load input B W D in ping-pong manner-------------------------// 294 | b_buf_load(b_buf_0, layer_bias, bias_offset, m); 295 | 296 | // w_buf_load_512_tm(w_buf_0, i_weight, weight_offset, n, m, K, N, M); 297 | 298 | in_buf_load(inport, in_buf_0, i_data, out_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 299 | 300 | //------------------------------compute buffered data -----------------------------------// 301 | conv_engine(in_buf_0, w_buf_0, b_buf_0, out_buf_0, S, n, N, r, c, K, R_OUT, C_OUT, 0, 0); 302 | 303 | //---------------------------transfer output data----------------------------------------// 304 | output_res_512(out_buf_0,out_data,out_offset, n, m, r, c, N, M, R_OUT, C_OUT, act); 305 | 306 | } 307 | } 308 | } 309 | } 310 | 311 | 312 | }; 313 | 314 | #endif 315 | 316 | }; 317 | 318 | #endif 319 | --------------------------------------------------------------------------------