├── docs ├── flow.png ├── system_overview_aws.pdf └── system_overview_local.pdf ├── fpga_cnn ├── src │ ├── pow_function.h │ ├── image_converter.h │ ├── data_type.h │ ├── config.h │ ├── resize_image.h │ ├── activation_functions.h │ ├── construct_net.h │ ├── ff_test.cpp │ ├── max_pool_acc_innerpp.h │ ├── fc_acc_innerpp.h │ ├── acc_instance.h │ └── conv_acc_innerpp.h └── testbench │ ├── pooling_validate.h │ ├── conv_validate.h │ ├── fc_validate.h │ ├── pooling_validate.cpp │ ├── print_array.h │ ├── fc_validate.cpp │ └── conv_validate.cpp ├── acc_runtime ├── aws_acc │ ├── api_lib │ │ ├── src │ │ │ ├── acc_ctrl.o │ │ │ └── acc_ctrl.cpp │ │ └── inc │ │ │ ├── cl_tsc.h │ │ │ └── acc_ctrl.h │ └── README.md └── local_acc │ ├── demos │ └── convTest │ │ ├── runtime │ │ ├── runtime.o │ │ ├── acc_ctrl.o │ │ ├── Makefile │ │ ├── acc_config.h │ │ └── runtime.cpp │ ├── api_lib │ ├── inc │ │ ├── cl_tsc.h │ │ └── acc_ctrl.h │ └── src │ │ └── acc_ctrl.cpp │ └── README.md ├── scripts └── hls_impl │ ├── syn.sh │ └── hls_script.tcl ├── netGenerator ├── clean.sh ├── README.md ├── run_generator.sh ├── dse │ ├── model_partition.py │ ├── param_write.py │ ├── global_search.py │ ├── model_extract.py │ ├── model_split.py │ ├── tm_tn_multiAcc.py │ ├── task_analysis.py │ └── helping_functions.py ├── alex.prototxt ├── netGen │ ├── generate_consNet.py │ └── generate_accInst.py └── paramExtractor │ └── extract.py ├── examples └── AlexNet │ ├── net_config_params.txt │ └── acc_ins_params.txt ├── LICENSE └── README.md /docs/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/flow.png -------------------------------------------------------------------------------- /docs/system_overview_aws.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/system_overview_aws.pdf -------------------------------------------------------------------------------- /fpga_cnn/src/pow_function.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/fpga_cnn/src/pow_function.h -------------------------------------------------------------------------------- /docs/system_overview_local.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/docs/system_overview_local.pdf -------------------------------------------------------------------------------- /fpga_cnn/src/image_converter.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/fpga_cnn/src/image_converter.h -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/src/acc_ctrl.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/aws_acc/api_lib/src/acc_ctrl.o -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/runtime: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/runtime -------------------------------------------------------------------------------- 
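A note on how the pieces in the directory tree above fit together, as a minimal sketch (assuming a local clone with Python 3.5 and Vivado HLS on the PATH; the commands simply mirror netGenerator/run_generator.sh and scripts/hls_impl/syn.sh, both included in full later in this listing):

    cd netGenerator
    ./run_generator.sh -i alex.prototxt   # extract params, run the DSE, emit ../gen_proj
    cd ../gen_proj/hls_proj
    ./syn.sh                              # clean stale runs, then vivado_hls -f hls_script.tcl

run_generator.sh chains paramExtractor/extract.py, dse/tm_tn_multiAcc.py, and the two netGen generators, then copies the fpga_cnn sources and the HLS/implementation scripts into gen_proj/.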
/acc_runtime/local_acc/demos/convTest/runtime.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/runtime.o -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/acc_ctrl.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microideax/Open-Dnn/HEAD/acc_runtime/local_acc/demos/convTest/acc_ctrl.o -------------------------------------------------------------------------------- /scripts/hls_impl/syn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 0 1 2 4 | do 5 | rm -rf sub_net_$i vivado_hls.log 6 | done 7 | 8 | echo "Cleaned existing projects!!!" 9 | 10 | echo "Start generating sub-net IPs ..." 11 | 12 | vivado_hls -f hls_script.tcl 13 | #vivado_hls -f hls_script.tcl 14 | -------------------------------------------------------------------------------- /netGenerator/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | rm -rf paramExtractor/*.txt 5 | rm -rf dse/*.txt 6 | rm -rf netGen/*.txt 7 | rm -rf netGen/*.h 8 | rm -rf *.h 9 | 10 | rm -rf ../gen_proj 11 | rm -rf ./dse/__pycache__/ 12 | rm -rf ./dse/.idea/ 13 | rm -rf ./netGen/.idea/ 14 | rm -rf ./paramExtractor/.idea/ 15 | 16 | echo "Cleaned all the intermediate files and newly generated files!!!" 17 | -------------------------------------------------------------------------------- /netGenerator/README.md: -------------------------------------------------------------------------------- 1 | # This is the README file for the NN model generation. 2 | 3 | Follow the steps below to generate the CNN accelerator: 4 | 1. Extract the parameters of an input CNN in the paramExtractor/ folder. Copy the resulting net_config_params.txt to the dse/ folder. 5 | 2. Run the design space exploration in the dse/ folder to get acc_ins_params.txt. 6 | 3. Move acc_ins_params.txt to the netGen/ folder to generate the accelerators. 7 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/Makefile: -------------------------------------------------------------------------------- 1 | runtime:runtime.o acc_ctrl.o 2 | g++ -o runtime runtime.o acc_ctrl.o -I ../../api_lib/inc 3 | runtime.o:runtime.cpp ../../api_lib/src/acc_ctrl.cpp 4 | g++ -c runtime.cpp ../../api_lib/src/acc_ctrl.cpp -I ../../api_lib/inc 5 | acc_ctrl.o:../../api_lib/src/acc_ctrl.cpp 6 | g++ -c ../../api_lib/src/acc_ctrl.cpp -I ../../api_lib/inc 7 | 8 | clean: 9 | rm -f *.o runtime 10 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Table of Contents 4 | 5 | 1. AGFI number: agfi-03391747dc68cd939 6 | 2. Testing demos on this AGFI. 7 | 3. Make sure your sdk_setup.sh is executed successfully. 8 | 9 | # Environment Settings 10 | 1. AWS F1 (f1.2xlarge) instance 11 | 2. Pre-installed Caffe framework 12 | 3. Pre-installed OpenCV package support 13 | 4. Only suitable for the AWS FPGA Developer AMI 14 | 15 | # How to use 16 | ## Hardware setup 17 | Map the AGFI to the platform first; all these demos work with the same AGFI. 18 | 19 | ## Compile and execute the host program 20 | 1. cd into a demo folder, then make 21 | 2. execute the demo with the compiled executable file 22 | --------------------------------------------------------------------------------
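On the host side, the demos drive an accelerator through the acc_ctrl class declared in acc_runtime/aws_acc/api_lib/inc/acc_ctrl.h (shown further down), which wraps the fpga_dma/fpga_pci calls. Below is a minimal host-program sketch, not a shipped demo: the buffer sizes are placeholders, the offsets are illustrative values borrowed from the local convTest demo's acc_config.h (the next file in this listing), and the mode and memId arguments follow the declared signatures only.

#include "acc_ctrl.h"

int main() {
    // One controller object per accelerator; all offsets here are placeholders.
    acc_ctrl acc(/*para*/ 0x10000000, /*weight*/ 0x12000000,
                 /*data_in*/ 0x13000000, /*ctrl*/ 0x00000000);

    static int   params[256];          // layer configuration words
    static short weights[1024][32];    // weights packed 32 shorts per 512-bit word
    static short feature[1024][32];    // input feature map, same packing
    static short result[1024][32];     // output feature map

    acc.write_para(params, sizeof(params));      // lengths are byte counts
    acc.write_weight(weights, sizeof(weights));
    acc.write_data(feature, sizeof(feature));
    acc.start_process(0);                        // kick off the sub-net
    acc.read_data(result, sizeof(result), /*memId*/ 0);
    return 0;
}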
/acc_runtime/local_acc/demos/convTest/acc_config.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CONFIG_H 2 | #define ACC_CONFIG_H 3 | 4 | #define ACC0_PARA_OFFSET 0x10000000 5 | #define ACC0_WEIGHT_OFFSET 0x12000000 6 | #define ACC0_DATA_IN_OFFSET 0x13000000 7 | #define ACC0_CTRL_OFFSET 0x00000000 8 | 9 | 10 | #define ACC1_PARA_OFFSET 0x14000000 11 | #define ACC1_WEIGHT_OFFSET 0x16000000 12 | #define ACC1_DATA_IN_OFFSET 0x00000000 13 | #define ACC1_CTRL_OFFSET 0x00010000 14 | 15 | 16 | #define ACC2_PARA_OFFSET 0x17000000 17 | #define ACC2_WEIGHT_OFFSET 0x19000000 18 | #define ACC2_DATA_IN_OFFSET 0x00000000 19 | #define ACC2_CTRL_OFFSET 0x00020000 20 | 21 | 22 | 23 | #endif -------------------------------------------------------------------------------- /fpga_cnn/testbench/pooling_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef POOLING_VALIDATE_H 2 | #define POOLING_VALIDATE_H 3 | 4 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 5 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 6 | #include "ap_fixed.h" 7 | 8 | class pooling_validate 9 | { 10 | public: 11 | int num_input; 12 | int stride; 13 | int kernel_size; 14 | int inputfeature_size; 15 | int outputfeature_size; 16 | int act; 17 | 18 | ap_int<512> *input_feature; 19 | ap_int<512> *output_feature; 20 | ap_int<512> *output_feature_software; 21 | ap_uint<32> config_list[16]; 22 | 23 | pooling_validate(int num_input,int stride,int kernel_size,int inputfeature_size,int act); 24 | void print_feature_in(void); 25 | }; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /fpga_cnn/src/data_type.h: -------------------------------------------------------------------------------- 1 | // Baseline data type define for the entire design 2 | // TODO: 3 | 4 | #ifndef _DATA_TYPE_H_ 5 | #define _DATA_TYPE_H_ 6 | 7 | #include <vector> 8 | //#include 9 | //#include 10 | 11 | using namespace std; 12 | 13 | typedef unsigned int uint; 14 | typedef uint cnn_size_t; 15 | /* 16 | typedef ap_fixed<64, 32> cnn_data_64; 17 | typedef ap_fixed<32, 16> cnn_data_32; 18 | typedef ap_fixed<16, 8> cnn_data_16; 19 | typedef ap_fixed<8, 4> cnn_data_8; 20 | typedef ap_fixed<4, 2> cnn_data_4; 21 | */ 22 | typedef std::vector > std_vec_t; 23 | typedef std::vector<std_vec_t> std_tensor_t; 24 | typedef std::vector<std_tensor_t> std_tensor_t_3d; 25 | 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/inc/cl_tsc.h: -------------------------------------------------------------------------------- 1 | #ifndef _CL_TSC_H_ 2 | #define _CL_TSC_H_ 3 | 4 | #define CPU_FREQUENCY (3600) // core clock in MHz 5 | #include <stdint.h> 6 | inline uint64_t ticks() { 7 | uint32_t lo, hi; 8 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 9 | return ((uint64_t)hi << 32) | lo; 10 | } 11 | 12 | inline double cycles_to_nanoseconds(uint64_t cycles) { 13 | return (double) cycles / CPU_FREQUENCY * 1000; 14 | } 15 | 16 | inline double cycles_to_microseconds(uint64_t cycles) { 17 | return cycles_to_nanoseconds(cycles) / 1000; 18 | } 19 | 20 | inline double cycles_to_milliseconds(uint64_t cycles) { 21 | return cycles_to_nanoseconds(cycles) / 1000000; 22 | } 23 | 24 | inline double cycles_to_seconds(uint64_t cycles) { 25 | return cycles_to_nanoseconds(cycles) / 1000000000; 26 | } 27 | 28 | #endif //_CL_TSC_H_ 29 | --------------------------------------------------------------------------------
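Both platform runtimes ship the same rdtsc-based timer (the cl_tsc.h above and the identical local copy that follows). A minimal usage sketch for timing a host-side call, assuming CPU_FREQUENCY matches the actual core clock in MHz:

#include "cl_tsc.h"
#include <iostream>

int main() {
    uint64_t t0 = ticks();                 // sample the time-stamp counter
    // ... section under test, e.g. a write_data()/start_process() pair ...
    uint64_t t1 = ticks();
    std::cout << cycles_to_milliseconds(t1 - t0) << " ms" << std::endl;
    return 0;
}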
/acc_runtime/local_acc/api_lib/inc/cl_tsc.h: -------------------------------------------------------------------------------- 1 | #ifndef _CL_TSC_H_ 2 | #define _CL_TSC_H_ 3 | 4 | #define CPU_FREQUENCY (3600) // core clock in MHz 5 | #include <stdint.h> 6 | inline uint64_t ticks() { 7 | uint32_t lo, hi; 8 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 9 | return ((uint64_t)hi << 32) | lo; 10 | } 11 | 12 | inline double cycles_to_nanoseconds(uint64_t cycles) { 13 | return (double) cycles / CPU_FREQUENCY * 1000; 14 | } 15 | 16 | inline double cycles_to_microseconds(uint64_t cycles) { 17 | return cycles_to_nanoseconds(cycles) / 1000; 18 | } 19 | 20 | inline double cycles_to_milliseconds(uint64_t cycles) { 21 | return cycles_to_nanoseconds(cycles) / 1000000; 22 | } 23 | 24 | inline double cycles_to_seconds(uint64_t cycles) { 25 | return cycles_to_nanoseconds(cycles) / 1000000000; 26 | } 27 | 28 | #endif //_CL_TSC_H_ 29 | -------------------------------------------------------------------------------- /examples/AlexNet/net_config_params.txt: -------------------------------------------------------------------------------- 1 | Network Structure: Convolution Pooling Convolution Pooling Convolution Convolution Convolution Pooling InnerProduct InnerProduct InnerProduct 2 | nn_in_data_size_conv: 227 27 13 13 13 3 | nn_channel_size_conv: 11 5 3 3 3 4 | nn_padding_conv: 0 2 1 1 1 5 | nn_stride_conv: 4 1 1 1 1 6 | nn_in_number_conv: 3 96 256 384 384 7 | nn_out_number_conv: 96 256 384 384 256 8 | nn_group_conv: 1 2 1 2 2 9 | nn_bias_conv: 96 256 384 384 256 10 | nn_in_data_size_pooling: 55 27 13 11 | nn_channel_size_pooling: 3 3 3 12 | nn_padding_pooling: 0 0 0 13 | nn_stride_pooling: 2 2 2 14 | nn_in_number_pooling: 96 256 256 15 | nn_in_data_size_fc: 16 | nn_in_number_fc: 9216 4096 4096 17 | nn_out_number_fc: 4096 4096 1000 18 | nn_channel_size_fc: 1 1 1 19 | conv_cut_flag: 1 1 1 1 1 20 | pool_cut_flag: 1 1 1 21 | fc_cut_flag: 1 1 1 22 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/conv_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef CONV_VALIDATE_H 2 | #define CONV_VALIDATE_H 3 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 4 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | #include "ap_fixed.h" 6 | 7 | class conv_validate 8 | { 9 | public: 10 | int layer_num; 11 | ap_int<512> weight[16384]; 12 | ap_int<512> input_feature[6400]; 13 | ap_int<512> output_feature[4096]; 14 | ap_int<512> output_feature_software[4096]; 15 | ap_fixed<32,26> bias[1024]; 16 | ap_uint<32>* param_list; 17 | 18 | 19 | conv_validate(ap_uint<32>* param_list); //(int layer_num, int num_input,int num_output,int kernel_size,int stride,int padding, int inputfeature_size, int inport); 20 | void print_weight(void); 21 | void print_feature_in(void); 22 | void print_bias(void); 23 | // void print_feature_out(void); 24 | // 25 | // void software_conv_process(void); 26 | // void print_feature_out_softeare(void); 27 | // void test_fun(void); 28 | }; 29 | 30 | 31 | 32 | #endif 33 | --------------------------------------------------------------------------------
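A quick sanity check of the extracted values in net_config_params.txt above against alex.prototxt (included near the end of this listing), using out = (in - K + 2*pad)/stride + 1:

    conv1: (227 - 11 + 0)/4 + 1 = 55, the first entry of nn_in_data_size_pooling
    pool1: (55 - 3)/2 + 1 = 27, the second entry of nn_in_data_size_conv
    pool2: (27 - 3)/2 + 1 = 13, feeding conv3/conv4/conv5
    pool5: (13 - 3)/2 + 1 = 6, and 256 * 6 * 6 = 9216, the first entry of nn_in_number_fc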
/examples/AlexNet/acc_ins_params.txt: -------------------------------------------------------------------------------- 1 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 96, 3, 11, 11, 5, 5, 32, 32, 32 2 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 128, 11, 27, 27, 5, 5, 32, 32, 32 3 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 96, 19, 13, 13, 5, 5, 32, 32, 32 4 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 64, 17, 13, 13, 5, 5, 32, 32, 32 5 | conv, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 64, 11, 13, 13, 5, 5, 32, 32, 32 6 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 7 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 8 | max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3 9 | conv_pool, 0,0,0 10 | conv_pool, 1,1,1 11 | conv_pool, 2,2 12 | conv_pool, 3,3 13 | conv_pool, 4,4,2 14 | sub_net_0,2,1024,20300,9662,22688,4374,14580 15 | sub_net_1,1,1024,27660,2704,4056 16 | sub_net_2,2,1024,69140,4056,4056,4056,3380 17 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Table of Contents 4 | 5 | 1. API functions to call the accelerators. 6 | 2. Testing demos. 7 | 8 | # Environment Settings 9 | 1. UltraScale+ VCU118 board in PCIe mode (xdma driver is required) 10 | 2. Pre-installed Caffe framework 11 | 3. Pre-installed OpenCV package support 12 | 4. Pre-installed Vivado Design Suite 13 | 14 | # How to use 15 | 16 | ## Starting from a design 17 | Generate or download the bitstream to the platform 18 | 1. cd into a demo in the bitstream/ folder 19 | 2. start vivado in tcl mode with the 'vivado -mode tcl' command 20 | 3. modify the demo name in the make_spi_mcs.tcl file 21 | 4. source the modified make_spi_mcs.tcl in vivado 22 | 5. source the program_spi.tcl after the .mcs files are generated 23 | 24 | ## Starting from a bitstream file 25 | 1. Download the bitstream to the flash on the board 26 | 2. Re-start your system 27 | 28 | ## Compile and execute the host program 29 | 1. cd into a demo in the demos/ folder 30 | 2. make 31 | 3. execute the executable file 32 | --------------------------------------------------------------------------------
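For the local platform, the host program in the convTest demo talks to the accelerators through the xdma character devices wrapped by the acc_ctrl class (api_lib/inc/acc_ctrl.h, shown a little further down). A minimal sketch, with placeholder buffer sizes, using the address map from demos/convTest/acc_config.h:

#include "acc_ctrl.h"
#include "acc_config.h"

int main() {
    // Accelerator 0, bound to the offsets defined in acc_config.h.
    acc_ctrl acc0(ACC0_PARA_OFFSET, ACC0_WEIGHT_OFFSET,
                  ACC0_DATA_IN_OFFSET, ACC0_CTRL_OFFSET);

    static int   params[256];         // layer configuration words
    static short weights[1024][32];   // 32 shorts per 512-bit interface word
    static short feature[1024][32];
    static short result[1024][32];

    acc0.write_para(params, sizeof(params));     // byte lengths, as in the API
    acc0.write_weight(weights, sizeof(weights));
    acc0.write_data(feature, sizeof(feature));
    acc0.start_process(0);                       // mode flag as declared in acc_ctrl.h
    acc0.read_data(result, sizeof(result));      // the local API has no memId argument
    return 0;
}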
/fpga_cnn/testbench/fc_validate.h: -------------------------------------------------------------------------------- 1 | #ifndef FC_VALIDATE_H 2 | #define FC_VALIDATE_H 3 | #include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 4 | #include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | #include "ap_fixed.h" 6 | 7 | class fc_validate 8 | { 9 | public: 10 | int layer_num; 11 | int num_input; 12 | int num_output; 13 | int act; 14 | 15 | // ap_int<512> weight[1024]; 16 | // ap_int<512> in_feature[1024]; 17 | // ap_int<512> out_feature[1024]; 18 | // ap_int<512> out_feature_software[1024]; 19 | // ap_int<512> bias[32]; 20 | 21 | ap_int<512> *weight; 22 | ap_int<512> *in_feature; 23 | ap_int<512> *out_feature; 24 | ap_int<512> *out_feature_software; 25 | ap_int<512> bias[4096]; 26 | 27 | ap_uint<32> lnum_list[16]; 28 | ap_uint<32> config_list[16*16]; 29 | 30 | fc_validate(int layer_num, int num_input, int num_output, int act); 31 | void print_weight(void); 32 | void print_feature_in(void); 33 | void print_bias(void); 34 | void print_feature_out(void); 35 | 36 | void software_fc_process(void); 37 | void print_software_out(void); 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/api_lib/inc/acc_ctrl.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CTRL_H 2 | #define ACC_CTRL_H 3 | 4 | #include "cl_tsc.h" 5 | #include <iostream> 6 | #include <cstdio> 7 | #include <cstdlib> 8 | #include <cstring> 9 | #include <cassert> 10 | #include <stdint.h> 11 | #include <unistd.h> 12 | #include <fcntl.h> 13 | #include <sys/types.h> 14 | #include <sys/mman.h> 15 | using namespace std; 16 | //==============board level interface============// 17 | #define DEVICE_H2C "/dev/xdma0_h2c_1" 18 | #define DEVICE_C2H "/dev/xdma0_c2h_2" 19 | #define DEVICE_CTRL "/dev/xdma0_user" 20 | 21 | #define MAP_SIZE (8*1024UL) 22 | //================ctrl port addr================= 23 | 24 | class acc_ctrl 25 | { 26 | private: 27 | uint32_t para_offset_addr; 28 | uint32_t weight_offset_addr; 29 | uint32_t data_in_offset_addr; 30 | off_t ctrl_addr; 31 | public: 32 | acc_ctrl( 33 | uint32_t para_offset_addr, 34 | uint32_t weight_offset_addr, 35 | uint32_t data_in_offset_addr, 36 | off_t ctrl_addr 37 | ); 38 | void write_weight(short int weight[][32],int weight_length); 39 | void write_para(int* para_list,int para_length); 40 | void write_data(short int feature[][32],int feature_length); 41 | void start_process(int mode); 42 | void read_data(short int feature[][32],int feature_length); 43 | 44 | }; 45 | 46 | 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /acc_runtime/aws_acc/api_lib/inc/acc_ctrl.h: -------------------------------------------------------------------------------- 1 | #ifndef ACC_CTRL_H 2 | #define ACC_CTRL_H 3 | 4 | #include <iostream> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <cstring> 8 | #include <cassert> 9 | #include <stdint.h> 10 | #include <unistd.h> 11 | #include <fcntl.h> 12 | #include <sys/types.h> 13 | #include <sys/mman.h> 14 | #include "cl_tsc.h" 15 | 16 | #include "fpga_pci.h" 17 | #include "fpga_mgmt.h" 18 | #include "fpga_dma.h" 19 | 20 | using namespace std; 21 | //==============board level interface============// 22 | #define DEVICE_H2C "/dev/xdma0_h2c_1" 23 | #define DEVICE_C2H "/dev/xdma0_c2h_2" 24 | #define DEVICE_CTRL "/dev/xdma0_user" 25 | 26 | //================ctrl port addr================= 27 | 28 | class acc_ctrl 29 | { 30 | private: 31 | uint32_t para_offset_addr; 32 | uint32_t bias_offset_addr; 33 
| uint32_t weight_offset_addr; 34 | uint32_t data_in_offset_addr; 35 | off_t ctrl_addr; 36 | public: 37 | acc_ctrl( 38 | uint32_t para_offset_addr, 39 | uint32_t weight_offset_addr, 40 | uint32_t data_in_offset_addr, 41 | off_t ctrl_addr 42 | ); 43 | 44 | void write_bias(int* bias,int bias_length); 45 | void write_weight(short int weight[][32],int weight_length); 46 | void write_para(int* para_list,int para_length); 47 | void write_data(short int feature[][32],int feature_length); 48 | void start_process(int mode); 49 | void read_data(short int feature[][32],int feature_length,int memId); 50 | }; 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /netGenerator/run_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts i:k:h: option 4 | do 5 | case "${option}" 6 | in 7 | i) INFILE=${OPTARG};; 8 | h) HELP=1;; 9 | esac 10 | done 11 | 12 | #if HELP==1 13 | # echo "./run_generator.sh -i input.prototxt to execute the generation!" 14 | #fi 15 | 16 | ./clean.sh 17 | 18 | echo $INFILE 19 | mkdir -p ../gen_proj 20 | mkdir -p ../gen_proj/hls_proj/src 21 | mkdir -p ../gen_proj/hls_proj/testbench 22 | mkdir -p ../gen_proj/impl_proj/aws_impl 23 | mkdir -p ../gen_proj/impl_proj/local_impl 24 | 25 | echo "script executed!!!" 26 | #--------------1.param extract----------------------- 27 | python3.5 paramExtractor/extract.py --model $INFILE 28 | mv net_config_params.txt dse/ 29 | echo "Finished network parameter extraction." 30 | echo " " 31 | #--------------2.design space exploration------------ 32 | python3.5 dse/tm_tn_multiAcc.py dse/net_config_params.txt 33 | mv acc_ins_params.txt netGen/ 34 | echo "Finished accelerator design space exploration." 35 | echo " " 36 | #--------------3.code generation--------------------- 37 | python3.5 netGen/generate_accInst.py --params netGen/acc_ins_params.txt 38 | python3.5 netGen/generate_consNet.py --params netGen/acc_ins_params.txt 39 | echo "Finished accelerators and sub-nets generation." 40 | echo "Constructing the testing and implementation folder..." 41 | #TODO: move all the files into the correct positions, src/testbench/ 42 | cp ../fpga_cnn/src/* ../gen_proj/hls_proj/src/ 43 | cp ../fpga_cnn/testbench/* ../gen_proj/hls_proj/testbench/ 44 | mv *.h ../gen_proj/hls_proj/src/ 45 | 46 | cp ../scripts/hls_impl/* ../gen_proj/hls_proj/ 47 | cp ../scripts/sys_gen/local_impl/* ../gen_proj/impl_proj/local_impl/ 48 | cp ../scripts/sys_gen/aws_impl/* ../gen_proj/impl_proj/aws_impl/ 49 | 50 | echo "Files copied" 51 | echo "Generation done!!!" 52 | 53 | exit 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) <2019> 2 | 3 | 4 | All rights reserved. 5 | 6 | 7 | General terms and conditions for use of the Open-DNN software. The software 8 | programs comprising "Open-DNN" and the documentation provided with them are 9 | copyright Yao Chen, Deming Chen, the Advanced Digital Sciences Center, Singapore 10 | and the University of Illinois at Urbana-Champaign. 11 | 12 | Only non-commercial, not-for-profit use of this software is permitted. No part 13 | of this software may be incorporated into a commercial product without the 14 | written consent of the authors (Yao Chen and Deming Chen). 
Similarly, use of 15 | this software to assist in the development of new commercial FPGA designs is 16 | prohibited, unless the written consent of the authors is obtained. 17 | 18 | This software is provided "as is" with no warranties or guarantees of support. 19 | All users of the software must take the copy from this site. You may modify or 20 | use the source code for other non-commercial, not-for-profit research endeavours, 21 | provided that all copyright attribution on the source code is retained, and the 22 | original or modified source code is not redistributed, in whole or in part, or 23 | included in or with any commercial product, except by written agreement with 24 | the authors, and full and complete attribution for use of the code is given in 25 | any resulting publications. Subject to these conditions, the software is 26 | provided free of charge to all interested parties. 27 | 28 | When referencing this particular open-source software in a publication, please 29 | cite the following publication: 30 | Yao Chen, Jiong He, Xiaofan Zhang, Cong Hao and Deming Chen, "Cloud-DNN: An Open 31 | Framework for Mapping DNN Models to Cloud FPGAs", Proceedings of ACM/SIGDA 32 | International Symposium on Field Programmable Gate Arrays, February 2019. 33 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/pooling_validate.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cmath> 3 | #include <cstdlib> 4 | #include "pooling_validate.h" 5 | using namespace std; 6 | 7 | pooling_validate::pooling_validate(int num_input,int stride,int kernel_size,int inputfeature_size,int act) 8 | { 9 | 10 | int i,j,k; 11 | this->num_input = num_input; 12 | this->stride = stride; 13 | this->kernel_size = kernel_size; 14 | this->inputfeature_size = inputfeature_size; 15 | this->act = act; 16 | outputfeature_size = (inputfeature_size-kernel_size)/stride + 1; 17 | 18 | config_list[0] = inputfeature_size; 19 | config_list[1] = inputfeature_size; 20 | config_list[2] = num_input; 21 | config_list[3] = kernel_size; 22 | config_list[4] = outputfeature_size; 23 | config_list[5] = outputfeature_size; 24 | config_list[6] = stride; 25 | config_list[7] = 0; 26 | config_list[8] = act; 27 | config_list[9] = 0; 28 | config_list[10] = 0; 29 | config_list[11] = 0; 30 | config_list[12] = 0; 31 | config_list[13] = 0; 32 | config_list[14] = 0; 33 | config_list[15] = 0; 34 | 35 | input_feature = new ap_int<512>[inputfeature_size * inputfeature_size * (int)(ceil(((double)num_input)/32))]; 36 | output_feature = new ap_int<512>[outputfeature_size * outputfeature_size * (int)(ceil(((double)num_input)/32))]; 37 | output_feature_software = new ap_int<512>[outputfeature_size * outputfeature_size * (int)(ceil(((double)num_input)/32))]; 38 | 39 | 40 | for(i = 0 ; i < num_input; i++) 41 | for(j = 0 ; j < inputfeature_size ;j++) 42 | for(k = 0 ; k < inputfeature_size; k++) 43 | // input_feature[i/32*inputfeature_size*inputfeature_size + j*inputfeature_size + k].range(i%32) = rand()%10; 44 | ; 45 | } 46 | 47 | 48 | //void pooling_validate::print_feature_in(void) 49 | //{ 50 | // int i,j,k; 51 | // for(i = 0 ; i < num_input; i++) 52 | // for(j = 0 ; j < inputfeature_size ; j++) 53 | // for(k = 0 ; k < inputfeature_size; k++) 54 | // ; 55 | // 56 | //} 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /scripts/hls_impl/hls_script.tcl: 
-------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ## This file is modified from the tcl script generated with vivado_hls 2018.2. 3 | ## This modified version is used to ease the run of synthesis of the sub_nets. 4 | ## Please edit based on the user manual if necessary. 5 | ############################################################################## 6 | 7 | #if { $argc != 1} { 8 | # puts "The hls_script.tcl script requires only one input." 9 | # puts "For example. hls_script.tcl 0" 10 | # puts "Please try again" 11 | #} else { 12 | # puts [$PNUM = $argv 0] 13 | #} 14 | 15 | open_project ip_gen 16 | 17 | 18 | #add design files to the project 19 | add_files ./src/data_type.h 20 | add_files ./src/config.h 21 | add_files ./src/activation_functions.h 22 | add_files ./src/conv_acc_2ibuf.h 23 | add_files ./src/max_pool_acc_innerpp.h 24 | add_files ./src/fc_acc_innerpp.h 25 | add_files ./src/acc_instance.h 26 | add_files ./src/construct_net.h 27 | add_files ./src/ff_test.cpp 28 | 29 | add_files -tb ./testbench/conv_validate.h 30 | add_files -tb ./testbench/conv_validate.cpp 31 | add_files -tb ./testbench/pooling_validate.h 32 | add_files -tb ./testbench/pooling_validate.cpp 33 | add_files -tb ./testbench/fc_validate.h 34 | add_files -tb ./testbench/fc_validate.cpp 35 | add_files -tb ./testbench/print_array.h 36 | add_files -tb ./src/ff_test.cpp 37 | 38 | for {set i 0} {$i < 3} {incr i} { 39 | set_top sub_net_$i 40 | 41 | open_solution -reset "sub_net_$i" 42 | # UltraScale+ 43 | set_part {xcvu9p-flgb2104-2-i} -tool vivado 44 | 45 | create_clock -period 1.6 -name default 46 | config_compile -name_max_length 500 -pipeline_loops 0 47 | csim_design -clean -compiler gcc 48 | csynth_design 49 | 50 | # If the co-sim is required for verification, uncomment the following line. 
51 | #cosim_design -compiler gcc -trace_level all 52 | 53 | export_design -flow syn -rtl verilog -format ip_catalog 54 | } 55 | 56 | exit 57 | 58 | 59 | -------------------------------------------------------------------------------- /fpga_cnn/src/config.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _CONFIG_H_ 3 | #define _CONFIG_H_ 4 | 5 | #include "ap_fixed.h" 6 | #include "ap_int.h" 7 | 8 | 9 | //extern int layer_count = 0; 10 | //define data type 11 | typedef ap_fixed<16,10> data_type; 12 | typedef ap_fixed<16,10> data_type_w; 13 | typedef ap_fixed<16,10> data_type_o; 14 | typedef ap_fixed<32,26> b_type; 15 | 16 | typedef unsigned int uint; 17 | 18 | typedef ap_uint<32> Tparam; 19 | // counter datatype used in the entire design 20 | typedef ap_int<512> data_type_itf; 21 | //typedef ap_fixed<16,0> param_type; 22 | 23 | #define READ_LEN_i uint(sizeof(ap_int<512>)/sizeof(data_type)) 24 | #define READ_LEN_w uint(sizeof(ap_int<512>)/sizeof(data_type_w)) 25 | #define READ_LEN_o uint(sizeof(ap_int<512>)/sizeof(data_type_o)) 26 | 27 | #define DATA_LEN uint(sizeof(data_type)) 28 | #define DATA_O_LEN int(sizeof(data_type_itf)/sizeof(data_type_o)) 29 | #define DATA_O int(sizeof(data_type_o)) 30 | 31 | // C++ compilation debug mode 32 | #define _LAYER_MODE_ 1 33 | 34 | #ifdef _LAYER_MODE_ 35 | #define IBUF ((Tr-1)*S_max + K_max) 36 | //#define 37 | #endif 38 | 39 | // C++ compilation debug mode 40 | //#ifdef _LAYER_MODE_ 41 | //#define _ACC_MODE_ 0 42 | //#else 43 | //#define _ACC_MODE_ 1 44 | //#endif 45 | 46 | #define _HLS_MODE_ 1 47 | 48 | //#define _BATCH_MODE_ 1 49 | #ifdef _BATCH_MODE_ 50 | #define _KERNEL_DEBUG_ 0 51 | #else 52 | #define _KERNEL_DEBUG_ 1 53 | #endif 54 | #ifndef _HLS_MODE_ 55 | #define _C_DEBUG_MODE_ 1 56 | #endif 57 | 58 | #define _8CH_ 1 59 | 60 | 61 | //network configuration PARAMETERS 62 | int weight_bias_record = 0; 63 | int weight_bias_count_1 = 0; 64 | int weight_bias_count_2 = 0; 65 | int nn_in_data_size_conv[2] = {28, 14}; 66 | int nn_in_number_conv[2] = {1, 6}; 67 | int nn_out_number_conv[2] = {6, 16}; 68 | int nn_channel_size_conv[2] = {5, 5}; 69 | int nn_padding_conv[2] = {2, 0}; 70 | int nn_group_conv[2] = {1, 1}; 71 | int nn_in_data_size_pooling[2] = {28, 10}; 72 | int nn_in_number_pooling[2] = {6, 16}; 73 | int nn_channel_size_pooling[2] = {2, 2}; 74 | int nn_in_data_size_fc[1] = {5}; 75 | int nn_in_number_fc[1] = {16}; 76 | int nn_out_number_fc[1] = {10}; 77 | int nn_channel_size_fc[1] = {5}; 78 | 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /netGenerator/dse/model_partition.py: -------------------------------------------------------------------------------- 1 | # Author Jiong He on 28 June 2018 at ADSC 2 | #!/usr/bin/env python 3 | 4 | from itertools import permutations 5 | 6 | 7 | def partition(layerlist): 8 | """ 9 | partition is a generator that generates all possible partitions of a given layer list into all possible 10 | numbers (i.e., 1 to len(layerlist)) of subpartitions. 11 | :param layerlist: a list containing each layer information in the form of a tuple (layer index, layer name). 
12 | :return: generate one possible partition 13 | """ 14 | if len(layerlist) == 1: 15 | yield [layerlist] 16 | else: 17 | first = layerlist[0] 18 | for each_partition in partition(layerlist[1:]): 19 | # first choice: insert the first element into each of the subpartition's subsets 20 | for groupIdx, group in enumerate(each_partition, 0): 21 | yield each_partition[:groupIdx] + [[first] + group] + each_partition[groupIdx + 1:] 22 | # second choice: insert the first as an individual subset 23 | yield [[first]] + each_partition 24 | 25 | 26 | def partition_to_k(layerlist, k, order=False): 27 | """ 28 | partition_to_k calls partition function and filters those partitions that has k number of groups. If order is True, 29 | all permutations of this partition will be treated as different partition and output one by one. 30 | :param layerlist: a list containing each layer information in the form of a tuple (layer index, layer name). 31 | :param k: number of groups to partitioned into 32 | :param order: whether the order of groups matter 33 | :return: generate one possible partition 34 | """ 35 | for each_partition_candidate in partition(layerlist): 36 | if len(each_partition_candidate) == k: 37 | if not order: 38 | yield each_partition_candidate 39 | else: 40 | for enum_item in permutations(each_partition_candidate): 41 | yield enum_item 42 | 43 | 44 | if __name__ == "__main__": 45 | # layer_list_1 = [(0, 'c'), (1, 'p'), (2, 'c'), (3, 'c'), (4, 'p'), (5, 'c')] 46 | layer_list_2 = [(0, 'c'), (1, 'p'), (2, 'c'), (3, 'c')] 47 | layer_list_3 = [1, 2, 3, 4] 48 | for idx, item in enumerate(partition_to_k(layer_list_3, 3, False), 1): 49 | print(item) -------------------------------------------------------------------------------- /fpga_cnn/src/resize_image.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _RESIZE_IMAGE_H_ 3 | #define _RESIZE_IMAGE_H_ 4 | 5 | #include 6 | template 7 | void resize_image(float(&x)[channels][height][width], int h, int w, float(&y)[channels][size][size]) { 8 | //(1) 9 | int w0 = w; 10 | int h0 = h; 11 | int w1 = size; 12 | int h1 = size; 13 | float fw = float(w0) / (w1); 14 | float fh = float(h0) / (h1); 15 | 16 | float image_max = 0.0; 17 | float image_min = x[0][0][0]; 18 | for (int i = 0; i < channels; i++) { 19 | for (int j = 0; j < height; j++) { 20 | for (int k = 0; k < width; k++) { 21 | if (x[i][j][k]>image_max) 22 | image_max = x[i][j][k]; 23 | if (x[i][j][k]para_offset_addr = para_offset_addr; 10 | this->weight_offset_addr = weight_offset_addr; 11 | this->data_in_offset_addr = data_in_offset_addr; 12 | this->ctrl_addr = ctrl_addr; 13 | } 14 | 15 | void acc_ctrl::write_weight( short int weight[][32], 16 | int weight_length) 17 | { 18 | cout <<"write weight start" << endl; 19 | int write_fd; 20 | int rc; 21 | int slot_id = 0; 22 | write_fd = fpga_dma_open_queue( FPGA_DMA_XDMA, slot_id,/*channel*/ 0, /*is_read*/ false); 23 | rc = fpga_dma_burst_write(write_fd, (uint8_t*)weight,weight_length,weight_offset_addr); 24 | if(write_fd >= 0) 25 | close(write_fd); 26 | cout <<"write weight finish" << endl; 27 | } 28 | 29 | void acc_ctrl::write_para( int* para_list, 30 | int para_length) 31 | { 32 | cout <<"write para start" << endl; 33 | int write_fd; 34 | int rc; 35 | int slot_id = 0; 36 | write_fd = fpga_dma_open_queue( FPGA_DMA_XDMA, slot_id,/*channel*/ 0, /*is_read*/ false); 37 | rc = fpga_dma_burst_write(write_fd, (uint8_t*)para_list,para_length,para_offset_addr); 38 | if(write_fd >= 0) 39 | close(write_fd); 40 | cout 
<<"write para finish" << endl; 41 | } 42 | 43 | void acc_ctrl::write_data( short int feature[][32], 44 | int feature_length) 45 | { 46 | cout <<"write data start"<= 0) 54 | close(write_fd); 55 | cout <<"write data finish"<= 0) 82 | { 83 | rc = fpga_pci_detach(pci_bar_handle); 84 | if (rc) 85 | { 86 | cout <<"Failure while detaching from the fpga."<= 0) 102 | { 103 | close(read_fd); 104 | } 105 | cout << "read finish" << endl; 106 | } 107 | 108 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/api_lib/src/acc_ctrl.cpp: -------------------------------------------------------------------------------- 1 | #include "acc_ctrl.h" 2 | 3 | using namespace std; 4 | 5 | const char *device_h2c = DEVICE_H2C; 6 | const char *device_c2h = DEVICE_C2H; 7 | const char *device_ctrl = DEVICE_CTRL; 8 | 9 | acc_ctrl::acc_ctrl( uint32_t para_offset_addr, 10 | uint32_t weight_offset_addr, 11 | uint32_t data_in_offset_addr, 12 | off_t ctrl_addr 13 | ) 14 | { 15 | this->para_offset_addr = para_offset_addr; 16 | this->weight_offset_addr = weight_offset_addr; 17 | this->data_in_offset_addr = data_in_offset_addr; 18 | this->ctrl_addr = ctrl_addr; 19 | } 20 | 21 | void acc_ctrl::write_weight( short int weight[][32], 22 | int weight_length) 23 | { 24 | cout <<"write weight start" << endl; 25 | int fpga_fd; 26 | int rc; 27 | off_t off; 28 | fpga_fd= open(device_h2c,O_RDWR); 29 | assert(fpga_fd >= 0); 30 | off = lseek(fpga_fd,weight_offset_addr,SEEK_SET); 31 | rc = write(fpga_fd, weight, weight_length); 32 | assert(rc == weight_length); 33 | close(fpga_fd); 34 | cout <<"write weight finish" << endl; 35 | } 36 | 37 | 38 | void acc_ctrl::write_para( int* para_list, 39 | int para_length) 40 | { 41 | cout <<"write para start" << endl; 42 | int fpga_fd; 43 | int rc; 44 | off_t off; 45 | fpga_fd= open(device_h2c,O_RDWR); 46 | assert(fpga_fd >= 0); 47 | off = lseek(fpga_fd,para_offset_addr,SEEK_SET); 48 | rc = write(fpga_fd, para_list, para_length); 49 | assert(rc == para_length); 50 | close(fpga_fd); 51 | cout <<"write para finish" << endl; 52 | } 53 | 54 | 55 | void acc_ctrl::write_data( short int feature[][32], 56 | int feature_length) 57 | { 58 | cout <<"write data start"<= 0); 64 | off = lseek(fpga_fd,data_in_offset_addr,SEEK_SET); 65 | rc = write(fpga_fd, feature, feature_length); 66 | assert(feature_length); 67 | close(fpga_fd); 68 | cout <<"write data finish"<= 0); 83 | cout <<"fpga_ctrl_open"<= 0); 128 | off = lseek(fpga_fd,0xC0000000,SEEK_SET); 129 | cout << "read finish00" << endl; 130 | rc = read(fpga_fd, feature, feature_length); 131 | assert(rc == feature_length); 132 | close(fpga_fd); 133 | cout << "read finish" << endl; 134 | } -------------------------------------------------------------------------------- /fpga_cnn/testbench/print_array.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by yaochen on 22/10/18. 
3 | // 4 | 5 | #ifndef _PRINT_ARRAY_H_ 6 | #define _PRINT_ARRAY_H_ 7 | 8 | #include 9 | #include "config.h" 10 | 11 | using namespace std; 12 | 13 | int print_array_3d(string array_name, int channel, int r_dim, int c_dim, data_type_itf *array){ 14 | 15 | ap_fixed<16,10> print_tmp = 0; 16 | cout << array_name << endl; 17 | 18 | for (int ch = 0; ch < channel; ch++) { 19 | cout << "print output channel: " << ch << endl; 20 | for (int j = 0; j < r_dim; j++) { 21 | for (int i = 0; i < c_dim; i++) { 22 | print_tmp.range(15,0) = array[j*r_dim + i].range(ch*16+15, ch*16); 23 | #if _HLS_MODE_ 24 | cout << print_tmp << " "; 25 | #else 26 | cout << print_tmp << " "; 27 | #endif 28 | } 29 | cout << endl; 30 | } 31 | cout << endl; 32 | } 33 | cout << endl; 34 | cout << endl; 35 | 36 | return 0; 37 | } 38 | 39 | int squeeze_input(string array_name, 40 | int channel, 41 | int r_dim, 42 | int c_dim, 43 | data_type_o *i_array, 44 | data_type_itf *o_array, 45 | bool dis_enable) { 46 | 47 | ap_fixed<16,10> print_tmp = 0; 48 | cout << array_name << endl; 49 | 50 | for (int ch = 0; ch < channel; ch++) { 51 | for (int i = 0; i < r_dim; i++) { 52 | for (int j = 0; j < c_dim; j++) { 53 | for (int wd = 0; wd <= ch && wd < 32; wd++) { 54 | 55 | o_array[i * r_dim + j].range((wd + 1) * 16 - 1, wd * 16) = i_array[i * r_dim + j].range(15, 0); 56 | #if _C_DEBUG_MODE_ 57 | if (dis_enable) {cout << setw(3) << i_array[i * 28 + j] << " ";} 58 | #else 59 | if (dis_enable) {cout << i_array[i * 28 + j] << " ";} 60 | #endif 61 | } 62 | } 63 | if (dis_enable) {cout << endl;} 64 | } 65 | if (dis_enable) {cout << endl;} 66 | } 67 | if (dis_enable) {cout << endl;} 68 | 69 | return 0; 70 | } 71 | 72 | int squeeze_weight(string array_name, 73 | int i_channel, 74 | int o_channel, 75 | int kernel_size, 76 | data_type_w *i_data, 77 | data_type_itf *o_array, 78 | bool dis_enable) { 79 | 80 | ap_fixed<16,10> print_tmp = 0; 81 | cout << array_name << endl; 82 | 83 | if (dis_enable) { 84 | cout << "Printing squeezed weight data ----------------------------- " << endl; 85 | } 86 | 87 | for (int i = 0; i < i_channel; i++){ 88 | for (int j = 0; j < o_channel; j++) { 89 | for (int k1 = 0; k1 < kernel_size; k1++) { 90 | for (int k2 = 0; k2 < kernel_size; k2++) { 91 | data_type_w w = *(i_data + i * o_channel * kernel_size * kernel_size + 92 | j * kernel_size * kernel_size + k1 * kernel_size + k2); 93 | // for(int ch = 0; ch < 32 && ch < i_channel; ch++){ 94 | // ap_fixed<16,10> w = 95 | // i_data[ch*o_channel*kernel_size*kernel_size + j*kernel_size*kernel_size + k1*kernel_size + k2]; 96 | o_array[i / 32 * o_channel * kernel_size * kernel_size + j * kernel_size * kernel_size + 97 | k1 * kernel_size + k2].range(i * 16 + 15, i * 16) = w.range(15, 0); 98 | if (dis_enable) { cout << w << " "; } 99 | } 100 | }if (dis_enable) { cout << endl; } 101 | }if (dis_enable) {cout << endl;} 102 | } 103 | if (dis_enable) {cout << endl;} 104 | 105 | if (dis_enable) { 106 | cout << "Finished printing squeezed weight data ----------------------------- " << endl; 107 | } 108 | 109 | return 0; 110 | 111 | } 112 | 113 | 114 | #endif //_PRINT_ARRAY_H_ 115 | -------------------------------------------------------------------------------- /netGenerator/dse/param_write.py: -------------------------------------------------------------------------------- 1 | def conv_param_write(conv_param_list, store_file): 2 | 3 | with open(store_file, "w") as wf: 4 | 5 | for i in range(0, len(conv_param_list)): 6 | for j in range(0, len(conv_param_list[i][1])): 7 | conv_param = "conv, 
data_type_itf, Tparam, data_type, data_type_w, data_type_o, " \ 8 | + str(conv_param_list[i][1][j][0]) + ", " \ 9 | + str(conv_param_list[i][1][j][1]) + ", " \ 10 | + str(conv_param_list[i][1][j][2]) + ", " \ 11 | + str(conv_param_list[i][1][j][2]) + ", 5, 5, 32, 32, 32" 12 | wf.write(conv_param + "\n") 13 | wf.close() 14 | 15 | 16 | def pool_param_write(pool_param_list, store_file): 17 | 18 | with open(store_file, "a+") as wf: 19 | # for i in range(0, len(parameters)): 20 | # for j in range(0, len(parameters[i][1])): 21 | # pool_param = "max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, " \ 22 | # + str(parameters[i][1][j][0]) + ", " \ 23 | # + str(parameters[i][1][j][1]) + ", " \ 24 | # + str(parameters[i][1][j][2]) + ", 2, 3" 25 | # wf.write(pool_param + "\n") 26 | for i in range(0, len(pool_param_list)): 27 | pool_param = "max_pool, data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3" 28 | wf.write(pool_param + "\n") 29 | wf.close() 30 | 31 | 32 | def layer_acc_param_write(layer_acc_list, store_file): 33 | with open(store_file, "a+") as wf: 34 | conv_core_counter = 0 35 | pool_core_counter = 0 36 | for i in range(0, len(layer_acc_list)): 37 | pool_flag = False 38 | for j in range(0, len(layer_acc_list[i])): 39 | print(layer_acc_list[i]) 40 | print(layer_acc_list[i][j]) 41 | print(layer_acc_list[i][j][-1]) 42 | if layer_acc_list[i][j][-1] == True: 43 | pool_flag = True 44 | print("Found layer with pooling") 45 | # else: 46 | # pool_flag = False 47 | if pool_flag == True: 48 | layer_acc = "conv_pool, " + str(conv_core_counter) + "," + str(conv_core_counter) + "," + str( 49 | pool_core_counter) + "\n" 50 | conv_core_counter += 1 51 | pool_core_counter += 1 52 | else: 53 | layer_acc = "conv_pool, " + str(conv_core_counter) + "," + str(conv_core_counter) + "\n" 54 | conv_core_counter += 1 55 | wf.write(layer_acc) 56 | 57 | 58 | #TODO: write the sub_net function parameters into file 59 | def sub_param_write(subn_param_list, store_file): 60 | 61 | with open(store_file, "a+") as wf: 62 | for i in range(0, len(subn_param_list)): 63 | print(subn_param_list[i]) 64 | subn_param = "sub_net_" 65 | for j in range(0, 4): 66 | subn_param += str(subn_param_list[i][j])+"," 67 | for j in range(0, len(subn_param_list[i][4])): 68 | if j < len(subn_param_list[i][4])-1: 69 | subn_param += str(int(subn_param_list[i][4][j]))+"," 70 | else: 71 | subn_param += str(int(subn_param_list[i][4][j])) 72 | wf.write(subn_param + "\n") 73 | wf.close() 74 | 75 | 76 | def generate_param_file(conv_param_list, pool_param_list, layer_acc_list, subn_param_list, store_file): 77 | 78 | # with open(store_file, "w") as wf: 79 | 80 | # write the conv parameters 81 | # for i in range(0, len(parameters)): 82 | # for j in range(0, len(parameters[i][1])): 83 | conv_param_write(conv_param_list, store_file) 84 | pool_param_write(pool_param_list, store_file) 85 | layer_acc_param_write(layer_acc_list, store_file) 86 | sub_param_write(subn_param_list, store_file) 87 | 88 | 89 | # write the pooling parameters 90 | 91 | # wirte conv_pool function parameters 92 | 93 | 94 | if __name__ == "__main__": 95 | generate_param_file(conv_param_list, pool_param_list, layer_acc_list, subn_param_list, "acc_ins_params.txt") 96 | # conv_param_write(parameters) -------------------------------------------------------------------------------- /fpga_cnn/src/activation_functions.h: -------------------------------------------------------------------------------- 1 | //This file contains the popular activation functions 
used in CNNs 2 | //TODO: modify the commented function to be compatible with gcc compilation. 3 | //TODO: change the functions into class based expression. 4 | 5 | #ifndef _ACTIVATION_FUNCTIONS_H_ 6 | #define _ACTIVATION_FUNCTIONS_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "data_type.h" 12 | //using namespace std; 13 | /* 14 | identity = i; 15 | sigmod = s; 16 | relu = r; 17 | leaky_relu = l; 18 | elu = e; 19 | tan_h = t; 20 | tan_hp1m2 = h; 21 | */ 22 | template 23 | T relu(T data){ 24 | // return (T(0) >= data ? T(0) : data); 25 | if (data > T(0)){ 26 | //cout << "data in range " << data << " ====> " << data << endl; 27 | return data; 28 | } 29 | else { 30 | //cout << "data out range " << data << " ====> " << 0 << endl; 31 | return T(0); 32 | } 33 | } 34 | 35 | template 36 | ap_fixed RELU(ap_fixed data){ 37 | if (data > 0){ 38 | return data; 39 | } 40 | else 41 | return 0; 42 | }; 43 | 44 | ap_fixed<64, 32> Relu_64(ap_fixed<64,32> data){ 45 | if (data > 0){ 46 | return data; 47 | } 48 | else 49 | return 0; 50 | }; 51 | 52 | ap_fixed<32,16> Relu_32(ap_fixed<32,16> data){ 53 | if(data > 0) return data; 54 | else return 0; 55 | }; 56 | ap_fixed<24,16> Relu_24(ap_fixed<24,16> data){ 57 | if(data > 0) return data; 58 | else return 0; 59 | }; 60 | ap_fixed<20,16> Relu_20(ap_fixed<20,16> data){ 61 | if(data > 0) return data; 62 | else return 0; 63 | }; 64 | ap_fixed<16,12> Relu_16(ap_fixed<16,12> data){ 65 | if(data > 0) return data; 66 | else return 0; 67 | }; 68 | 69 | ap_fixed<8,4> Relu_8(ap_fixed<8,4> data){ 70 | if(data > 0) return data; 71 | else return 0; 72 | }; 73 | 74 | float f(char type, float data) { 75 | if (type == 'i') // identity 76 | { 77 | return data; 78 | } 79 | else if (type == 's') { // sigmod 80 | return float(1) / (float(1) + exp(-data)); 81 | } 82 | else if (type == 'r') { //relu 83 | return ((float(0) > data) ? float(0) : data); 84 | } 85 | else if (type == 'l') { //leak_relu 86 | return (data > float(0)) ? data : float(0.01) * data; 87 | } 88 | else if (type == 'e') { // elu 89 | return (data float(0) ? float(1) : float(0); 128 | } 129 | else if (type == "leaky_relu") { 130 | return data > float(0) ? float(1) : float(0.01); 131 | } 132 | else if (type == "elu") { 133 | return (data > float(0) ? float(1) : (float(1) + data)); 134 | } 135 | else if (type == "tan_h") { 136 | return float(1) - data*data; 137 | } 138 | else if (type == "tan_hp1m2") { 139 | return 2 * data *(float(1) - data); 140 | } 141 | else return false; 142 | } 143 | */ 144 | //vec_t df(string& type, const vec_t& y, uint index) { 145 | // vec_t v(0, 0); 146 | // if (type == "softmax") 147 | // { 148 | // vec_t v(y.size(), 0); 149 | // for (uint i = 0; i < y.size(); i++) 150 | // v[i] = (i == index) ? 
y[index] * (float(1) - y[index]) : -y[i] * y[index]; 151 | // 152 | // return v; 153 | // } 154 | // else return v; 155 | //} 156 | 157 | #endif 158 | -------------------------------------------------------------------------------- /fpga_cnn/src/construct_net.h: -------------------------------------------------------------------------------- 1 | #ifndef _CONSTRUCT_NET_H_ 2 | #define _CONSTRUCT_NET_H_ 3 | 4 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 5 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 6 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 7 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 8 | 9 | #include "config.h" 10 | #include 11 | #include 12 | #include "acc_instance.h" 13 | using namespace std; 14 | 15 | void sub_net_0( 16 | Tparam param_port[1024], 17 | ap_fixed<32,26> bias_in[4096], 18 | data_type_itf weight_in[131072], 19 | data_type_itf data_in_0[65536], 20 | data_type_itf data_out_0[32768], 21 | data_type_itf data_in_1[32768], 22 | data_type_itf data_out_1[4096], 23 | int select 24 | ) 25 | { 26 | 27 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 28 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 29 | 30 | #pragma HLS INTERFACE s_axilite port=param_port bundle=CRTL_BUS 31 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth=1024 bundle=PARAM_IN 32 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 33 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave depth=4096 bundle=BIAS_IN 34 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 35 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth=131072 bundle=WEIGHT_IN 36 | 37 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 38 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth=65536 bundle=DATA_IN 39 | 40 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 41 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 42 | #pragma HLS INTERFACE bram port=data_out_1 43 | 44 | 45 | int acc0_mem_inport_offset = 0; 46 | int acc0_mem_outport_offset = 0; 47 | int acc1_mem_inport_offset = 0; 48 | int acc1_mem_outport_offset = 0; 49 | 50 | 51 | if (select == 0) 52 | { 53 | acc0_mem_inport_offset = 0; 54 | acc0_mem_outport_offset = 0; 55 | acc1_mem_inport_offset = 16384; 56 | acc1_mem_outport_offset = 2048; 57 | } 58 | else 59 | { 60 | acc0_mem_inport_offset = 16384; 61 | acc0_mem_outport_offset = 16384; 62 | acc1_mem_inport_offset = 0; 63 | acc1_mem_outport_offset = 0; 64 | } 65 | 66 | conv_pool_acc_0(param_port, bias_in, weight_in, data_in_0 + acc0_mem_inport_offset, data_out_0 + acc0_mem_outport_offset); 67 | conv_pool_acc_1(param_port + 256, bias_in+256, weight_in, data_in_1 + acc1_mem_inport_offset, data_out_1 + acc1_mem_outport_offset); 68 | 69 | }; 70 | 71 | 72 | void sub_net_1( 73 | Tparam param_port[1024], 74 | ap_fixed<32,26> bias_in[4096], 75 | data_type_itf weight_in[131072], 76 | data_type_itf data_in_0[65536], 77 | data_type_itf data_out_0[32768], 78 | data_type_itf data_in_1[32768], 79 | data_type_itf data_out_1[4096], 80 | int select 81 | ) 82 | { 83 | 84 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 85 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 86 | 87 | #pragma HLS INTERFACE s_axilite port=param_port bundle=CRTL_BUS 88 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth=1024 bundle=PARAM_IN 89 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 90 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave 
depth=4096 bundle=BIAS_IN 91 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 92 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth=131072 bundle=WEIGHT_IN 93 | 94 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 95 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth=65536 bundle=DATA_IN 96 | 97 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 98 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 99 | #pragma HLS INTERFACE bram port=data_out_1 100 | 101 | 102 | int acc0_mem_inport_offset = 0; 103 | int acc0_mem_outport_offset = 0; 104 | int acc1_mem_inport_offset = 0; 105 | int acc1_mem_outport_offset = 0; 106 | 107 | 108 | if (select == 0) 109 | { 110 | acc0_mem_inport_offset = 0; 111 | acc0_mem_outport_offset = 0; 112 | acc1_mem_inport_offset = 16384; 113 | acc1_mem_outport_offset = 2048; 114 | } 115 | else 116 | { 117 | acc0_mem_inport_offset = 16384; 118 | acc0_mem_outport_offset = 16384; 119 | acc1_mem_inport_offset = 0; 120 | acc1_mem_outport_offset = 0; 121 | } 122 | 123 | conv_pool_acc_0(param_port, bias_in, weight_in, data_in_0 + acc0_mem_inport_offset, data_out_0 + acc0_mem_outport_offset); 124 | conv_pool_acc_1(param_port + 256, bias_in+256, weight_in, data_in_1 + acc1_mem_inport_offset, data_out_1 + acc1_mem_outport_offset); 125 | 126 | }; 127 | 128 | 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /netGenerator/alex.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Input" 5 | top: "data" 6 | input_param { shape: { dim: 10 dim: 3 dim: 227 dim: 227 } } 7 | } 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 96 23 | kernel_size: 11 24 | stride: 4 25 | } 26 | } 27 | layer { 28 | name: "relu1" 29 | type: "ReLU" 30 | bottom: "conv1" 31 | top: "conv1" 32 | } 33 | layer { 34 | name: "norm1" 35 | type: "LRN" 36 | bottom: "conv1" 37 | top: "norm1" 38 | lrn_param { 39 | local_size: 5 40 | alpha: 0.0001 41 | beta: 0.75 42 | } 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "norm1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | layer { 56 | name: "conv2" 57 | type: "Convolution" 58 | bottom: "pool1" 59 | top: "conv2" 60 | param { 61 | lr_mult: 1 62 | decay_mult: 1 63 | } 64 | param { 65 | lr_mult: 2 66 | decay_mult: 0 67 | } 68 | convolution_param { 69 | num_output: 256 70 | pad: 2 71 | kernel_size: 5 72 | group: 2 73 | } 74 | } 75 | layer { 76 | name: "relu2" 77 | type: "ReLU" 78 | bottom: "conv2" 79 | top: "conv2" 80 | } 81 | layer { 82 | name: "norm2" 83 | type: "LRN" 84 | bottom: "conv2" 85 | top: "norm2" 86 | lrn_param { 87 | local_size: 5 88 | alpha: 0.0001 89 | beta: 0.75 90 | } 91 | } 92 | layer { 93 | name: "pool2" 94 | type: "Pooling" 95 | bottom: "norm2" 96 | top: "pool2" 97 | pooling_param { 98 | pool: MAX 99 | kernel_size: 3 100 | stride: 2 101 | } 102 | } 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 1 110 | decay_mult: 1 111 | } 112 | param { 113 | lr_mult: 2 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 384 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 
| layer { 123 | name: "relu3" 124 | type: "ReLU" 125 | bottom: "conv3" 126 | top: "conv3" 127 | } 128 | layer { 129 | name: "conv4" 130 | type: "Convolution" 131 | bottom: "conv3" 132 | top: "conv4" 133 | param { 134 | lr_mult: 1 135 | decay_mult: 1 136 | } 137 | param { 138 | lr_mult: 2 139 | decay_mult: 0 140 | } 141 | convolution_param { 142 | num_output: 384 143 | pad: 1 144 | kernel_size: 3 145 | group: 2 146 | } 147 | } 148 | layer { 149 | name: "relu4" 150 | type: "ReLU" 151 | bottom: "conv4" 152 | top: "conv4" 153 | } 154 | layer { 155 | name: "conv5" 156 | type: "Convolution" 157 | bottom: "conv4" 158 | top: "conv5" 159 | param { 160 | lr_mult: 1 161 | decay_mult: 1 162 | } 163 | param { 164 | lr_mult: 2 165 | decay_mult: 0 166 | } 167 | convolution_param { 168 | num_output: 256 169 | pad: 1 170 | kernel_size: 3 171 | group: 2 172 | } 173 | } 174 | layer { 175 | name: "relu5" 176 | type: "ReLU" 177 | bottom: "conv5" 178 | top: "conv5" 179 | } 180 | layer { 181 | name: "pool5" 182 | type: "Pooling" 183 | bottom: "conv5" 184 | top: "pool5" 185 | pooling_param { 186 | pool: MAX 187 | kernel_size: 3 188 | stride: 2 189 | } 190 | } 191 | layer { 192 | name: "fc6" 193 | type: "InnerProduct" 194 | bottom: "pool5" 195 | top: "fc6" 196 | param { 197 | lr_mult: 1 198 | decay_mult: 1 199 | } 200 | param { 201 | lr_mult: 2 202 | decay_mult: 0 203 | } 204 | inner_product_param { 205 | num_output: 4096 206 | } 207 | } 208 | layer { 209 | name: "relu6" 210 | type: "ReLU" 211 | bottom: "fc6" 212 | top: "fc6" 213 | } 214 | layer { 215 | name: "drop6" 216 | type: "Dropout" 217 | bottom: "fc6" 218 | top: "fc6" 219 | dropout_param { 220 | dropout_ratio: 0.5 221 | } 222 | } 223 | layer { 224 | name: "fc7" 225 | type: "InnerProduct" 226 | bottom: "fc6" 227 | top: "fc7" 228 | param { 229 | lr_mult: 1 230 | decay_mult: 1 231 | } 232 | param { 233 | lr_mult: 2 234 | decay_mult: 0 235 | } 236 | inner_product_param { 237 | num_output: 4096 238 | } 239 | } 240 | layer { 241 | name: "relu7" 242 | type: "ReLU" 243 | bottom: "fc7" 244 | top: "fc7" 245 | } 246 | layer { 247 | name: "drop7" 248 | type: "Dropout" 249 | bottom: "fc7" 250 | top: "fc7" 251 | dropout_param { 252 | dropout_ratio: 0.5 253 | } 254 | } 255 | layer { 256 | name: "fc8" 257 | type: "InnerProduct" 258 | bottom: "fc7" 259 | top: "fc8" 260 | param { 261 | lr_mult: 1 262 | decay_mult: 1 263 | } 264 | param { 265 | lr_mult: 2 266 | decay_mult: 0 267 | } 268 | inner_product_param { 269 | num_output: 1000 270 | } 271 | } 272 | layer { 273 | name: "prob" 274 | type: "Softmax" 275 | bottom: "fc8" 276 | top: "prob" 277 | } 278 | -------------------------------------------------------------------------------- /netGenerator/dse/global_search.py: -------------------------------------------------------------------------------- 1 | 2 | import helping_functions 3 | import sys 4 | import math 5 | from model_partition import partition_to_k 6 | from model_split import model_split_by_list 7 | from model_split import gop_calculate 8 | import pprint 9 | import threading 10 | import multiprocessing 11 | import time 12 | from local_search import local_search 13 | 14 | result_Q = multiprocessing.Queue() 15 | PROCESS_NUM = 4 16 | 17 | 18 | class SearchProcess(multiprocessing.Process): 19 | def __init__(self, param_v, processIdx, result_Q): 20 | multiprocessing.Process.__init__(self) 21 | self.layer_list = param_v[0] 22 | self.acc_cluster_num = param_v[1] 23 | self.conv_N = param_v[2] 24 | self.conv_M = param_v[3] 25 | self.conv_r = param_v[4] 26 | self.conv_R = 
param_v[5] 27 | self.conv_K = param_v[6] 28 | self.conv_S = param_v[7] 29 | self.flag = param_v[8] 30 | self.overall_lat = param_v[9] 31 | self.processIdx = processIdx 32 | self.result_Q = result_Q 33 | 34 | def run(self): 35 | 36 | start = time.time() 37 | process_gop_list = [] 38 | process_item_list = [] 39 | process_util_list = [] 40 | process_pair_list = [] 41 | 42 | search_counter = 0 43 | 44 | print("Process " + str(self.processIdx) + " starts global search.") 45 | 46 | for idx, item in enumerate(partition_to_k(self.layer_list, self.acc_cluster_num, False), 0): 47 | if idx % PROCESS_NUM == self.processIdx: 48 | sub_gop_list = [] 49 | search_counter = search_counter + 1 50 | sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ 51 | = model_split_by_list(self.conv_N, self.conv_M, self.conv_r, self.conv_R, self.conv_K, self.conv_S, self.flag, item) 52 | sub_pair_list, sub_lat_list, sub_util_list = \ 53 | local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 54 | 55 | for i in range(0, len(sub_conv_N)): 56 | sub_gop_list.append(gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i], sub_conv_K[i])) 57 | 58 | if max(sub_lat_list) < self.overall_lat: 59 | overall_lat = max(sub_lat_list) 60 | if len(process_pair_list) < 6: 61 | process_item_list.append(item) 62 | process_pair_list.append(sub_pair_list) 63 | # process_pair_list.append([overall_lat]) 64 | process_util_list.append([overall_lat]) 65 | process_gop_list.append(sub_gop_list) 66 | # process_util_list.append(sub_util_list) 67 | # process_pair_list.append(sub_util_list) 68 | # else: 69 | # max_among_mins = process_pair_list.index(max(overall_lat)) 70 | # process_pair_list.remove(process_pair_list[max_among_mins]) 71 | # process_pair_list.append(sub_pair_list) 72 | # process_pair_list.append([overall_lat]) 73 | # process_pair_list.append(sub_util_list) 74 | 75 | # print "For set ID: " + str(idx) + ", the final explored points = ", search_counter 76 | 77 | if len(process_pair_list) != 0: 78 | self.result_Q.put((process_pair_list, process_item_list, process_gop_list, process_util_list)) 79 | 80 | end = time.time() 81 | print("Thread ", self.processIdx, " :", (end - start)) 82 | 83 | 84 | def global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat): 85 | """ 86 | :param layer_list: a list containing each layer information in the form of a tuple (layer index, layer name). 
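        (candidate partitions of layer_list into acc_cluster_num groups are enumerated
        by partition_to_k() and scored in parallel by PROCESS_NUM worker processes,
        each process handling every PROCESS_NUM-th partition)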
87 | :param acc_cluster_num: 88 | :param conv_N: 89 | :param conv_M: 90 | :param conv_r: 91 | :param conv_R: 92 | :param conv_K: 93 | :param conv_S: 94 | :param flag: 95 | :param pair_list: 96 | :param overall_lat: 97 | :return: 98 | """ 99 | sub_conv_N = [] 100 | sub_conv_M = [] 101 | sub_conv_r = [] 102 | sub_conv_R = [] 103 | sub_conv_K = [] 104 | sub_conv_S = [] 105 | sub_flag = [] 106 | sub_pair_list = [] 107 | sub_lat_list = [] 108 | sub_util_list = [] 109 | 110 | gop_list = [] 111 | item_list = [] 112 | util_list = [] 113 | pair_list = [] 114 | 115 | processes = [] 116 | for i in range(PROCESS_NUM): 117 | p = SearchProcess((layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat), i, result_Q) 118 | processes.append(p) 119 | 120 | for p in processes: 121 | p.start() 122 | 123 | for p in processes: 124 | p.join() 125 | 126 | results = list() 127 | while not result_Q.empty(): 128 | results.append(result_Q.get()) 129 | for item in results: 130 | pair_list = pair_list + item[0] 131 | item_list = item_list + item[1] 132 | gop_list = gop_list + item[2] 133 | util_list = util_list + item[3] 134 | 135 | return pair_list, item_list, gop_list, util_list 136 | -------------------------------------------------------------------------------- /fpga_cnn/testbench/fc_validate.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "fc_validate.h" 6 | 7 | using namespace std; 8 | 9 | fc_validate::fc_validate(int layer_num, int num_input, int num_output, int act) { 10 | int i, j, k; 11 | this->layer_num = layer_num; 12 | this->num_input = num_input; 13 | this->num_output = num_output; 14 | this->act = act; 15 | 16 | this->lnum_list[0] = layer_num; 17 | 18 | this->config_list[0] = num_input; 19 | this->config_list[1] = 0; 20 | this->config_list[2] = num_output; 21 | this->config_list[3] = 0; 22 | this->config_list[4] = 0; 23 | this->config_list[5] = 0; 24 | this->config_list[6] = 0; 25 | this->config_list[7] = 0; 26 | this->config_list[8] = 0; 27 | this->config_list[9] = 0; 28 | this->config_list[10] = 0; 29 | this->config_list[11] = 0; 30 | this->config_list[12] = 0; 31 | this->config_list[13] = 0; 32 | this->config_list[14] = 0; 33 | this->config_list[15] = 0; 34 | 35 | // ap_int<512> weight[32]; 36 | // ap_int<512> in_feature[32]; 37 | // ap_int<512> bias[32]; 38 | // ap_int<512> out_feature[32]; 39 | // ap_int<512> out_feature_software[32]; 40 | 41 | weight = new ap_int<512>[1024*1024]; 42 | in_feature = new ap_int<512>[1024]; 43 | // bias = new ap_fixed<32,26>[num_output]; 44 | out_feature = new ap_int<512>[1024]; 45 | out_feature_software = new ap_int<512>[1024]; 46 | 47 | 48 | // initial weight data with random numbers 49 | for(i = 0; i < (num_input/32)*num_output; i++){ 50 | for(j = 0; j < 32; j++){ 51 | weight[i].range(16*j+15, 16*j) = rand()%10 - 5; // rand()%10 - 5 52 | } 53 | } 54 | for(i = (num_input/32) * num_output; i < ((int)(ceil((float)num_input/32))) * num_output; i++) 55 | { 56 | for(j = 0 ; j < num_input % 32; j++) 57 | weight[i].range(15+16*j,16*j) = rand()%10 - 5; //rand()%10 - 5; 58 | for(j = num_input % 32 ; j < 32; j++) 59 | weight[i].range(15+16*j,16*j) = 0; 60 | } 61 | 62 | // initial input data with random numbers 63 | for(i = 0 ; i < num_input/32; i++){ 64 | for(j = 0 ; j < 32; j++){ 65 | in_feature[i].range(15+16*j,16*j) = (rand()%2 -1) * 64; //(rand()%2 -1) * 64 66 | } 67 | } 68 | for(i = num_input/32 ; i < (int)(ceil(float(num_input)/32)); i++){ 
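    // tail 512-bit word: each ap_int<512> packs 32 16-bit lanes, so when num_input is not
    // a multiple of 32 only the low (num_input % 32) lanes carry data; the rest are zeroed below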
69 | for(j = 0 ; j < num_input % 32; j++){ 70 | in_feature[i].range(15+16*j, 16*j) = (rand()%2 - 1) * 64; // (rand()%2 - 1) * 64 71 | } 72 | for(j = num_input % 32; j < 32; j++){ 73 | in_feature[i].range(15+16*j, 16*j) = 0; 74 | } 75 | } 76 | 77 | // initial bias data with random numbers 78 | for(i = 0; i < num_output/32; i++){ 79 | for(j = 0; j < 32; j++){ 80 | bias[i].range(15+16*j, 16*j) = 0*64; // (rand()%2) * 64 81 | } 82 | } 83 | } 84 | 85 | void fc_validate::print_feature_in(void){ 86 | int i,j; 87 | cout << "fc input feature:" << endl; 88 | for(i = 0 ; i < (int)(ceil(float(num_input)/32)); i++) 89 | { 90 | cout < 2 | #include 3 | #include 4 | #include "conv_validate.h" 5 | using namespace std; 6 | 7 | 8 | conv_validate::conv_validate(ap_uint<32>* param_list) 9 | { 10 | int i,j,k; 11 | int input_num; 12 | int input_feature_size; 13 | int output_num; 14 | 15 | this->param_list = param_list; 16 | input_num = param_list[16+0]; 17 | input_feature_size = param_list[16+3]; 18 | 19 | layer_num = param_list[0]; 20 | for(i = 0 ; i < 4096; i++) 21 | input_feature[i] = 0; 22 | for(i = 0 ; i < 2048; i++) 23 | weight[i] = 0; 24 | for(i = 0 ; i < 4096; i++) 25 | output_feature[i] = 0; 26 | for(i = 0 ; i < 1024; i++) 27 | bias[i] = 0; 28 | 29 | 30 | for(i = 0 ; i < 1;i++) 31 | { 32 | for(j = 0 ; j < 16 * 16;j++) 33 | input_feature[j].range(15+16*i,16*i) = 64; 34 | } 35 | 36 | 37 | 38 | 39 | for(i = 0 ; i < 9;i++) 40 | { 41 | weight[i].range(15,0) = 16; 42 | } 43 | 44 | 45 | for(i = 0 ; i < 9;i++) 46 | { 47 | weight[i].range(16+15,16+0) = 0; 48 | } 49 | 50 | for(i = 0 ; i < 9;i++) 51 | { 52 | weight[i].range(16*2+15,16*2+0) = 0; 53 | } 54 | 55 | 56 | 57 | 58 | // for(j = 0 ; j < 8; j++) 59 | // for(i = 0 ; i < 72;i++) 60 | // { 61 | // weight[i].range(15+16*j,0+16*j) = 64; 62 | // } 63 | 64 | 65 | 66 | // for(i = 18 ; i < 27;i++) 67 | // { 68 | // weight[i].range(15,0) = 64; 69 | // } 70 | } 71 | 72 | 73 | 74 | 75 | void conv_validate :: print_feature_in(void) 76 | { 77 | int i,j; 78 | cout << "input feature:" << endl; 79 | for(i = 0 ; i <12*12; i++) 80 | { 81 | cout < 2 | 3 | //Optional 4 | //#include "/opt/Xilinx/Vivado/2018.1/include/gmp.h" 5 | //#include "/opt/Xilinx/Vivado/2018.1/include/mpfr.h" 6 | 7 | #include "ap_fixed.h" 8 | #include "construct_net.h" 9 | #include "../testbench/conv_validate.h" 10 | #include "../testbench/pooling_validate.h" 11 | #include "../testbench/fc_validate.h" 12 | 13 | using namespace std; 14 | 15 | void construct_para(ap_uint<32>* para); 16 | void print_para_list(ap_uint<32>* para); 17 | 18 | int main() 19 | { 20 | int i,j,k; 21 | cout <<"Test Begin..."< para_list[512]; 24 | ap_int<512> data_temp[2048]; 25 | ap_int<512> data_temp2[2048]; 26 | ap_int<512> data_temp3[2048]; 27 | ap_int<512> data_temp4[2048]; 28 | ap_int<512> data_temp5[2048]; 29 | ap_int<512> data_temp6[2048]; 30 | 31 | for(i = 0 ; i < 512; i++) 32 | para_list[i] = 0; 33 | for(i = 0 ; i < 2048; i++) 34 | { 35 | data_temp[i] = 0; 36 | data_temp2[i] = 0; 37 | data_temp3[i] = 0; 38 | data_temp4[i] = 0; 39 | data_temp5[i] = 0; 40 | data_temp6[i] = 0; 41 | } 42 | 43 | 44 | construct_para(para_list); 45 | print_para_list(para_list); 46 | conv_validate conv_test(para_list); 47 | 48 | 49 | sub_net_0( 50 | para_list, //ap_uint<32> param_port[528], 51 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 52 | conv_test.weight, //data_type_itf weight_in[2048], 53 | conv_test.input_feature, //data_type_itf data_in_0[2048], 54 | data_temp, //data_type_itf data_out_0[2048], 55 | // data_temp, //data_type_itf 
data_in_1[2048], 56 | //conv_test.output_feature, //data_type_itf data_out_1[2048], 57 | // data_temp2, 58 | 0 //int select 59 | ); 60 | 61 | /* 62 | sub_net_1( 63 | para_list, //ap_uint<32> param_port[528], 64 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 65 | conv_test.weight, //data_type_itf weight_in[2048], 66 | conv_test.input_feature, //data_type_itf data_in_0[2048], 67 | data_temp, //data_type_itf data_out_0[2048], 68 | data_temp, //data_type_itf data_in_1[2048], 69 | data_temp2,//conv_test.output_feature, //data_type_itf data_out_1[2048], 70 | 1 //int select 71 | ); 72 | 73 | 74 | sub_net_2( 75 | para_list, //ap_uint<32> param_port[528], 76 | //conv_test.bias, //ap_fixed<32,26> bias_in[1024], 77 | conv_test.weight, //data_type_itf weight_in[2048], 78 | data_temp2, //data_type_itf data_in_0[2048], 79 | data_temp3, //data_type_itf data_out_0[2048], 80 | data_temp3, //data_type_itf data_in_1[2048], 81 | data_temp4,//conv_test.output_feature, //data_type_itf data_out_1[2048], 82 | 0 //int select 83 | ); 84 | */ 85 | 86 | cout <<"Test Finish"<* para) 92 | { 93 | int i; 94 | //0-1.layer_num 95 | para[0] = 1; 96 | //0-2.conv para 97 | para[16+0] = 1;//N 98 | para[16+1] = 3;//K 99 | para[16+2] = 1;//M 100 | para[16+3] = 16;//Rin 101 | para[16+4] = 16;//Cin 102 | para[16+5] = 16;//R 103 | para[16+6] = 16;//C 104 | para[16+7] = 1;//S 105 | para[16+8] = 1;//P 106 | para[16+9] = 1;//act 107 | para[16+10] = 0;//weight_offset 108 | para[16+11] = 0;//bias_offset 109 | para[16+12] = 0;//in_offset 110 | para[16+13] = 0;//out_offset 111 | para[16+14] = 0;//inport 112 | para[16+15] = 0; 113 | //0-3.conv para 114 | para[32+0] = 192;//N 115 | para[32+1] = 3;//K 116 | para[32+2] = 128;//M 117 | para[32+3] = 13;//Rin 118 | para[32+4] = 13;//Cin 119 | para[32+5] = 13;//R 120 | para[32+6] = 13;//C 121 | para[32+7] = 1;//S 122 | para[32+8] = 1;//P 123 | para[32+9] = 1;//act 124 | para[32+10] = 0;//weight_offset 125 | para[32+11] = 0;//bias_offset 126 | para[32+12] = 0;//in_offset 127 | para[32+13] = 512;//out_offset 128 | para[32+14] = 1;//inport 129 | para[32+15] = 0; 130 | //0-4.conv para 131 | para[48+0] = 1;//N 132 | para[48+1] = 3;//K 133 | para[48+2] = 1;//M 134 | para[48+3] = 16;//Rin 135 | para[48+4] = 16;//Cin 136 | para[48+5] = 16;//R 137 | para[48+6] = 16;//C 138 | para[48+7] = 1;//S 139 | para[48+8] = 1;//P 140 | para[48+9] = 1;//act 141 | para[48+10] = 0;//weight_offset 142 | para[48+11] = 0;//bias_offset 143 | para[48+12] = 512;//in_offset 144 | para[48+13] = 0;//out_offset 145 | para[48+14] = 1;//inport 146 | para[48+15] = 0; 147 | 148 | 149 | 150 | 151 | 152 | //1-1.layer_num 153 | para[256+0] = 1; 154 | //1-2.conv_para 155 | para[256+16+0] = 1; //N 156 | para[256+16+1] = 3; //K 157 | para[256+16+2] = 1; //M 158 | para[256+16+3] = 16; //Rin 159 | para[256+16+4] = 16; //Cin 160 | para[256+16+5] = 16; //R 161 | para[256+16+6] = 16; //C 162 | para[256+16+7] = 1; //S 163 | para[256+16+8] = 1; //P 164 | para[256+16+9] = 1; //act 165 | para[256+16+10] = 0; //weight_offset 166 | para[256+16+11] = 0; //bias_offset 167 | para[256+16+12] = 0; //in_offset 168 | para[256+16+13] = 0; //out_offset 169 | para[256+16+14] = 0; //inport 170 | para[256+16+15] = 0; 171 | 172 | //1-3.conv_para 173 | para[256+32+0] = 1; //N 174 | para[256+32+1] = 3; //K 175 | para[256+32+2] = 1; //M 176 | para[256+32+3] = 16; //Rin 177 | para[256+32+4] = 16; //Cin 178 | para[256+32+5] = 16; //R 179 | para[256+32+6] = 16; //C 180 | para[256+32+7] = 1; //S 181 | para[256+32+8] = 1; //P 182 | para[256+32+9] = 1; //act 183 | 
para[256+32+10] = 0; //weight_offset 184 | para[256+32+11] = 0; //bias_offset 185 | para[256+32+12] = 0; //in_offset 186 | para[256+32+13] = 512; //out_offset 187 | para[256+32+14] = 1; //inport 188 | para[256+32+15] = 0; 189 | 190 | //1-4.conv_para 191 | para[256+48+0] = 1; //N 192 | para[256+48+1] = 3; //K 193 | para[256+48+2] = 1; //M 194 | para[256+48+3] = 16; //Rin 195 | para[256+48+4] = 16; //Cin 196 | para[256+48+5] = 16; //R 197 | para[256+48+6] = 16; //C 198 | para[256+48+7] = 1; //S 199 | para[256+48+8] = 1; //P 200 | para[256+48+9] = 1; //act 201 | para[256+48+10] = 0; //weight_offset 202 | para[256+48+11] = 0; //bias_offset 203 | para[256+48+12] = 512; //in_offset 204 | para[256+48+13] = 0; //out_offset 205 | para[256+48+14] = 1; //inport 206 | para[256+48+15] = 0; 207 | 208 | 209 | 210 | 211 | 212 | 213 | } 214 | void print_para_list(ap_uint<32>* para) 215 | { 216 | int i,j; 217 | 218 | for(i = 0 ; i < 32; i++) 219 | { 220 | for(j = 0 ; j < 16; j++) 221 | cout << para[i*16+j] <<" "; 222 | cout << endl; 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /fpga_cnn/src/max_pool_acc_innerpp.h: -------------------------------------------------------------------------------- 1 | #ifndef _MAX_POOL_ACC_H_ 2 | #define _MAX_POOL_ACC_H_ 3 | 4 | #include 5 | #include 6 | #include "activation_functions.h" 7 | using namespace std; 8 | 9 | template 10 | class max_pool_acc { 11 | 12 | private: 13 | int pool_layer_number; 14 | 15 | public: 16 | max_pool_acc() : pool_layer_number(0) {pool_layer_number = 0;}; 17 | 18 | // Tn << 32 && N << 32 19 | void in_buf_load_512( 20 | ap_fixed<16,10> buf[][(pTr - 1) * pS_max + pK_max][(pTc - 1) * pS_max + pK_max], 21 | ap_int<512>* i_data, 22 | Tparam in_offset, 23 | Tparam n, Tparam r, Tparam c, Tparam S, Tparam K, Tparam P, 24 | Tparam R, Tparam C, Tparam N, Tparam R_IN, Tparam C_IN, Tparam TR, Tparam TC) 25 | { 26 | ap_int<512> data_tmp = 0; 27 | // valid data portion 28 | for (int j = r * S - P; j < r * S + TR - P; j++) 29 | { 30 | for (int k = c * S - P; k < c * S + TC - P; k++) 31 | { 32 | #pragma HLS PIPELINE 33 | for (int i = 0; i < pTn; i+=32) 34 | { 35 | #pragma HLS UNROLL 36 | if ((i + n >= N) || j < 0 || j >= (R_IN - 2 * P) || k < 0 || k >= (C_IN - 2 * P)) 37 | { 38 | for (int wr = 0; wr < pTn; wr++) 39 | { 40 | #pragma HLS UNROLL 41 | buf[i + wr][j - r * S + P][k - c * S + P] = pT(0); 42 | } 43 | } 44 | else 45 | { 46 | data_tmp = *(i_data + in_offset + (i + n)/32 * (R_IN - 2 * P) * (C_IN - 2 * P) + j * (R_IN - 2 * P) + k); 47 | for (int wr = 0; wr < pTn; wr++) 48 | { 49 | #pragma HLS UNROLL 50 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range((wr + 1) * 16 - 1, (wr) * 16); 51 | } 52 | } 53 | } 54 | } 55 | } 56 | } 57 | 58 | // Max pooling computation kernel 59 | void pool_engine(pT in_buf[][(pTr-1)*pS_max + pK_max][(pTc-1)*pS_max + pK_max], 60 | pG out_buf[][pTr][pTc], 61 | Tparam S, Tparam n, Tparam r, Tparam c, Tparam K, Tparam R, Tparam C, Tparam TR, Tparam TC) { 62 | if (n >= 0) { 63 | for (int i = 0; i < K; i++) { 64 | for (int j = 0; j < K; j++) { 65 | for (int tr = 0; tr < pTr && tr + r < R && (S * tr + i) < TR; tr++) { 66 | for (int tc = 0; tc < pTc && tc + c < C && (S * tc + j) < TC; tc++) { 67 | #pragma HLS PIPELINE 68 | for (int tn = 0; tn < pTn; tn++) { 69 | #pragma HLS UNROLL 70 | out_buf[tn][tr][tc] = (i == 0 && j == 0) ? in_buf[tn][S * tr][S * tc] 71 | : ((out_buf[tn][tr][tc] 72 | > in_buf[tn][S * tr + i][S * tc + j]) 73 | ? 
out_buf[tn][tr][tc] 74 | : in_buf[tn][S * tr + i][S * tc + j]); 75 | } 76 | } 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | // Ouput out_buf data to output interface 84 | void output_res_512(pG out_buf[][pTr][pTc], 85 | Itf *out_data, 86 | Tparam out_offset, 87 | Tparam n, Tparam r, Tparam c, Tparam N, Tparam R, Tparam C, bool act) { 88 | 89 | Itf out_tmp = 0; 90 | Itf ex_out_tmp = 0; 91 | pG tmp = 0; 92 | pG tmp_outbuf = 0; 93 | if (n >= 0) { 94 | for (int j = r; j < r + pTr && j < R; j++) 95 | { 96 | for (int k = c; k < c + pTc && k < C; k++) 97 | { 98 | #pragma HLS PIPELINE 99 | for (int i = 0; i < pTn; i += 32) 100 | { 101 | if(i < N - n) 102 | { 103 | for (int wr = 0; wr < (pTn<32?pTn:32); wr++) 104 | { 105 | #pragma HLS UNROLL 106 | tmp_outbuf = RELU(out_buf[i + wr][j - r][k - c]); 107 | out_tmp.range(16 * (wr + 1) - 1, 16 * wr) = tmp_outbuf.range(15,0); 108 | } 109 | *(out_data + out_offset + ((i + n)/32)*R*C + j*C + k) = out_tmp; 110 | } 111 | } 112 | } 113 | } 114 | } 115 | } 116 | ///////////////////////------------------conv accelerator----------------////////////////////////// 117 | void max_pool_layer_mbuf( 118 | Tparam R_IN, // input Row 119 | Tparam C_IN, // input column 120 | Tparam N, //input feature number 121 | Tparam K, //input kernel size 122 | Tparam R, // output Row 123 | Tparam C, // output column 124 | Tparam S, // stride size 125 | Tparam P, // padding size 126 | Tparam act, // activation function bit (1-- with act, 0--without act) 127 | Tparam i_offset, 128 | Tparam o_offset, 129 | Itf *i_data, 130 | Itf *o_data) 131 | { 132 | 133 | Tparam TR=0; 134 | Tparam TC=0; 135 | ap_fixed<16,10> in_buf_0[pTn][(pTr-1)*pS_max + pK_max][(pTc-1)*pS_max + pK_max]; 136 | ap_fixed<16,10> out_buf_0[pTn][pTr][pTc]; 137 | 138 | #pragma HLS ARRAY_PARTITION variable=in_buf_0 complete dim=1 139 | #pragma HLS ARRAY_PARTITION variable=out_buf_0 complete dim=1 140 | 141 | 142 | 143 | for(int r = 0; r < R; r += pTr) 144 | { 145 | for (int c = 0; c < C; c += pTc) 146 | { 147 | TR = ((r * S + (pTr - 1) * S + K) > R_IN ? (R_IN - r * S) : ((pTr - 1) * S + K)); 148 | TC = ((c * S + (pTc - 1) * S + K) > C_IN ? 
(C_IN - c * S) : ((pTc - 1) * S + K)); 149 | for (int n = 0; n < N ; n += pTn) 150 | { 151 | in_buf_load_512(in_buf_0, i_data, i_offset, n, r, c, S, K, P, R, C, N, R_IN, C_IN, TR, TC); 152 | pool_engine(in_buf_0, out_buf_0, S, n, r, c, K, R, C, TR, TC); 153 | output_res_512(out_buf_0, o_data, o_offset, n, r, c, N, R, C, act); 154 | } 155 | } 156 | } 157 | } 158 | }; 159 | #endif 160 | -------------------------------------------------------------------------------- /netGenerator/dse/tm_tn_multiAcc.py: -------------------------------------------------------------------------------- 1 | import helping_functions 2 | import sys 3 | import math 4 | from model_extract import model_extract 5 | from model_split import model_partition_ordered 6 | from model_split import model_split_unordered 7 | from model_split import gop_calculate 8 | from model_split import max_layer_dataout 9 | from model_split import model_split_by_label 10 | from model_split import model_split_by_list 11 | from model_split import model_partition_ordered 12 | # from cluster import clusters_layers_kmeans 13 | from model_partition import partition 14 | from model_partition import partition_to_k 15 | from global_search import global_search 16 | from local_search import single_item_search 17 | from local_search import model_partition_by_gop 18 | from local_search import local_search 19 | from local_search import per_die_config_dse 20 | from local_search import per_die_config_dse_multiAcc 21 | from local_search import per_die_config_dse_multiAcc_flex 22 | from local_search import conv_net_perf 23 | from param_write import generate_param_file 24 | from local_search import flatten 25 | from task_analysis import acc_task_analysis 26 | from task_analysis import subnet_task_analysis 27 | import time 28 | 29 | 30 | def print_line(stage_name): 31 | if stage_name == "line": 32 | print("-" * int(math.ceil((int(80))))) 33 | else: 34 | print("\n") 35 | print("-" * int(math.ceil((int(80) - len(stage_name))/2)), stage_name, "-" * int(math.ceil((int(80) - len(stage_name))/2))) 36 | 37 | def multiAcc_dse(): 38 | # define the network parameter containers 39 | conv_N = [] 40 | conv_M = [] 41 | conv_r = [] 42 | conv_R = [] 43 | conv_K = [] 44 | conv_S = [] 45 | flag = [] 46 | cut_flag = [] 47 | pool_N = [] 48 | 49 | sub_conv_N = [] 50 | sub_conv_M = [] 51 | sub_conv_r = [] 52 | sub_conv_R = [] 53 | sub_conv_K = [] 54 | sub_conv_S = [] 55 | sub_flag = [] 56 | 57 | pair_1 = [] 58 | pair_2 = [] 59 | pair_3 = [] 60 | lat_1 = 0 61 | lat_2 = 0 62 | lat_3 = 0 63 | sub_lat_list = [] 64 | lat_list = [] 65 | 66 | util_1 = 0 67 | util_2 = 0 68 | util_3 = 0 69 | sub_util_list = [] 70 | util_list = [] 71 | 72 | OPs = 0 73 | sub_pair_list = [] 74 | item_list = [] 75 | pair_list = [] 76 | overall_lat = 60551400 77 | layer_list = [] 78 | gop_list = [] 79 | 80 | """ 81 | step 1: extract model from txt file with parameter no_include_fc / include_fc 82 | """ 83 | conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag, pool_N = model_extract('no_include_fc') 84 | # print("Extracted cut flag: ", cut_flag) 85 | # print("Extracted pool flag:", flag) 86 | OPs = gop_calculate(conv_N, conv_M, conv_R, conv_K) 87 | max_layerout = max_layer_dataout(conv_N, conv_M, conv_R, conv_K) 88 | 89 | print_line("Model extract phase") 90 | print("1: ", "Model extracted") 91 | print("1: ", "Overall convolution operation required: ", OPs) 92 | print("1: ", "Max layer output data: ", max_layerout) 93 | # print_line("Model split finish") 94 | 95 | """ 96 | step 2: randomly 
cluster, param k=4, layer label results are in item 97 | """ 98 | print_line("Model partition phase") 99 | for i in range(0, len(conv_N)): 100 | layer_list.append(i) 101 | # kmeans=clusters_layers_kmeans(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, 2) 102 | # print kmeans 103 | partition_location, diff_ratio = model_partition_by_gop(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag) 104 | print("2: layers extracted", conv_N) 105 | print("2: layers cutable ", cut_flag) 106 | print("2: partition location", partition_location) 107 | print("2: diff_ratio: ", diff_ratio) 108 | 109 | sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ 110 | =model_partition_ordered(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, partition_location[0]+1, partition_location[1]+1) 111 | # print "2: Best partition output: ", partition_location, diff_ratio 112 | print("2:", sub_conv_N) 113 | 114 | sub_gop_list = [] 115 | for i in range(0, len(sub_conv_N)): 116 | sub_gop_list.append(gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i], sub_conv_K[i])) 117 | 118 | 119 | print("2: gop of sub_nets", sub_gop_list) 120 | print("2: length of sub_conv_N", len(sub_conv_N[0]), len(sub_conv_N[1]), len(sub_conv_N[2])) 121 | print("2", sub_flag) 122 | print("2: length of sub_flag", len(sub_flag[0]), len(sub_flag[1]), len(sub_flag[2])) 123 | sub_pair_list = [] 124 | sub_lat_list = [] 125 | sub_util_list = [] 126 | 127 | print_line("Best Configuration Search") 128 | overall_start = time.time() 129 | # acc_cluster_num = 3 130 | # pair_list, item_list, gop_list, util_list = global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat) 131 | # pair_list, gop_list, util_list = per_die_config_dse_multiAcc(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, 132 | # sub_conv_S, sub_flag) 133 | pair_list = per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 134 | 135 | overall_end = time.time() 136 | 137 | print_line("DSE Output") 138 | print("Best Configuration Search Results for layer accelerators: ") 139 | for i in range(0, len(pair_list)): 140 | print(pair_list[i]) 141 | 142 | acc_task_list, total_acc_num = acc_task_analysis(pair_list, sub_conv_N, sub_conv_M, sub_conv_r, \ 143 | sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 144 | 145 | print("Accelerator task list: ") 146 | for acc_num in range(0, len(acc_task_list)): 147 | print("acc core", acc_num, " task list: ", acc_task_list[acc_num]) 148 | 149 | print_line("Subnet Task Out") 150 | subnet_task_list = subnet_task_analysis(pair_list, acc_task_list, sub_conv_N, sub_conv_M, sub_conv_r, \ 151 | sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) 152 | print("sub net interface list:") 153 | for i in range(0, len(subnet_task_list)): 154 | print(subnet_task_list[i]) 155 | 156 | 157 | print_line("Write out configurations") 158 | print(len(pair_list), "sub-nets are generated") 159 | print(total_acc_num, "accelerators are written into the cofig file") 160 | generate_param_file(pair_list, pool_N, acc_task_list, subnet_task_list, "acc_ins_params.txt") 161 | 162 | print_line("netGen run time system info") 163 | print("Overall time cost:", overall_end - overall_start, "s") 164 | print_line("line") 165 | 166 | 167 | print_line("test") 168 | print(conv_net_perf(sub_conv_N[2], sub_conv_M[2], sub_conv_R[2], sub_conv_S[2], sub_conv_K[2], sub_flag[2], 8, 274, 37, 4, 4)) 169 | 170 | 171 | if __name__ == "__main__": 172 
| conv_N = multiAcc_dse() 173 | -------------------------------------------------------------------------------- /fpga_cnn/src/fc_acc_innerpp.h: -------------------------------------------------------------------------------- 1 | #ifndef _FC_ACC_H_ 2 | #define _FC_ACC_H_ 3 | 4 | #include 5 | #include 6 | #include "config.h" 7 | #include "activation_functions.h" 8 | 9 | #if _C_DEBUG_MODE_ 10 | #include 11 | #endif 12 | 13 | using namespace std; 14 | 15 | template 16 | class fc_acc 17 | { 18 | 19 | private: 20 | int fc_layer_number; 21 | 22 | public: 23 | fc_acc() : fc_layer_number(0) { // construction function with parameter checking 24 | if (iTm < 32 || iTn < 32){ 25 | if(iTm < 32) cout << "FC ACC: iTm is invalid, please check the iTm value to make sure it is >= 32 !!!" << endl; 26 | if(iTn < 32) cout << "FC ACC: iTn is invalid, please check the iTm value to make sure it is >= 32 !!!" << endl; 27 | } else { 28 | cout << "FC ACC: fc_acc is valid!" << endl; 29 | } 30 | } 31 | 32 | ////------------------------------C++ debugging functions---------------------------------------//// 33 | 34 | 35 | ////-----------------------------Accelerator Functions---------------------------------------//// 36 | // Load bias data 37 | void b_buf_load_512(W buf[], Itf *fc_layer_bias, int fc_b_offset, int m, int M) { 38 | Itf data_temp = 0; 39 | for (int i = 0; i < iTm; i+=32){ 40 | #pragma HLS PIPELINE 41 | data_temp = *(fc_layer_bias + fc_b_offset + (m+i)/32); 42 | cout << "index of bias memory : " << fc_b_offset + (m+i)/32 << endl; 43 | for (int wr = 0; wr < 32; wr++){ 44 | #pragma HLS UNROLL 45 | if (i+wr < iTm) { 46 | if (i+wr < M){ 47 | buf[i+wr].range(15,0) = data_temp.range((wr+1)*16-1, wr*16); 48 | // cout << "bias buffer[" << i + wr <<"] = "<< buf[i+wr] << endl; 49 | } else { 50 | buf[i+wr].range(15,0) = 0; 51 | } 52 | } 53 | } 54 | } 55 | } 56 | 57 | // Load input data 58 | void in_buf_load_512(T buf[iTn], 59 | Itf *fc_in_data, 60 | int fc_i_offset, int n, int N) { 61 | Itf data_temp = 0; 62 | for (int i = 0; i < iTn; i+=32) { 63 | #pragma HLS PIPELINE 64 | data_temp = *(fc_in_data + fc_i_offset + (n+i)/32); 65 | // cout << "index of in data memory : " << fc_i_offset + (n+i)/32 << endl; 66 | for (int wr = 0; wr < 32; wr++) { 67 | #pragma HLS UNROLL 68 | if(i + wr < iTn){ 69 | if(i+wr < N ){ 70 | buf[i+wr].range(15,0) = data_temp.range((wr+1)*16-1, wr*16); 71 | // cout << "data_buffer["<= N - iTn) { 132 | for (int i = 0; i < iTm && i < M; i += 32) { // iTm should always greater than 32, otherwise this will not work 133 | for (int wr = 0; wr < 32; wr++){ 134 | #pragma HLS PIPELINE 135 | if(i + wr < M && i + wr < iTm) { 136 | // tmp_outbuf = RELU(buf[i + wr]); 137 | tmp_outbuf = buf[i + wr]; 138 | tmp.range(15, 0) = tmp_outbuf.range(15, 0); 139 | } else { 140 | tmp.range(15,0) = 0; 141 | } 142 | out_tmp.range(16 * (wr + 1) - 1, 16 * (wr)) = tmp.range(15,0); 143 | // cout << "out_buffer[" << wr << "] = " << buf[wr] << endl; 144 | } 145 | *(fc_o_data + fc_o_offset + (m + i)/32) = out_tmp; 146 | // cout << "index of out memory : " << fc_o_offset + ((m+i)/32) << endl; 147 | } 148 | } 149 | } 150 | 151 | 152 | #if _LAYER_MODE_ // layer function with manual double buffer -- worked 153 | void fc_layer_acc_mbuf( 154 | int N, //input feature number 155 | int M, // output feature number 156 | int R_IN, 157 | int C_IN, 158 | int K, 159 | bool act, // activation function bit (1-- with act, 0--without act) 160 | Itf *fc_layer_weights, //w[M][N][K][K] 161 | Itf *fc_layer_bias, // b[M] 162 | int fc_w_offset, 163 | 
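        // note: the *_offset parameters below index 512-bit interface words (Itf elements), not bytes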
int fc_b_offset, 164 | int fc_i_offset, 165 | int fc_o_offset, 166 | Itf *fc_i_data, // in_data[N][(R-1)*S + K][(C-1)*S + K] --> [N][(R-1)*S + K - 2*P][(C-1)*S + K - 2*P] 167 | Itf *fc_o_data) 168 | { // out[M][R][C] 169 | 170 | /***************local data buffer groups******************************/ 171 | T in_buf_0[iTn]; 172 | W w_buf_0[iTn]; 173 | W b_buf_0[iTm]; 174 | G out_buf_0[iTm]; 175 | 176 | // T in_buf_1[iTn]; 177 | // W w_buf_1[iTn]; 178 | // W b_buf_1[iTm]; 179 | // G out_buf_1[iTm]; 180 | 181 | #if _HLS_MODE_ 182 | #pragma HLS ARRAY_PARTITION variable = in_buf_0 complete dim = 1 183 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 1 184 | #pragma HLS ARRAY_PARTITION variable = b_buf_0 complete dim = 1 185 | #pragma HLS ARRAY_PARTITION variable = out_buf_0 complete dim = 1 186 | #endif 187 | 188 | #if _C_DEBUG_MODE_ 189 | #if _KERNEL_DEBUG_ 190 | cout << "Starting fc acc manual double buffer test ...." << endl; 191 | out_buf_reset(out_buf_0); 192 | b_buf_reset(b_buf_0); 193 | w_buf_reset(w_buf_0); 194 | cout << "Local buffer reset finised ..." << endl; 195 | #endif 196 | #endif 197 | 198 | for (int m = 0; m < M; m += iTm) 199 | { 200 | 201 | for (int n = 0; n < N; n += iTn) 202 | { 203 | //--------------------------Load input B W D in ping-pong manner-------------------------// 204 | b_buf_load_512(b_buf_0, fc_layer_bias, fc_b_offset, m, M); 205 | w_buf_load_512(w_buf_0, fc_layer_weights, fc_w_offset, n, N); 206 | in_buf_load_512(in_buf_0, fc_i_data, fc_i_offset, n, N); 207 | //------------------------------compute buffered data -----------------------------------// 208 | fc_engine(in_buf_0, w_buf_0, b_buf_0, out_buf_0, n); 209 | //---------------------------transfer output data----------------------------------------// 210 | output_res_512(out_buf_0, fc_o_data, fc_o_offset, n, m, N, M, act); 211 | // output_res_512(b_buf_0, fc_o_data, fc_o_offset, n, m, N, M, act); 212 | } 213 | } 214 | }; 215 | #endif 216 | }; 217 | 218 | #endif 219 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloud-Dnn 2 | 3 | ## Introduction 4 | 5 | Cloud-DNN is an open-source framework that maps DNN (deep neural network) models trained by Caffe to FPGAs in the cloud for inference acceleration. It takes the input *.prototxt DNN description, generates corresponding C++ network description, and then produces the final hardware accelerator IPs through high-level synthesis. The goal of Cloud-DNN is to provide more flexible and user-friendly DNN acceleration on cloud-FPGAs (e.g., AWS F1). 6 | 7 | ### Hardware settings 8 | - Local cluster 9 | -UltraScale+ VU118 board with PCIe connection 10 | - AWS cluster 11 | -AWS F1.2Xlarge instance 12 | 13 | ### OS settings 14 | - Local cluster 15 | -Ubuntu 16.04 16 | - AWS cluster 17 | -FPGA development system image (Centos 7.6) 18 | 19 | ### Software requirement 20 | - Python 3.5 21 | - gcc g++ 22 | - Xilinx vivado_hls 2018.2 (2018.2.op for AWS) 23 | - Xilinx vivado 2018.2 (2018.2.op for AWS) 24 | - Caffe and the required libraries (including Pycaffe) 25 | - aws-fpga repo 26 | - Drivers 27 | -Local: XDMA driver for UltraScale+ VU118 board 28 | -AWS: AWS shell IP support, EDMA(XDMA for the latest version of AWS shell) driver 29 | 30 | 31 |
### GitHub Repository Structure

```sh
Open-Dnn/
|
|-- LICENSE
|-- README.md
|-- netGenerator
|   |-- paramExtractor
|   |-- dse
|   `-- netGen
|-- scripts
|   |-- compile
|   |-- hls_impl
|   `-- sys_gen
|-- acc_runtime
|   |-- local_acc
|   `-- aws_acc
|-- fpga_cnn
|   |-- src
|   `-- testbench
|-- docs
`-- examples
```
## Brief Manual

### Steps briefing

(Figure: overall generation flow; see docs/flow.png.)
Building an accelerator system for either the local cluster or the AWS cluster requires:

1. DNN description analysis

1. C++ accelerator description generation

1. Accelerator IP generation with vivado_hls

1. Accelerator system configuration

1. Host function construction and compilation

The generation process is identical up to step 4; the differences in the remaining steps are explained with the detailed operations below.


### Build accelerator system

Please follow the steps below with the given alex.prototxt file and a trained alex.caffemodel to build your accelerator system. Make sure your environment is properly set up before starting this manual.

1. Generating the C++ accelerator description. After the repo is downloaded (the *.caffemodel file is not needed yet):
```sh
cd Open-Dnn/netGenerator
./run_generator.sh -i alex.prototxt
```
run_generator.sh automatically extracts, analyzes, and generates the C++ code from the given alex.prototxt file. Since alex.prototxt ships with the repo, you only need to download alex.caffemodel before executing the runtime software.

>**:pushpin: TIPS:**
> - run_generator.sh covers all the steps before accelerator IP generation: parameter extraction, parameter analysis, and C++ code generation. If the process does not work with your input model description, inspect and modify the intermediate files that are copied or moved after each stage in run_generator.sh to generate your own design.
> - The steps in run_generator.sh can also be executed one by one with the scripts mentioned for each step and the corresponding input files.
> - The intermediate files for alex.prototxt are provided in the examples/ folder; copy them to the corresponding locations to run the generation step by step if your system lacks some of the software environment support.
> - The parameter extraction script is sensitive to the formatting of the name/type fields in the prototxt file. The current version only supports values with the first letter capitalized and enclosed in double quotes (e.g., type: "Convolution").

2. Generating the accelerator IPs. After run_generator.sh finishes successfully, the generated project is named gen_proj and located at Open-Dnn/gen_proj.
```sh
cd ../gen_proj/hls_proj
./syn.sh
```
syn.sh generates the 3 sub-net IPs from the C++ code and scripts produced in the previous step. You can also modify the accelerator configurations in acc_instance.h and call the testbench classes to verify the correctness of your changes.

>**:pushpin: TIPS:**
> - For co-simulation, uncomment the iteration in hls_script.tcl. The current hls_script.tcl is simplified for IP generation only.
> - The provided ff_test.cpp includes a simple testbench for the first sub-net function, sub_net_0; modify and uncomment the others if you need to run co-simulation for them. (They will be generated automatically in a future version.)

3. Accelerator system construction. The system construction scripts are provided in the generated project folder gen_proj/impl_proj. Before constructing the accelerator system, make sure the environment is properly set and the sub-net IPs are generated and located as expected.
- Local Cluster
```sh
cd ../impl_proj/local_impl/
# (specify the path of the generated IPs in build_system_local.tcl)
# use the Vivado Tcl console to source build_system_local.tcl (either in the Tcl console or in batch/terminal mode)
```

>**:pushpin: TIPS:**
> - You can also manually build your own accelerator system by taking system_overview_local.pdf in the Open-Dnn/docs/ folder as a reference.
> - Remember to specify the interface latency as 3 for the URAMs in the system.
> - Please be aware of the clocks in the system overview.


- AWS F1

Before starting this step, please make sure the IPI design examples in the aws-fpga repo can be executed correctly. Follow the IPI design flow it provides.
```sh
mkdir ~/aws-fpga/hdk/cl/examples/aws_acc_ipi
cp ../impl_proj/aws_impl/* ~/aws-fpga/hdk/cl/examples/aws_acc_ipi
# (specify the path of the generated IPs in build_system_aws.tcl)
# use Vivado to source build_system_aws.tcl (Tcl console or terminal)
```

>**:pushpin: TIPS:**
> - You can also manually build your own accelerator system by taking system_overview_aws.pdf in the Open-Dnn/docs/ folder as a reference.
> - Remember to specify the interface latency as 3 for the URAMs in the system.
> - Please be aware of the clocks in the system overview.


4. Runtime software compilation (a minimal sketch for the local cluster follows below).
   - Local Cluster

     After the bitstream of the accelerator system is generated and downloaded to the UltraScale+ VU118 board, copy the acc_runtime/local_acc/ folder to your preferred execution path. Copy the config.h file from gen_proj/hls_proj/src/ to the local_acc/ folder. Run the compilation to get the executable file.

   - AWS F1

     After the AGFI of the accelerator system is generated (follow the instructions for AWS F1 AGFI generation) and loaded on the AWS F1 instance (follow the aws-fpga development process), copy the acc_runtime/aws_acc/ folder to your preferred execution path. Copy the config.h file from gen_proj/hls_proj/src/ to the aws_acc/ folder. Run the compilation to get the executable file.
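As a minimal sketch of step 4 on the local cluster, assuming the Makefile conventions used by the demos shipped under acc_runtime/local_acc/ (the execution path here is a placeholder; adapt it to wherever you copied the folder):

```sh
# copy the generated accelerator configuration next to the runtime sources
cp gen_proj/hls_proj/src/config.h /path/to/local_acc/

# build the host runtime; the Makefile links the runtime against the
# acc_ctrl control library under api_lib/
cd /path/to/local_acc
make
```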
### Play With Demos


#### Playing with given demos on local cluster

Please follow the steps below to play with a given demo using the prebuilt bitstream and the runtime software.
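The detailed demo steps did not survive in this section; the following is only a hedged sketch. The demo path and the runtime binary name are taken from the convTest demo under acc_runtime/local_acc/demos/, and the driver step assumes the XDMA driver named in the software requirements (the module path is installation-specific):

```sh
# load the XDMA driver for the VU118 PCIe connection (path depends on your driver install)
sudo insmod xdma.ko

# build and run the demo host program
cd acc_runtime/local_acc/demos/convTest
make
./runtime
```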
#### Playing with given demos on AWS F1

Please follow the steps below to play with a given demo using the prebuilt AGFI and the runtime software.
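Again, only a hedged sketch: fpga-load-local-image and fpga-describe-local-image are the standard aws-fpga SDK tools for slot management, while the AGFI ID, demo folder, and executable name below are placeholders you must substitute with your own:

```sh
# load the accelerator AGFI onto FPGA slot 0 (substitute your AGFI ID)
sudo fpga-load-local-image -S 0 -I agfi-xxxxxxxxxxxxxxxxx

# verify that the image is loaded on the slot
sudo fpga-describe-local-image -S 0

# build and run the demo host program
cd acc_runtime/aws_acc/<demo_folder>
make
sudo ./<demo_executable>
```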
168 | 169 | 170 | 171 | ## Additional Resources 172 | 173 | For more details, please refer to the paper below. 174 | 175 | ```sh 176 | @inproceedings{Chen2019fpga, 177 | author = {Chen, Yao and He, Jiong and Zhang, Xiaofan and Hao, Cong and Chen, Deming}, 178 | title = {Cloud-DNN: An Open Framework for Mapping DNN Models to Cloud FPGAs}, 179 | booktitle = {Proceedings of the 2019 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays}, 180 | series = {FPGA '19}, 181 | year = {2019}, 182 | isbn = {978-1-4503-6137-8}, 183 | location = {Seaside, CA, USA}, 184 | pages = {73--82}, 185 | numpages = {10}, 186 | url = {http://doi.acm.org/10.1145/3289602.3293915}, 187 | doi = {10.1145/3289602.3293915}, 188 | acmid = {3293915}, 189 | publisher = {ACM}, 190 | address = {New York, NY, USA}, 191 | keywords = {cloud computing, dnn accelerator, fpga, high-level synthesis, neural network, reconfiguration} 192 | } 193 | ``` 194 | -------------------------------------------------------------------------------- /netGenerator/dse/task_analysis.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | def acc_task_analysis(pair_list, sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag): 4 | 5 | total_acc_num = 0 6 | for i in range(0, len(pair_list)): 7 | total_acc_num += len(pair_list[i][1]) 8 | 9 | acc_task_list = [] 10 | for acc_num in range(0, total_acc_num): 11 | acc_task_list.append([]) 12 | print("acc_task_list: ", acc_task_list) 13 | 14 | acc_num_counter = 0 15 | for sub_net_number in range(0, len(sub_conv_N)): 16 | print("sub_net_", sub_net_number, " layer_number: ", len(sub_conv_N[sub_net_number])) 17 | print("sub_net_", sub_net_number, " layer_acc_number: ", pair_list[sub_net_number][0][0]) 18 | print("sub_net_cutting_point: ", pair_list[sub_net_number][0][1]) 19 | # for sub_net_acc_core in range(0, pair_list[sub_net_number][0][0]): 20 | print("acc core_", acc_num_counter) 21 | if len(pair_list[sub_net_number][0][1]) == 1 and pair_list[sub_net_number][0][1][0] == -1: 22 | for layer_num in range(0, len(sub_conv_N[sub_net_number])): 23 | local_list = [] 24 | local_list.append(sub_net_number) 25 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 26 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 27 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 28 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 29 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 30 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 31 | local_list.append(sub_flag[sub_net_number][layer_num]) 32 | acc_task_list[acc_num_counter].append(local_list) 33 | acc_num_counter += 1 34 | print("sub_net no cut") 35 | elif len(pair_list[sub_net_number][0][1]) == 1 and pair_list[sub_net_number][0][1][0] == 1: 36 | for layer_num in range(0, pair_list[sub_net_number][0][1][0]): 37 | local_list = [] 38 | local_list.append(sub_net_number) 39 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 40 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 41 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 42 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 43 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 44 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 45 | local_list.append(sub_flag[sub_net_number][layer_num]) 46 | acc_task_list[acc_num_counter].append(local_list) 47 | acc_num_counter += 1 48 | for layer_num in range(pair_list[sub_net_number][0][1][0], 
len(sub_conv_N[sub_net_number])): 49 | local_list = [] 50 | local_list.append(sub_net_number) 51 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 52 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 53 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 54 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 55 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 56 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 57 | local_list.append(sub_flag[sub_net_number][layer_num]) 58 | acc_task_list[acc_num_counter].append(local_list) 59 | acc_num_counter += 1 60 | print("sub net cut into 2") 61 | else: 62 | for layer_num in range(0, pair_list[sub_net_number][0][1][0]): 63 | local_list = [] 64 | local_list.append(sub_net_number) 65 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 66 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 67 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 68 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 69 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 70 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 71 | local_list.append(sub_flag[sub_net_number][layer_num]) 72 | acc_task_list[acc_num_counter].append(local_list) 73 | acc_num_counter += 1 74 | for layer_num in range(pair_list[sub_net_number][0][1][0], pair_list[sub_net_number][0][1][1]): 75 | local_list = [] 76 | local_list.append(sub_net_number) 77 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 78 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 79 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 80 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 81 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 82 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 83 | local_list.append(sub_flag[sub_net_number][layer_num]) 84 | acc_task_list[acc_num_counter].append(local_list) 85 | acc_num_counter += 1 86 | for layer_num in range(pair_list[sub_net_number][0][1][1], len(sub_conv_N[sub_net_number])): 87 | local_list = [] 88 | local_list.append(sub_net_number) 89 | local_list.append(sub_conv_N[sub_net_number][layer_num]) 90 | local_list.append(sub_conv_M[sub_net_number][layer_num]) 91 | local_list.append(sub_conv_r[sub_net_number][layer_num]) 92 | local_list.append(sub_conv_R[sub_net_number][layer_num]) 93 | local_list.append(sub_conv_K[sub_net_number][layer_num]) 94 | local_list.append(sub_conv_S[sub_net_number][layer_num]) 95 | local_list.append(sub_flag[sub_net_number][layer_num]) 96 | acc_task_list[acc_num_counter].append(local_list) 97 | acc_num_counter += 1 98 | print("sub net cut into 3") 99 | 100 | return acc_task_list, total_acc_num 101 | 102 | 103 | def subnet_task_analysis(pair_list, acc_task_list, sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag): 104 | # i pair_list[0][0] 1024 105 | #sub_net, 0, 2, 1024, 4096,2048,2048,2048,2048,2048 106 | #sub_net, 1, 2,1024,4096,2048,2048,2048,2048,2048 107 | #sub_net, 2, 2,1024,4096,2048,2048,2048,2048,2048 108 | subnet_task_list = [[], [], []] 109 | acc_num = [] 110 | param_num = [] 111 | bias = [] 112 | weight_num = [] 113 | data_in_0 = [] 114 | data_out_0 = [] 115 | data_in_1 = [] 116 | data_out_1 = [] 117 | data_in_2 = [] 118 | data_out_2 = [] 119 | 120 | for i in range(0, len(sub_conv_N)): 121 | sub_w_num = 0 122 | sub_b_num = 0 123 | max_i_num = 0 124 | max_o_num = 0 125 | for j in range(0, len(sub_conv_N[i])): 126 | layer_b = sub_conv_M[i][j] 127 | # sub_b_num += 
math.ceil(float(layer_b) / 32) 128 | sub_w_num += math.ceil(float(layer_b) / 32) 129 | layer_w = sub_conv_N[i][j] * sub_conv_M[i][j] * sub_conv_K[i][j] * sub_conv_K[i][j] 130 | sub_w_num += math.ceil(float(layer_w)/32) 131 | data_i = [] 132 | acc_max_i = [] 133 | acc_max_o = [] 134 | for k in range(0, len(acc_task_list)): 135 | for l in range(0, len(acc_task_list[k])): 136 | local_max_i = 0 137 | local_max_o = 0 138 | if acc_task_list[k][l][0] == i: 139 | # print("sub net", i, "acc task list", k, len(acc_task_list[k]), l) 140 | i_size = math.ceil(float(acc_task_list[k][l][1] * \ 141 | acc_task_list[k][l][3] * acc_task_list[k][l][3])/32) 142 | o_size = math.ceil(float(acc_task_list[k][l][2] * \ 143 | acc_task_list[k][l][4] * acc_task_list[k][l][4])/32) 144 | if acc_task_list[k][l][7]: 145 | o_size += math.ceil(float(acc_task_list[k][l][2] * \ 146 | acc_task_list[k][l][4] * acc_task_list[k][l][4])/32/4) 147 | if local_max_i < i_size: 148 | local_max_i = i_size 149 | if local_max_o < o_size: 150 | local_max_o = o_size 151 | if acc_task_list[k][l][0] == i: 152 | data_i.append(local_max_i * 2) #twiced for double buffering 153 | data_i.append(local_max_o * 2) 154 | # print(i, pair_list[i][0][0], 1024, sub_b_num, sub_w_num, data_i) 155 | subnet_task_list[i].append(i) 156 | subnet_task_list[i].append(pair_list[i][0][0]) 157 | subnet_task_list[i].append(1024) 158 | subnet_task_list[i].append(sub_w_num) 159 | subnet_task_list[i].append(data_i) 160 | 161 | return subnet_task_list 162 | 163 | 164 | if __name__ == "__main__": 165 | acc_task_analysis() 166 | 167 | -------------------------------------------------------------------------------- /netGenerator/netGen/generate_consNet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def gen_sub_head(): 4 | 5 | strs = ''' 6 | #ifndef _CONSTRUCT_NET_H_ 7 | #define _CONSTRUCT_NET_H_ 8 | 9 | #include 10 | #include 11 | #include "acc_instance.h" 12 | 13 | using namespace std; 14 | ''' 15 | return strs 16 | 17 | def gen_param_port(parameters): 18 | 19 | param_port_str = ''' 20 | Tparam param_port['''+str(parameters[1]) + '''], 21 | data_type_itf weight_in['''+str(parameters[2]) + '''], 22 | ''' 23 | return param_port_str 24 | 25 | def gen_data_port(parameters): 26 | port_str = ''' ''' 27 | print(parameters[0]) 28 | if str(parameters[0]) == '1': 29 | port_str = ''' 30 | data_type_itf data_in_0['''+str(parameters[3])+'''], 31 | data_type_itf data_out_0['''+str(parameters[4])+'''], 32 | int select ) { 33 | ''' 34 | if str(parameters[0]) == '2': 35 | port_str = ''' 36 | data_type_itf data_in_0['''+str(parameters[3])+'''], 37 | data_type_itf data_out_0['''+str(parameters[4])+'''], 38 | data_type_itf data_in_1['''+str(parameters[4])+'''], 39 | data_type_itf data_out_1['''+str(parameters[6])+'''], 40 | int select ) { 41 | ''' 42 | if str(parameters[0]) == '3': 43 | port_str = ''' 44 | data_type_itf data_in_0['''+str(parameters[3])+'''], 45 | data_type_itf data_out_0['''+str(parameters[4])+'''], 46 | data_type_itf data_in_1['''+str(parameters[4])+'''], 47 | data_type_itf data_out_1['''+str(parameters[6])+'''], 48 | data_type_itf data_in_2['''+str(parameters[6])+'''], 49 | data_type_itf data_out_2['''+str(parameters[8])+'''], 50 | int select ) { 51 | ''' 52 | return port_str 53 | 54 | def gen_param_pragma(parameters): 55 | 56 | param_pragma = ''' 57 | #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS 58 | #pragma HLS INTERFACE s_axilite port=select bundle=CRTL_BUS 59 | 60 | #pragma HLS INTERFACE 
s_axilite port=param_port bundle=CRTL_BUS 61 | #pragma HLS INTERFACE m_axi port=param_port offset=slave depth='''+str(parameters[1])+''' bundle=PARAM_IN 62 | //#pragma HLS INTERFACE s_axilite port=bias_in bundle=CRTL_BUS 63 | //#pragma HLS INTERFACE m_axi port=bias_in offset=slave depth='''+str(parameters[2])+''' bundle=BIAS_IN 64 | #pragma HLS INTERFACE s_axilite port=weight_in bundle=CRTL_BUS 65 | #pragma HLS INTERFACE m_axi port=weight_in offset=slave depth='''+str(parameters[2])+''' bundle=WEIGHT_IN 66 | ''' 67 | return param_pragma 68 | 69 | def gen_data_pragma(parameters): 70 | 71 | data_pragma = ''' 72 | #pragma HLS INTERFACE s_axilite port=data_in_0 bundle=CRTL_BUS 73 | #pragma HLS INTERFACE m_axi port=data_in_0 offset=slave depth='''+str(parameters[3])+''' bundle=DATA_IN 74 | ''' 75 | if str(parameters[0]) == '1': 76 | data_pragma += ''' 77 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 78 | ''' 79 | if str(parameters[0]) == '2': 80 | data_pragma += ''' 81 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 82 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 83 | #pragma HLS INTERFACE bram port=data_out_1 84 | ''' 85 | if str(parameters[0]) == '3': 86 | data_pragma += ''' 87 | #pragma HLS INTERFACE ap_memory port=data_out_0 latency=3 88 | #pragma HLS INTERFACE ap_memory port=data_in_1 latency=3 89 | #pragma HLS INTERFACE ap_memory port=data_out_1 latency=3 90 | #pragma HLS INTERFACE ap_memory port=data_in_2 latency=3 91 | #pragma HLS INTERFACE bram port=data_out_2 92 | ''' 93 | return data_pragma 94 | 95 | def gen_offset(idx, parameters): 96 | 97 | offset_str = ''' ''' 98 | if str(parameters[0]) == '1': 99 | offset_str = ''' 100 | int acc0_mem_inport_offset = 0; 101 | int acc0_mem_outport_offset = 0; 102 | 103 | if (select == 0) 104 | { 105 | acc0_mem_inport_offset = 0; 106 | acc0_mem_outport_offset = 0; 107 | } 108 | else 109 | { 110 | acc0_mem_inport_offset = 0; 111 | acc0_mem_outport_offset = 0; 112 | } 113 | ''' 114 | if str(parameters[0]) == '2': 115 | offset_str = ''' 116 | int acc0_mem_inport_offset = 0; 117 | int acc0_mem_outport_offset = 0; 118 | int acc1_mem_inport_offset = 0; 119 | int acc1_mem_outport_offset = 0; 120 | 121 | if (select == 0) 122 | { 123 | acc0_mem_inport_offset = 0; 124 | acc0_mem_outport_offset = 0; 125 | acc1_mem_inport_offset = '''+str(int(int(parameters[4])/2))+ '''; 126 | acc1_mem_outport_offset = '''+str(int(int(parameters[6])/2))+'''; 127 | } 128 | else 129 | { 130 | acc0_mem_inport_offset = '''+str(int(int(parameters[3])/2))+'''; 131 | acc0_mem_outport_offset = '''+str(int(int(parameters[4])/2))+ '''; 132 | acc1_mem_inport_offset = 0; 133 | acc1_mem_outport_offset = 0; 134 | } 135 | ''' 136 | if str(parameters[0]) == '3': 137 | offset_str = ''' 138 | int acc0_mem_inport_offset = 0; 139 | int acc0_mem_outport_offset = 0; 140 | int acc1_mem_inport_offset = 0; 141 | int acc1_mem_outport_offset = 0; 142 | int acc2_mem_inport_offset = 0; 143 | int acc2_mem_outport_offset = 0; 144 | 145 | if (select == 0) 146 | { 147 | acc0_mem_inport_offset = 0; 148 | acc0_mem_outport_offset = 0; 149 | acc1_mem_inport_offset = ''' + str(int(int(parameters[4]) / 2)) + '''; 150 | acc1_mem_outport_offset = ''' + str(int(int(parameters[6]) / 2)) + '''; 151 | acc2_mem_inport_offset = 0; 152 | acc2_mem_outport_offset = 0; 153 | } 154 | else 155 | { 156 | acc0_mem_inport_offset = ''' + str(int(int(parameters[3]) / 2)) + '''; 157 | acc0_mem_outport_offset = ''' + str(int(int(parameters[4]) / 2)) + '''; 158 | acc1_mem_inport_offset = 0; 159 | 
acc1_mem_outport_offset = 0; 160 | acc2_mem_inport_offset = ''' + str(int(int(parameters[6]) / 2)) + '''; 161 | acc2_mem_outport_offset = ''' + str(int(int(parameters[8]) / 2)) + '''; 162 | } 163 | ''' 164 | 165 | return offset_str 166 | 167 | def gen_convpool_func(parameters): 168 | 169 | func_bd = ''' ''' 170 | for i in range(0, int(parameters[0])): 171 | func_bd += ''' 172 | conv_pool_acc_'''+str(i)+'''(param_port + '''+str(i*512)+''', 173 | //bias_in, 174 | weight_in, 175 | data_in_'''+str(i)+''' + acc'''+str(i)+'''_mem_inport_offset, 176 | data_out_'''+str(i)+''' +acc'''+str(i)+'''_mem_outport_offset); 177 | ''' 178 | return func_bd 179 | 180 | def gen_subnet_func(idx, parameters): 181 | 182 | print(idx) 183 | param_port = gen_param_port(parameters) 184 | data_port = gen_data_port(parameters) 185 | param_pragma = gen_param_pragma(parameters) 186 | data_pragma = gen_data_pragma(parameters) 187 | offset_str = gen_offset(idx, parameters) 188 | convpool_func_bd = gen_convpool_func(parameters) 189 | 190 | subnet_func_bd = '''void sub_net_''' + str(idx) + '''( ''' \ 191 | + param_port \ 192 | + data_port \ 193 | + param_pragma \ 194 | + data_pragma \ 195 | + offset_str \ 196 | + convpool_func_bd \ 197 | + ''' };''' 198 | 199 | return subnet_func_bd 200 | 201 | def construct_function(idx, parameters): 202 | 203 | func_body = ''' ''' 204 | 205 | ''' 206 | data_type_itf data_in_0[2048], 207 | data_type_itf data_out_0[2048], 208 | data_type_itf data_in_1[2048], 209 | data_type_itf data_out_1[2048], 210 | int select 211 | ){ 212 | 213 | ''' 214 | return func_body 215 | 216 | 217 | def load_parameter(filename): 218 | lists = [] 219 | with open(filename) as f: 220 | while 1: 221 | line = f.readline() 222 | if not line: 223 | break 224 | lists.append(line.strip().split(",")) 225 | 226 | ps_list = {} 227 | print("loaded parameters") 228 | for l in lists: 229 | if l[0] not in ps_list: 230 | ps_list[l[0]] = [] 231 | ps_list[l[0]].append(l[1:]) 232 | 233 | print(ps_list) 234 | 235 | return ps_list 236 | 237 | 238 | def generate_consnet(ps_file, store_file): 239 | ps = load_parameter(ps_file) 240 | keys = ["sub_net_0", "sub_net_1", "sub_net_2"] 241 | print(keys) 242 | with open(store_file, "w") as wf: 243 | sub_head = gen_sub_head() 244 | wf.write(sub_head + "\n") 245 | sub_net_counter = 0 246 | for key in keys: 247 | if key not in ps: 248 | continue 249 | lists = ps[key] 250 | print("sub net counter:", sub_net_counter) 251 | print("lists", lists[0], "list len(): ", len(lists)) 252 | # for i in range(len(lists)): 253 | func = gen_subnet_func(sub_net_counter, lists[0]) 254 | wf.write(func + "\n\n") 255 | sub_net_counter += 1 256 | wf.write("#endif\n") 257 | print("ok") 258 | 259 | 260 | if __name__ == "__main__": 261 | parser = argparse.ArgumentParser() 262 | parser.add_argument("--params", help="accelerator param file") 263 | args = parser.parse_args() 264 | generate_consnet(args.params, "construct_net.h") 265 | -------------------------------------------------------------------------------- /fpga_cnn/src/acc_instance.h: -------------------------------------------------------------------------------- 1 | #ifndef _ACC_INSTANCE_H_ 2 | #define _ACC_INSTANCE_H_ 3 | #include "config.h" 4 | #include "conv_acc_2ibuf.h" 5 | #include "fc_acc_innerpp.h" 6 | #include "max_pool_acc_innerpp.h" 7 | 8 | using namespace std; 9 | conv_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 4, 8, 8, 5, 5, 32, 32, 32> convAcc0; 10 | conv_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 4, 
8, 8, 5, 5, 32, 32, 32> convAcc1; 11 | // Tm, Tn,Tr,Tc,S_max,K_max, int IBUF_t, int WBUF_t, int OBUF_t> 12 | 13 | void conv_layer_acc_0( 14 | Tparam N, 15 | Tparam K, 16 | Tparam M, 17 | Tparam R_IN, 18 | Tparam C_IN, 19 | Tparam C_OUT, 20 | Tparam R_OUT, 21 | Tparam S, 22 | Tparam P, 23 | Tparam act, 24 | Tparam inport, 25 | Tparam weight_offset, 26 | Tparam bias_offset, 27 | Tparam in_offset, 28 | Tparam out_offset, 29 | ap_fixed<32,26>* layer_bias, 30 | data_type_itf* i_weight, 31 | data_type_itf* i_data, 32 | data_type_itf* out_data 33 | ) 34 | { 35 | #pragma HLS INTERFACE m_axi port=i_data 36 | //#pragma HLS INTERFACE bram port=out_data 37 | convAcc0.conv_layer_acc_2ibuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset,layer_bias,i_weight,i_data,out_data); 38 | } 39 | 40 | 41 | 42 | void conv_layer_acc_1( 43 | Tparam N, 44 | Tparam K, 45 | Tparam M, 46 | Tparam R_IN, 47 | Tparam C_IN, 48 | Tparam C_OUT, 49 | Tparam R_OUT, 50 | Tparam S, 51 | Tparam P, 52 | Tparam act, 53 | Tparam inport, 54 | Tparam weight_offset, 55 | Tparam bias_offset, 56 | Tparam in_offset, 57 | Tparam out_offset, 58 | ap_fixed<32,26>* layer_bias, 59 | data_type_itf* i_weight, 60 | data_type_itf* i_data, 61 | data_type_itf* out_data 62 | ) 63 | { 64 | //#pragma HLS INTERFACE bram port=i_data 65 | #pragma HLS INTERFACE bram port=out_data 66 | convAcc1.conv_layer_acc_2ibuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset,layer_bias,i_weight,i_data,out_data); 67 | } 68 | 69 | max_pool_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3> maxPoolAcc0; 70 | 71 | void max_pool_layer_acc_0( 72 | Tparam R_in, 73 | Tparam C_in, 74 | Tparam N, 75 | Tparam K, 76 | Tparam R, 77 | Tparam C, 78 | Tparam S, 79 | Tparam P, 80 | Tparam act, 81 | Tparam i_offset, 82 | Tparam o_offset, 83 | data_type_itf* i_data, 84 | data_type_itf* o_data){ 85 | maxPoolAcc0.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 86 | }; 87 | 88 | max_pool_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 16, 16, 2, 3> maxPoolAcc1; 89 | 90 | void max_pool_layer_acc_1( 91 | Tparam R_in, 92 | Tparam C_in, 93 | Tparam N, 94 | Tparam K, 95 | Tparam R, 96 | Tparam C, 97 | Tparam S, 98 | Tparam P, 99 | Tparam act, 100 | Tparam i_offset, 101 | Tparam o_offset, 102 | data_type_itf* i_data, 103 | data_type_itf* o_data){ 104 | maxPoolAcc1.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 105 | }; 106 | 107 | fc_acc< data_type_itf, Tparam, data_type, data_type_w, data_type_o, 32, 32, 5, 5> fcAcc0; 108 | 109 | void fc_layer_acc_0( 110 | Tparam N, 111 | Tparam K, 112 | Tparam M, 113 | Tparam R_IN, 114 | Tparam C_IN, 115 | Tparam C_OUT, 116 | Tparam R_OUT, 117 | Tparam S, 118 | Tparam P, 119 | Tparam act, 120 | Tparam weight_offset, 121 | Tparam bias_offset, 122 | Tparam in_offset, 123 | Tparam out_offset, 124 | data_type_itf* layer_bias, 125 | data_type_itf* i_weight, 126 | data_type_itf* i_data, 127 | data_type_itf* out_data 128 | ){ 129 | fcAcc0.fc_layer_acc_mbuf(N, M, R_IN, C_IN, K, act, 130 | i_weight, layer_bias, 131 | weight_offset, bias_offset, in_offset, out_offset, 132 | i_data, out_data); 133 | }; 134 | 135 | 136 | 137 | void conv_pool_acc_0( 138 | Tparam* param_port, 139 | ap_fixed<32,26>* bias_in, 140 | data_type_itf* weight_in, 141 | data_type_itf* data_in, 142 | data_type_itf* data_out 143 | ) 144 | { 145 | Tparam 
layer_num_local[16]; 146 | Tparam param_conv_local[16]; 147 | Tparam param_pool_local[16]; 148 | 149 | for (unsigned int ll = 0; ll < 16; ll++) 150 | { 151 | #pragma HLS PIPELINE 152 | layer_num_local[ll] = param_port[ll]; 153 | } 154 | 155 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 156 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 212 | void conv_pool_acc_1( 213 | Tparam* param_port, 214 | ap_fixed<32,26>* bias_in, 215 | data_type_itf* weight_in, 216 | data_type_itf* data_in, 217 | data_type_itf* data_out 218 | ) 219 | { 220 | Tparam layer_num_local[16]; 221 | Tparam param_conv_local[16]; 222 | Tparam param_pool_local[16]; 223 | 224 | for (unsigned int ll = 0; ll < 16; ll++) 225 | { 226 | #pragma HLS PIPELINE 227 | layer_num_local[ll] = param_port[ll]; 228 | } 229 | 230 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 231 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 96 | for top_key, top_value in tops.items(): 97 | if len(top_value) > 1: 98 | top_flag.append(0) 99 | print(top_key, top_value) 100 | else: 101 | top_flag.append(1) 102 | bot_num = 0 103 | print("traverse through bottoms----------------------------------") 104 | for bot_key, bot_value in bots.items(): 105 | if len(bot_value) > 1: 106 | bot_flag.append(0) 107 | print(bot_key, bot_value) 108 | else: 109 | bot_flag.append(1) 110 | bot_num = bot_num + 1 111 | 112 | print("top_flag:", top_flag) 113 | print("bot_flag:", bot_flag) 114 | unit_flag = 1 115 | for i in range(0, bot_num): 116 | if top_flag[i] == 0 and bot_flag[i] == 1: 117 | unit_flag = 0 118 | elif top_flag[i] == 1 and bot_flag[i] == 0: 119 | unit_flag = 1 120 | else: 121 | unit_flag = unit_flag 122 | print(unit_flag, end=' ') 123 | cutable.append(unit_flag) 124 | print() 125 | print("cutable flag:", cutable) 126 | 127 | print("final cutable points: ", cutable) 128 | layer_num = 0 129 | param_num = 0 130 | 131 | for layer in net.layers: 132 | # print(layer.type) 133 | if layer.type == 'Convolution' or layer.type == 'Pooling' or layer.type == 'Concat' or layer.type == 'InnerProduct': 134 | print(layer_param_list[param_num], layer.type, cutable[layer_num]) 135 | # f.write(layer.type + " ") 136 | nn_layer_type.append(layer.type) 137 | if layer.type == 'Convolution': 138 | nn_in_data_size_conv.append(layer_param_list[param_num][1]) 139 | nn_conv_cutable.append(cutable[layer_num]) 140 | if layer.type == 'Pooling': 141 | nn_in_data_size_pooling.append(layer_param_list[param_num][1]) 142 | nn_pool_cutable.append(cutable[layer_num]) 143 | if layer.type == 'Concat' or layer.type == 'InnerProduct': 144 | nn_fc_cutable.append(cutable[layer_num]) 145 | 146 | if layer.type == 'Convolution' or layer.type == 'Pooling' or layer.type == 'Concat' or layer.type == 'InnerProduct' or layer.type == 'LRN': 147 | param_num = param_num + 1 148 | layer_num = layer_num + 1 149 | 150 | # for x, y in layer_dic.items(): 151 | # print(x, y) 152 | layer_count = 0 153 | temp_layer_list = [] 154 | 155 | # for layer in parsible_net.layer: 156 | # temp_layer_list.append(layer) 157 | 158 | 159 | count = 0 160 | for layer in parsible_net.layer: 161 | if layer.type == "Convolution": 162 | kernel = layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1 163 | stride = layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1 164 | pad = layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0 165 | print(layer.name) 166 | tmp_count = 0 167 | tmp_dim_list = [] 168 | for layer_name, dim in net.blobs.items(): 169 |
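# (Note on the loop below, inferred from the visible code: net.blobs is walked
# in network order and each blob's channel count is appended, so when
# layer_name finally matches the current layer, tmp_dim_list[-2] holds the
# channel count of the previous blob -- that is, this conv layer's input
# channel number.)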
tmp_dim_list.append(dim.data.shape[1]) 170 | tmp_count = tmp_count + 1 171 | if tmp_count > 1: 172 | if layer_name == layer.name: 173 | inchannel = tmp_dim_list[-2] 174 | outchannel = layer.convolution_param.num_output 175 | group = layer.convolution_param.group 176 | nn_channel_size_conv.append(kernel) 177 | nn_stride_conv.append(stride) 178 | nn_padding_conv.append(pad) 179 | nn_in_number_conv.append(inchannel) 180 | nn_out_number_conv.append(outchannel) 181 | nn_bias_conv.append(outchannel) 182 | nn_group_conv.append(group) 183 | if layer.type == "Pooling": 184 | kernel = layer.pooling_param.kernel_size 185 | stride = layer.pooling_param.stride 186 | pad = layer.pooling_param.pad 187 | tmp_count = 0 188 | tmp_dim_list = [] 189 | print(layer.name) 190 | for layer_name, dim in net.blobs.items(): 191 | tmp_dim_list.append(dim.data.shape[1]) 192 | tmp_count = tmp_count + 1 193 | if tmp_count > 1: 194 | # print("previous layer info: ", layer_name, dim.data.shape[1], tmp_dim_list[-2]) 195 | if layer_name == layer.name: 196 | in_num = tmp_dim_list[-2] 197 | nn_channel_size_pooling.append(kernel) 198 | nn_stride_pooling.append(stride) 199 | nn_padding_pooling.append(pad) 200 | nn_in_number_pooling.append(in_num) 201 | if layer.type == "InnerProduct": 202 | output = layer.inner_product_param.num_output 203 | tmp_count = 0 204 | tmp_dim_list = [] 205 | print(layer.name) 206 | for layer_name, dim in net.blobs.items(): 207 | tmp_dim_list.append(dim.data.shape) 208 | tmp_count = tmp_count + 1 209 | # print("--------------------") 210 | # print(tmp_dim_list) 211 | if tmp_count > 1: 212 | # print("previous layer info: ", layer_name, dim.data.shape) 213 | if layer_name == layer.name: 214 | if len(tmp_dim_list[-2]) == 2: 215 | in_num = tmp_dim_list[-2][-1] 216 | if len(tmp_dim_list[-2]) == 4: 217 | in_num = tmp_dim_list[-2][-1] * tmp_dim_list[-2][-2] * tmp_dim_list[-2][-3] 218 | #TODO: here is an error with fc in data size, need to be modified 219 | nn_channel_size_fc.append(1) 220 | #nn_in_data_size_fc.append(1) 221 | nn_out_number_fc.append(output) 222 | nn_in_number_fc.append(in_num) 223 | 224 | count = count + 1 225 | 226 | print("Start writing param to file") 227 | 228 | # Writing the extracted params to an intermediate file 229 | write_param_inline("Network Structure: ", nn_layer_type, storefile) 230 | write_param_inline("nn_in_data_size_conv: ", nn_in_data_size_conv, storefile) 231 | write_param_inline("nn_channel_size_conv: ", nn_channel_size_conv, storefile) 232 | write_param_inline("nn_padding_conv: ", nn_padding_conv, storefile) 233 | write_param_inline("nn_stride_conv: ", nn_stride_conv, storefile) 234 | write_param_inline("nn_in_number_conv: ", nn_in_number_conv, storefile) 235 | write_param_inline("nn_out_number_conv: ", nn_out_number_conv, storefile) 236 | write_param_inline("nn_group_conv: ", nn_group_conv, storefile) 237 | write_param_inline("nn_bias_conv: ", nn_bias_conv, storefile) 238 | write_param_inline("nn_in_data_size_pooling: ", nn_in_data_size_pooling, storefile) 239 | write_param_inline("nn_channel_size_pooling: ", nn_channel_size_pooling, storefile) 240 | write_param_inline("nn_padding_pooling: ", nn_padding_pooling, storefile) 241 | write_param_inline("nn_stride_pooling: ", nn_stride_pooling, storefile) 242 | write_param_inline("nn_in_number_pooling: ", nn_in_number_pooling, storefile) 243 | write_param_inline("nn_in_data_size_fc: ", nn_in_data_size_fc, storefile) 244 | write_param_inline("nn_in_number_fc: ", nn_in_number_fc, storefile) 245 | 
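# (Format sketch, assuming write_param_inline emits one "name: v1 v2 ..." line
# per call: for AlexNet the generated net_config_params.txt would contain lines
# such as
#     nn_channel_size_conv: 11 5 3 3 3
#     nn_stride_conv: 4 1 1 1 1
# which read_params()/params_values() in dse/helping_functions.py split back
# apart on ':' and on whitespace.)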
write_param_inline("nn_out_number_fc: ", nn_out_number_fc, storefile) 246 | write_param_inline("nn_channel_size_fc: ", nn_channel_size_fc, storefile) 247 | write_param_inline("conv_cut_flag: ", nn_conv_cutable, storefile) 248 | write_param_inline("pool_cut_flag: ", nn_pool_cutable, storefile) 249 | write_param_inline("fc_cut_flag: ", nn_fc_cutable, storefile) 250 | 251 | 252 | 253 | if __name__ == '__main__': 254 | parser = argparse.ArgumentParser() 255 | parser.add_argument("--model", help="model prototxt path .prototxt") 256 | parser.add_argument("--weights", help="caffe model weights path .caffemodel") 257 | parser.add_argument("--output", help="output path") 258 | args = parser.parse_args() 259 | extract_caffe_model(args.model, args.weights, args.output, "net_config_params.txt") 260 | # gen_net_config_params("net_config_params.txt") 261 | -------------------------------------------------------------------------------- /netGenerator/dse/helping_functions.py: -------------------------------------------------------------------------------- 1 | EOL = "\n" 2 | SEPARATER = " " 3 | SPACE = " " 4 | PARAMETER_BEGIN = "(" 5 | PARAMETER_END = ")" 6 | BODY_BEGIN = "{" 7 | BODY_END = "}" 8 | ARRAY_BEGIN = "[" 9 | ARRAY_END = "]" 10 | CLASS_BEGIN = "<" 11 | CLASS_END = ">" 12 | COMMA = "," 13 | COMMA_SPACE = ", " 14 | EOS = ";" 15 | CALL_SYMBOL = "." 16 | FOR = "for" 17 | EQUAL = " = " 18 | INCREMENT = "++" 19 | LESS = " < " 20 | 21 | 22 | def read_params(file_name): 23 | arr = [] 24 | with open(file_name) as f: 25 | lines = f.readlines() 26 | for line in lines: 27 | l = line.strip().split(':') 28 | arr.extend(l) 29 | return arr 30 | 31 | 32 | def prompt(s): 33 | var = raw_input(s) 34 | return var 35 | 36 | 37 | def generate_for_loop(counter, counter_type, begin, end, for_body, lc, inc, prefix=SEPARATER): 38 | 39 | for_l = FOR + SPACE + PARAMETER_BEGIN + counter_type + SPACE + counter + EQUAL + str(begin) +\ 40 | EOS + SPACE + counter + LESS + str(end) + EOS + SPACE + counter 41 | if inc == 1: 42 | for_l += INCREMENT 43 | else: 44 | for_l += " += " + str(inc) 45 | 46 | for_l += PARAMETER_END + SPACE + BODY_BEGIN + EOL 47 | 48 | for b in for_body: 49 | for_l += prefix*(lc+1) + b + EOL 50 | 51 | for_l += prefix*lc + BODY_END + EOL 52 | 53 | return for_l 54 | 55 | 56 | def generate_for_loop1(counter, counter_type, begin, end, for_body, prefix=SEPARATER): 57 | 58 | for_l = FOR + SPACE + PARAMETER_BEGIN + counter_type + SPACE + counter + EQUAL + str(begin) +\ 59 | EOS + SPACE + counter + LESS + str(end) + EOS + SPACE + counter + INCREMENT 60 | for_l += PARAMETER_END + SPACE + BODY_BEGIN + prefix 61 | for_l += for_body 62 | for_l += prefix + BODY_END + EOL 63 | return for_l 64 | 65 | 66 | def generate_while(cond, body, k, prefix=SEPARATER): 67 | w_str = "while " 68 | w_str += PARAMETER_BEGIN + cond + PARAMETER_END + SPACE + BODY_BEGIN + EOL 69 | for b in body: 70 | w_str += prefix*(k+1) + b + EOL 71 | w_str += prefix*k + BODY_END + EOL*2 72 | return w_str 73 | 74 | 75 | def generate_if(condition, body, else_body, k, prefix=SEPARATER): 76 | if_str = "if " 77 | if_str += PARAMETER_BEGIN + condition + PARAMETER_END + SPACE + BODY_BEGIN +\ 78 | EOL 79 | for b in body: 80 | if_str += prefix*(k+1) + b + EOL 81 | if_str += prefix*k + BODY_END + EOL 82 | 83 | if else_body != "": 84 | if_str += prefix*k + "else " + BODY_BEGIN + EOL 85 | for e_b in else_body: 86 | if_str += prefix*(k+1) +\ 87 | e_b + EOL 88 | if_str += prefix*k + BODY_END + EOL 89 | 90 | return if_str 91 | 92 | 93 | def params_values(s, arr2): 94 | 
if s in arr2: 95 | data = arr2[arr2.index(s)+1] 96 | values = data.split() 97 | return values 98 | else: 99 | return 0 100 | 101 | 102 | def extraction(arr): 103 | arr1 = [] 104 | arr2 = [] 105 | layers_order = params_values("Network Structure", arr) 106 | nn_in_data_size_conv_values = params_values("nn_in_data_size_conv", arr) 107 | nn_in_number_conv_values = params_values("nn_in_number_conv", arr) 108 | nn_group_conv_values = params_values("nn_group_conv", arr) 109 | nn_channel_size_conv_values = params_values("nn_channel_size_conv", arr) 110 | nn_out_number_conv_values = params_values("nn_out_number_conv", arr) 111 | nn_padding_conv_values = params_values("nn_padding_conv", arr) 112 | nn_stride_conv_values = params_values("nn_stride_conv", arr) 113 | nn_bias_conv_values = params_values("nn_bias_conv", arr) 114 | 115 | nn_in_data_size_pooling_values = params_values("nn_in_data_size_pooling", arr) 116 | nn_channel_size_pooling_values = params_values("nn_channel_size_pooling", arr) 117 | nn_padding_pooling_values = params_values("nn_padding_pooling", arr) 118 | nn_stride_pooling_values = params_values("nn_stride_pooling", arr) 119 | nn_in_number_pooling_values = params_values("nn_in_number_pooling", arr) 120 | 121 | nn_in_number_fc_values = params_values("nn_in_number_fc", arr) 122 | nn_in_data_size_fc_values = params_values("nn_in_data_size_fc", arr) 123 | nn_channel_size_fc_values = params_values("nn_channel_size_fc", arr) 124 | nn_out_number_fc_values = params_values("nn_out_number_fc", arr) 125 | 126 | nn_local_size_lrn_values = params_values("nn_local_size_lrn", arr) 127 | 128 | nn_in_number_batch_norm_values = params_values("nn_in_number_batch_norm", arr) 129 | nn_in_number_scale_values = params_values("nn_in_number_scale", arr) 130 | 131 | nn_in_number_eltwise_values = params_values("nn_in_number_eltwise", arr) 132 | nn_input_size_eltwise_values = params_values("nn_input_size_eltwise", arr) 133 | 134 | nn_in_number_concat_values = params_values("nn_in_number_concat", arr) 135 | nn_input_size_concat_values = params_values("nn_input_size_concat", arr) 136 | 137 | conv_cut_flag_values = params_values("conv_cut_flag", arr) 138 | pool_cut_flag_values = params_values("pool_cut_flag", arr) 139 | fc_cut_flag_values = params_values("fc_cut_flag", arr) 140 | 141 | arr1.append(layers_order) 142 | arr2.append("layers_order") 143 | 144 | n = len(nn_in_data_size_conv_values) 145 | if nn_in_number_fc_values != 0: 146 | n = n + len(nn_in_number_fc_values) 147 | arr1.append(str(n)) 148 | arr2.append("n") 149 | 150 | arr1.append(nn_in_number_conv_values) 151 | arr2.append("nn_in_number_conv") 152 | arr1.append(nn_in_data_size_conv_values) 153 | arr2.append("nn_in_data_size_conv") 154 | arr1.append(nn_channel_size_conv_values) 155 | arr2.append("nn_channel_size_conv") 156 | arr1.append(nn_padding_conv_values) 157 | arr2.append("nn_padding_conv") 158 | arr1.append(nn_stride_conv_values) 159 | arr2.append("nn_stride_conv") 160 | arr1.append(nn_out_number_conv_values) 161 | arr2.append("nn_out_number_conv") 162 | arr1.append(nn_group_conv_values) 163 | arr2.append("nn_group_conv") 164 | arr1.append(nn_local_size_lrn_values) 165 | arr2.append("nn_local_size_lrn") 166 | arr1.append(nn_in_data_size_pooling_values) 167 | arr2.append("nn_in_data_size_pooling") 168 | arr1.append(nn_channel_size_pooling_values) 169 | arr2.append("nn_channel_size_pooling") 170 | arr1.append(nn_padding_pooling_values) 171 | arr2.append("nn_padding_pooling") 172 | arr1.append(nn_stride_pooling_values) 173 | 
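# (arr1 and arr2 are parallel lists built in lockstep: arr1 holds each
# parameter's values and arr2 the matching name, so index i in one corresponds
# to the same parameter in the other when the pair is consumed downstream.)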
arr2.append("nn_stride_pooling") 174 | arr1.append(nn_in_number_pooling_values) 175 | arr2.append("nn_in_number_pooling") 176 | arr1.append(nn_in_number_fc_values) 177 | arr2.append("nn_in_number_fc") 178 | arr1.append(nn_in_data_size_fc_values) 179 | arr2.append("nn_in_data_size_fc") 180 | arr1.append(nn_channel_size_fc_values) 181 | arr2.append("nn_channel_size_fc") 182 | arr1.append(nn_out_number_fc_values) 183 | arr2.append("nn_out_number_fc") 184 | arr1.append(nn_in_number_batch_norm_values) 185 | arr2.append("nn_in_number_batch_norm") 186 | arr1.append(nn_in_number_scale_values) 187 | arr2.append("nn_in_number_scale") 188 | arr1.append(nn_in_number_eltwise_values) 189 | arr2.append("nn_in_number_eltwise") 190 | arr1.append(nn_input_size_eltwise_values) 191 | arr2.append("nn_input_size_eltwise") 192 | arr1.append(nn_in_number_concat_values) 193 | arr2.append("nn_in_number_concat") 194 | arr1.append(nn_input_size_concat_values) 195 | arr2.append("nn_input_size_concat") 196 | arr1.append(conv_cut_flag_values) 197 | arr2.append("conv_cut_flag") 198 | arr1.append(pool_cut_flag_values) 199 | arr2.append("pool_cut_flag") 200 | arr1.append(fc_cut_flag_values) 201 | arr2.append("fc_cut_flag") 202 | 203 | val = str(int(nn_in_number_conv_values[0])) + " * " + \ 204 | str(int(nn_in_data_size_conv_values[0])) + " * " + \ 205 | str(int(nn_in_data_size_conv_values[0])) 206 | arr1.append(val) 207 | arr2.append("in_data_mem_size") 208 | 209 | val = "" 210 | for i in range(len(nn_in_number_conv_values)): 211 | val += str(int(nn_in_number_conv_values[i]) * int(nn_out_number_conv_values[i]) /\ 212 | int(nn_group_conv_values[i])*int(nn_channel_size_conv_values[i])*\ 213 | int(nn_channel_size_conv_values[i])) 214 | if (i+1) != len(nn_in_number_conv_values): 215 | val += " + " 216 | arr1.append(val) 217 | arr2.append("conv_weight_size") 218 | 219 | val = "" 220 | if nn_bias_conv_values != 0: 221 | for i, v in enumerate(nn_bias_conv_values): 222 | val += v 223 | if (i+1) != len(nn_bias_conv_values): 224 | val += " + " 225 | arr1.append(val) 226 | arr2.append("conv_bias_size") 227 | 228 | val = "" 229 | if nn_in_number_fc_values != 0: 230 | for j in range(len(nn_in_number_fc_values)): 231 | val += str(int(nn_in_number_fc_values[j])*int(nn_out_number_fc_values[j])*int(nn_channel_size_fc_values[j])*int(nn_channel_size_fc_values[j])) 232 | if (j+1) != len(nn_in_number_fc_values): 233 | val += " + " 234 | arr1.append(val) 235 | arr2.append("fc_weight_size") 236 | 237 | val = "" 238 | if nn_out_number_fc_values != 0: 239 | for i, out in enumerate(nn_out_number_fc_values): 240 | val += out 241 | if (i+1) != len(nn_out_number_fc_values): 242 | val += " + " 243 | arr1.append(val) 244 | arr2.append("fc_bias_size") 245 | 246 | val = "" 247 | if nn_out_number_fc_values != 0: 248 | arr1.append(str(nn_out_number_fc_values[len(nn_out_number_fc_values)-1])) 249 | arr2.append("fc_out_size") 250 | else: 251 | arr1.append(str(nn_in_number_pooling_values[len(nn_in_number_pooling_values)-1])) 252 | arr2.append("out_size") 253 | 254 | val = "" 255 | if nn_in_number_batch_norm_values != 0: 256 | for i, v in enumerate(nn_in_number_batch_norm_values): 257 | val += v 258 | if (i+1) != len(nn_in_number_batch_norm_values): 259 | val += " + " 260 | arr1.append(val) 261 | arr2.append("nn_batch_norm_size") 262 | 263 | val = "" 264 | if nn_in_number_scale_values != 0: 265 | for i, v in enumerate(nn_in_number_scale_values): 266 | val += v 267 | if (i+1) != len(nn_in_number_scale_values): 268 | val += " + " 269 | arr1.append(val) 270 | 
arr2.append("nn_scale_size") 271 | 272 | val = "" 273 | if nn_in_number_eltwise_values != 0: 274 | arr1.append(val) 275 | arr2.append("nn_in_number_eltwise_size") 276 | 277 | val = "" 278 | if nn_input_size_eltwise_values != 0: 279 | arr1.append(val) 280 | arr2.append("nn_input_size_eltwise_size") 281 | 282 | val = "" 283 | if nn_in_number_concat_values != 0: 284 | arr1.append(val) 285 | arr2.append("nn_in_number_concat_size") 286 | 287 | val = "" 288 | if nn_input_size_concat_values != 0: 289 | arr1.append(val) 290 | arr2.append("nn_input_size_concat_size") 291 | 292 | maximum = [] 293 | for l in range(len(nn_in_data_size_conv_values)): 294 | out = (int(nn_in_data_size_conv_values[l]) + int(nn_padding_conv_values[l]) * 2 -\ 295 | int(nn_channel_size_conv_values[l]))/int(nn_stride_conv_values[l]) + 1; 296 | val = out * out * int(nn_out_number_conv_values[l]) 297 | maximum.append(val) 298 | for l1 in range(len(nn_in_data_size_pooling_values)): 299 | out = (int(nn_in_data_size_pooling_values[l1]) + int(nn_padding_pooling_values[l1])*2 -\ 300 | int(nn_channel_size_pooling_values[l1]))/int(nn_stride_pooling_values[l1]) + 1 301 | val = int(out) * int(out) * int(nn_in_number_pooling_values[l1]) 302 | maximum.append(val) 303 | if nn_in_number_fc_values != 0: 304 | for l2 in range(len(nn_in_number_fc_values)): 305 | val = int(nn_in_number_fc_values[l2]) * int(nn_channel_size_fc_values[l2]) *\ 306 | int(nn_channel_size_fc_values[l2]) 307 | maximum.append(val) 308 | maxim = max(maximum) 309 | arr1.append(str(maxim)) 310 | arr2.append("maximum") 311 | 312 | return arr1, arr2 313 | 314 | 315 | -------------------------------------------------------------------------------- /acc_runtime/local_acc/demos/convTest/runtime.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "acc_config.h" 10 | #include "acc_ctrl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | void fc_software(float* feature_in,float* feature_out,float* weight,float* bias,int num_input,int num_output); 21 | 22 | void software_validate_conv(int num_input, 23 | int num_output, 24 | int kernel_size, 25 | int stride, 26 | int padding, 27 | int feature_in_size, 28 | int feature_out_size, 29 | short int feature_in[][32], 30 | short int feature_out[][32], 31 | short int weight[][32], 32 | int* bias 33 | ); 34 | void software_validate_pooling(int num_input, 35 | int feature_in_size, 36 | int feature_out_size, 37 | short int feature_in[][32], 38 | short int feature_out[][32]); 39 | 40 | void input_feature_ready(short int input_feature[][32]); 41 | void weight_ready(short int weight[][32]); 42 | void para_ready(int para[][512]); 43 | 44 | short int temp_array[20480][32] = {0}; 45 | int main() 46 | { 47 | short int input_feature[16*16][32] = {0}; 48 | short int output_feature[16*16][32] = {0}; 49 | short int weight[200][32] = {0}; 50 | int bias[512] = {0}; 51 | int para[3][512] = {0}; 52 | 53 | //1.input feature ready 54 | input_feature_ready(input_feature); 55 | //2.weight ready 56 | weight_ready(weight); 57 | //3.para ready 58 | para_ready(para); 59 | 60 | acc_ctrl sub_net0(ACC0_PARA_OFFSET, //para_offset_addr 61 | ACC0_WEIGHT_OFFSET, //weight_offset_addr 62 | ACC0_DATA_IN_OFFSET, //data_in_offset_addr 63 | ACC0_CTRL_OFFSET //ctrl_addr 64 | ); 65 | 66 | acc_ctrl sub_net1(ACC1_PARA_OFFSET, //para_offset_addr 67 | ACC1_WEIGHT_OFFSET, 
//weight_offset_addr 68 | ACC1_DATA_IN_OFFSET, //data_in_offset_addr 69 | ACC1_CTRL_OFFSET //ctrl_addr 70 | ); 71 | 72 | acc_ctrl sub_net2(ACC2_PARA_OFFSET, //para_offset_addr 73 | ACC2_WEIGHT_OFFSET, //weight_offset_addr 74 | ACC2_DATA_IN_OFFSET, //data_in_offset_addr 75 | ACC2_CTRL_OFFSET //ctrl_addr 76 | ); 77 | 78 | sub_net0.write_para(para[0],512*4); 79 | sub_net1.write_para(para[1],512*4); 80 | sub_net2.write_para(para[2],512*4); 81 | 82 | sub_net0.write_weight(weight,200*64); 83 | sub_net1.write_weight(weight,200*64); 84 | sub_net2.write_weight(weight,200*64); 85 | 86 | sub_net0.write_data(input_feature,16*16*64); 87 | 88 | sub_net0.start_process(0); 89 | sub_net0.start_process(1); 90 | sub_net1.start_process(0); 91 | sub_net1.start_process(1); 92 | sub_net2.start_process(0); 93 | sub_net2.start_process(1); 94 | 95 | sub_net2.read_data(output_feature,16*16*64); 96 | 97 | int i,j,k; 98 | for(i=0;i<16;i++) 99 | { 100 | for(j=0;j<16;j++) 101 | cout << setw(10)<< output_feature[i*16+j][0]/64.0 <<" "; 102 | cout << endl; 103 | } 104 | cout <<"test finish"<< endl; 105 | return 0; 106 | } 107 | 108 | void input_feature_ready(short int input_feature[][32]) 109 | { 110 | int i,j; 111 | for(i=0;i<16*16;i++) 112 | { 113 | for(j=0;j<1;j++) 114 | input_feature[i][j] = 1<<6; 115 | for(j=1;j<32;j++) 116 | input_feature[i][j] = 0<<6; 117 | } 118 | } 119 | 120 | void weight_ready(short int weight[][32]) 121 | { 122 | int i; 123 | for(i=0;i<20;i++) 124 | { 125 | weight[i][0] = 16; 126 | } 127 | weight[20][0] = 0; 128 | weight[21][0] = 0; 129 | } 130 | 131 | void para_ready(int para[][512]) 132 | { 133 | int i; 134 | 135 | //===========================subnet0 136 | //0-1.layer_num 137 | para[0][0] = 1; 138 | //0-2.conv para 139 | para[0][16+0] = 1;//N 140 | para[0][16+1] = 3;//K 141 | para[0][16+2] = 1;//M 142 | para[0][16+3] = 16;//Rin 143 | para[0][16+4] = 16;//Cin 144 | para[0][16+5] = 16;//R 145 | para[0][16+6] = 16;//C 146 | para[0][16+7] = 1;//S 147 | para[0][16+8] = 1;//P 148 | para[0][16+9] = 1;//act 149 | para[0][16+10] = 0;//weight_offset 150 | para[0][16+11] = 20;//bias_offset 151 | para[0][16+12] = 0;//in_offset 152 | para[0][16+13] = 0;//out_offset 153 | para[0][16+14] = 0;//inport 154 | para[0][16+15] = 0; 155 | 156 | //1-1.layer_num 157 | para[0][256+0] = 1; 158 | //1-2.conv_para 159 | para[0][256+16+0] = 1; //N 160 | para[0][256+16+1] = 3; //K 161 | para[0][256+16+2] = 1; //M 162 | para[0][256+16+3] = 16; //Rin 163 | para[0][256+16+4] = 16; //Cin 164 | para[0][256+16+5] = 16; //R 165 | para[0][256+16+6] = 16; //C 166 | para[0][256+16+7] = 1; //S 167 | para[0][256+16+8] = 1; //P 168 | para[0][256+16+9] = 1; //act 169 | para[0][256+16+10] = 0; //weight_offset 170 | para[0][256+16+11] = 20; //bias_offset 171 | para[0][256+16+12] = 0; //in_offset 172 | para[0][256+16+13] = 0; //out_offset 173 | para[0][256+16+14] = 0; //inport 174 | para[0][256+16+15] = 0; 175 | 176 | 177 | //===========================subnet1 178 | //0-1.layer_num 179 | para[1][0] = 1; 180 | //0-2.conv para 181 | para[1][16+0] = 1;//N 182 | para[1][16+1] = 3;//K 183 | para[1][16+2] = 1;//M 184 | para[1][16+3] = 16;//Rin 185 | para[1][16+4] = 16;//Cin 186 | para[1][16+5] = 16;//R 187 | para[1][16+6] = 16;//C 188 | para[1][16+7] = 1;//S 189 | para[1][16+8] = 1;//P 190 | para[1][16+9] = 1;//act 191 | para[1][16+10] = 0;//weight_offset 192 | para[1][16+11] = 20;//bias_offset 193 | para[1][16+12] = 0;//in_offset 194 | para[1][16+13] = 0;//out_offset 195 | para[1][16+14] = 0;//inport 196 | para[1][16+15] = 0; 197 | 198 | 
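// (Layout note, inferred from the writes above and below: each accelerator
//  consumes one 512-int parameter page. Word 0 carries the layer count, words
//  16..31 carry the 16 conv parameters of the first layer (N, K, M, Rin, Cin,
//  R, C, S, P, act, weight/bias/in/out offsets, inport), and a second copy of
//  the same structure starts at word 256 -- apparently one block per
//  start_process() call issued in main().)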
//1-1.layer_num 199 | para[1][256+0] = 1; 200 | //1-2.conv_para 201 | para[1][256+16+0] = 1; //N 202 | para[1][256+16+1] = 3; //K 203 | para[1][256+16+2] = 1; //M 204 | para[1][256+16+3] = 16; //Rin 205 | para[1][256+16+4] = 16; //Cin 206 | para[1][256+16+5] = 16; //R 207 | para[1][256+16+6] = 16; //C 208 | para[1][256+16+7] = 1; //S 209 | para[1][256+16+8] = 1; //P 210 | para[1][256+16+9] = 1; //act 211 | para[1][256+16+10] = 0; //weight_offset 212 | para[1][256+16+11] = 20; //bias_offset 213 | para[1][256+16+12] = 0; //in_offset 214 | para[1][256+16+13] = 0; //out_offset 215 | para[1][256+16+14] = 0; //inport 216 | para[1][256+16+15] = 0; 217 | 218 | //===========================subnet2 219 | //0-1.layer_num 220 | para[2][0] = 1; 221 | //0-2.conv para 222 | para[2][16+0] = 1;//N 223 | para[2][16+1] = 3;//K 224 | para[2][16+2] = 1;//M 225 | para[2][16+3] = 16;//Rin 226 | para[2][16+4] = 16;//Cin 227 | para[2][16+5] = 16;//R 228 | para[2][16+6] = 16;//C 229 | para[2][16+7] = 1;//S 230 | para[2][16+8] = 1;//P 231 | para[2][16+9] = 1;//act 232 | para[2][16+10] = 0;//weight_offset 233 | para[2][16+11] = 20;//bias_offset 234 | para[2][16+12] = 0;//in_offset 235 | para[2][16+13] = 0;//out_offset 236 | para[2][16+14] = 0;//inport 237 | para[2][16+15] = 0; 238 | 239 | //1-1.layer_num 240 | para[2][256+0] = 1; 241 | //1-2.conv_para 242 | para[2][256+16+0] = 1; //N 243 | para[2][256+16+1] = 3; //K 244 | para[2][256+16+2] = 1; //M 245 | para[2][256+16+3] = 16; //Rin 246 | para[2][256+16+4] = 16; //Cin 247 | para[2][256+16+5] = 16; //R 248 | para[2][256+16+6] = 16; //C 249 | para[2][256+16+7] = 1; //S 250 | para[2][256+16+8] = 1; //P 251 | para[2][256+16+9] = 1; //act 252 | para[2][256+16+10] = 0; //weight_offset 253 | para[2][256+16+11] = 20; //bias_offset 254 | para[2][256+16+12] = 0; //in_offset 255 | para[2][256+16+13] = 0; //out_offset 256 | para[2][256+16+14] = 0; //inport 257 | para[2][256+16+15] = 0; 258 | } 259 | 260 | void software_validate_conv(int num_input, 261 | int num_output, 262 | int kernel_size, 263 | int stride, 264 | int padding, 265 | int feature_in_size, 266 | int feature_out_size, 267 | short int feature_in[][32], 268 | short int feature_out[][32], 269 | short int weight[][32], 270 | int* bias 271 | ) 272 | { 273 | int i,j,k,x,y,z; 274 | int temp; 275 | short int* temp_array = new short int[(feature_in_size+2*padding) * (feature_in_size+2*padding) * num_input]; 276 | 277 | for(k = 0 ; k < num_input; k++) 278 | for(i = 0 ; i < feature_in_size+2*padding; i++) 279 | for(j = 0 ; j < feature_in_size+2*padding ; j++) 280 | temp_array[ k * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + i * (feature_in_size+2*padding ) + j] = 0; 281 | 282 | for(j = 0 ; j < num_input; j++) 283 | for(x = padding; x < feature_in_size + padding; x++) 284 | for(y = padding; y < feature_in_size + padding; y++) 285 | temp_array[ j * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + x * (feature_in_size+2*padding ) + y] 286 | = feature_in[(j/32) * feature_in_size * feature_in_size +(x-padding)*feature_in_size + (y-padding)][j%32]; 287 | //input_feature[(j/32) * feature_in_size * feature_in_size +(x-padding)*feature_in_size + (y-padding)].range(16*(j%32)+15,16*(j%32)); 288 | 289 | cout <<"software processing..." 
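// (Annotation for this reference model: features are packed as
//  [spatial_index][32] with channel j stored at row j/32, lane j%32, and each
//  weight product is divided by 64 to undo the 1<<6 fixed-point scaling that
//  input_feature_ready() applies.)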
<< endl; 290 | for(i = 0 ; i < num_output; i++) 291 | { 292 | for(x = 0 ; x < feature_in_size-kernel_size+1+2*padding; x+=stride) 293 | for(y = 0; y < feature_in_size-kernel_size+1+2*padding;y+=stride) 294 | { 295 | //temp = bias[i].range(15,0); 296 | //temp = 0; 297 | temp = bias[i]; 298 | for(j = 0 ; j < num_input; j++) 299 | { 300 | for(k = 0 ; k < kernel_size;k++) 301 | { 302 | for(z = 0 ; z < kernel_size; z++) 303 | { 304 | 305 | temp += (int)(temp_array[j * (feature_in_size+2*padding ) * (feature_in_size+2*padding ) + x * (feature_in_size+2*padding) + y + (k*(feature_in_size+2*padding) + z)]) 306 | *(int)(weight[(i/32)*num_input*kernel_size*kernel_size +j*kernel_size*kernel_size + k*kernel_size + z][i%32])/64; 307 | } 308 | } 309 | } 310 | feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32]= (temp < 0) ? 0 : temp; 311 | // if( feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32] <0) 312 | // { 313 | // feature_out[(i/32) *feature_out_size * feature_out_size + (x/stride) * feature_out_size + y/stride][i%32] = 0; 314 | // //cout <<"!!"<< i << ":" < temp_max) 346 | temp_max = temp_array[z]; 347 | 348 | feature_out[i/32*feature_out_size*feature_out_size + (j/2) * feature_out_size + k/2][i%32] = temp_max; 349 | } 350 | } 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /netGenerator/netGen/generate_accInst.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def conv_default_function(idx, parameters): 4 | 5 | creator = "conv_acc<"+",".join(parameters)+"> convAcc" + str(idx) +";" 6 | func = '''void conv_layer_acc_'''+str(idx)+'''( 7 | Tparam N, 8 | Tparam K, 9 | Tparam M, 10 | Tparam R_IN, 11 | Tparam C_IN, 12 | Tparam C_OUT, 13 | Tparam R_OUT, 14 | Tparam S, 15 | Tparam P, 16 | Tparam act, 17 | Tparam inport, 18 | Tparam weight_offset, 19 | Tparam bias_offset, 20 | Tparam in_offset, 21 | Tparam out_offset, 22 | //ap_fixed<32,26>* layer_bias, 23 | data_type_itf* i_weight, 24 | data_type_itf* i_data, 25 | data_type_itf* out_data 26 | ){ 27 | convAcc'''+str(idx)+'''.conv_layer_acc_mbuf(N, K, M, R_IN, C_IN, C_OUT, R_OUT, S, P, act, inport, weight_offset, bias_offset, in_offset, out_offset, i_weight, i_data, out_data); 28 | };''' 29 | return creator, func 30 | 31 | def max_pool_default_function(idx, parameters): 32 | creator = "max_pool_acc<"+",".join(parameters)+"> maxPoolAcc"+str(idx)+";" 33 | func = '''void max_pool_layer_acc_'''+str(idx)+'''( 34 | Tparam R_in, 35 | Tparam C_in, 36 | Tparam N, 37 | Tparam K, 38 | Tparam R, 39 | Tparam C, 40 | Tparam S, 41 | Tparam P, 42 | Tparam act, 43 | Tparam i_offset, 44 | Tparam o_offset, 45 | data_type_itf* i_data, 46 | data_type_itf* o_data){ 47 | maxPoolAcc'''+str(idx)+'''.max_pool_layer_mbuf(R_in, C_in, N, K, R, C, S, P, act, i_offset, o_offset, i_data, o_data); 48 | };''' 49 | return creator, func 50 | 51 | def fc_default_function(idx, parameters): 52 | creator = "fc_acc<"+",".join(parameters)+"> fcAcc"+str(idx)+";" 53 | 54 | func = '''void fc_layer_acc_'''+str(idx)+'''( 55 | Tparam N, 56 | Tparam K, 57 | Tparam M, 58 | Tparam R_IN, 59 | Tparam C_IN, 60 | Tparam C_OUT, 61 | Tparam R_OUT, 62 | Tparam S, 63 | Tparam P, 64 | Tparam act, 65 | Tparam weight_offset, 66 | Tparam bias_offset, 67 | Tparam in_offset, 68 | Tparam out_offset, 69 | data_type_itf* layer_bias, 70 | data_type_itf* i_weight, 71 | data_type_itf* i_data, 72 | 
data_type_itf* out_data 73 | ){ 74 | fcAcc'''+str(idx)+'''.fc_layer_acc_mbuf(N, M, R_IN, C_IN, K, act, 75 | i_weight, layer_bias, 76 | weight_offset, bias_offset, in_offset, out_offset, 77 | i_data, out_data); 78 | };''' 79 | return creator, func 80 | 81 | 82 | def conv_pool_default_func(parameters): 83 | if len(parameters) == 3: 84 | idx1, idx2, idx3 = parameters 85 | idx1 = idx1.strip() 86 | idx2 = idx2.strip() 87 | idx3 = idx3.strip() 88 | print("conv pool generation parameter list len(): ", len(parameters)) 89 | creator, func = conv_pool(idx1, idx2, idx3) 90 | elif len(parameters) == 2: 91 | idx1, idx2 = parameters 92 | idx1 = idx1.strip() 93 | idx2 = idx2.strip() 94 | print("conv pool generation parameter list len(): ", len(parameters)) 95 | creator, func = single_conv(idx1, idx2) 96 | 97 | return creator, func 98 | 99 | 100 | def conv_pool(idx1, idx2, idx3): 101 | func = '''void conv_pool_acc_'''+str(idx1)+'''( 102 | Tparam* param_port, 103 | //ap_fixed<32,26>* bias_in, 104 | data_type_itf* weight_in, 105 | data_type_itf* data_in, 106 | data_type_itf* data_out 107 | ) 108 | { 109 | Tparam layer_num_local[16]; 110 | Tparam param_conv_local[16]; 111 | Tparam param_pool_local[16]; 112 | 113 | for (unsigned int ll = 0; ll < 16; ll++) 114 | { 115 | layer_num_local[ll] = param_port[ll]; 116 | } 117 | 118 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 119 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 176 | //ap_fixed<32,26>* bias_in, 177 | data_type_itf* weight_in, 178 | data_type_itf* data_in, 179 | data_type_itf* data_out 180 | ) 181 | { 182 | Tparam layer_num_local[16]; 183 | Tparam param_conv_local[16]; 184 | Tparam param_pool_local[16]; 185 | 186 | for (unsigned int ll = 0; ll < 16; ll++) 187 | { 188 | layer_num_local[ll] = param_port[ll]; 189 | } 190 | 191 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..." << endl; 192 | cout << "LAYER ACC: CONV will process "<< layer_num_local[0] << " layers" << endl; 237 | //ap_fixed<32,26>* bias_in, 238 | data_type_itf* weight_in, 239 | data_type_itf* data_in, 240 | data_type_itf* data_out 241 | ) 242 | { 243 | Tparam layer_num_local[16]; 244 | Tparam param_conv_local[16]; 245 | Tparam param_pool_local[16]; 246 | 247 | for (unsigned int ll = 0; ll < 16; ll++) 248 | { 249 | layer_num_local[ll] = param_port[ll]; 250 | } 251 | 252 | cout << "LAYER ACC: CONV Loading layer number for current accelerator ..."
<< endl; 253 | cout << "LAYRE ACC: CONV will process "<< layer_num_local[0] << " layers" < 5 | #include 6 | #include "hls_stream.h" 7 | #include "activation_functions.h" 8 | 9 | using namespace std; 10 | 11 | template 12 | class conv_acc { 13 | 14 | private: 15 | int conv_layer_number; 16 | 17 | public: 18 | conv_acc() : conv_layer_number(0) { conv_layer_number = 0; }; 19 | 20 | ////------------------------------C++ debugging functions---------------------------------------//// 21 | // Reset output buffer 22 | void out_buf_reset(G buf[][Tr][Tc]) { 23 | for (int i = 0; i < Tm; i++) { 24 | for (int j = 0; j < Tr; j++) { 25 | for (int k = 0; k < Tc; k++) { 26 | buf[i][j][k] = G(0); 27 | } 28 | } 29 | } 30 | } 31 | // Reset weight buffer 32 | void w_buf_reset(int K, W buf[][Tm][K_max][K_max]) { 33 | for (int i = 0; i < Tn; i++) { 34 | for (int j = 0; j < Tm; j++) { 35 | for (int k = 0; k < K; k++) { 36 | for (int l = 0; l < K; l++) { 37 | buf[i][j][k][l] = W(0); 38 | } 39 | } 40 | } 41 | } 42 | } 43 | // Reset bias buffer 44 | void b_buf_reset(W buf[]) { 45 | for (int i = 0; i < Tm; i++) { 46 | buf[i] = W(0); 47 | } 48 | } 49 | 50 | ////-----------------------------Accelerator Functions---------------------------------------//// 51 | // Load bias data 52 | void b_buf_load(W buf[], ap_fixed<32,26> *layer_bias, int bias_offset, int m) { 53 | for (int i = 0; i < Tm; i++) { 54 | buf[i].range(15,0) = (*(layer_bias + bias_offset + i + m)).range(15,0); 55 | // cout << "Read bias location: " << bias_offset + i + m << " Read bias data: " << buf[i] << endl; 56 | } 57 | } 58 | // Tn << 32 && N << 32 59 | void in_buf_load_axi( 60 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 61 | Itf* i_data, 62 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 63 | Itf data_tmp = 0; 64 | // valid data portion 65 | for (int j = r * S - P; j < (r + Tr - 1) * S + K - P ; j++) {// 66 | for (int k = c * S - P; k < (c + Tc -1) * S + K - P; k++) { 67 | #pragma HLS PIPELINE 68 | for (int i = 0; i < Tn; i += Tn) { 69 | #pragma HLS UNROLL 70 | if ((i + n >= N) || j < 0 || j >= R_IN || k < 0 || k >= C_IN) { 71 | for (int wr = 0; wr < Tn; wr++) { 72 | #pragma HLS UNROLL 73 | buf[wr][j - r * S + P][k - c * S + P] = T(0); 74 | } 75 | } else { 76 | data_tmp = *(i_data + in_offset + (i + n)/32 * R_IN * C_IN + j * R_IN + k); 77 | for (int wr = 0; wr < Tn; wr++) { 78 | #pragma HLS UNROLL 79 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n + wr)%32) * 16); 80 | } 81 | } 82 | } 83 | } 84 | } 85 | } 86 | void in_buf_load_bram( 87 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 88 | Itf* i_data, 89 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 90 | Itf data_tmp = 0; 91 | // valid data portion 92 | for (int j = r * S - P; j < (r + Tr - 1) * S + K - P ; j++) {// 93 | for (int k = c * S - P; k < (c + Tc -1) * S + K - P; k++) { 94 | #pragma HLS PIPELINE 95 | for (int i = 0; i < Tn; i += Tn) { 96 | #pragma HLS UNROLL 97 | if ((i + n >= N) || j < 0 || j >= R_IN || k < 0 || k >= C_IN) { 98 | for (int wr = 0; wr < Tn; wr++) { 99 | #pragma HLS UNROLL 100 | buf[wr][j - r * S + P][k - c * S + P] = T(0); 101 | } 102 | } else { 103 | data_tmp = *(i_data + in_offset + (i + n)/32 * R_IN * C_IN + j * R_IN + k); 104 | for (int wr = 0; wr < Tn; wr++) { 105 | #pragma HLS UNROLL 106 | buf[wr][j - r * S + P][k - c * S + P].range(15,0) = data_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n + wr)%32) * 16); 
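// (Annotation: each Itf word packs 32 16-bit fixed-point channels, so the
//  .range() slice above extracts the 16 bits belonging to channel (n+wr)%32
//  from data_tmp -- the same unpacking in_buf_load_axi performs, only reading
//  from the BRAM-side out_data pointer instead of the AXI input.)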
107 | } 108 | } 109 | } 110 | } 111 | } 112 | } 113 | 114 | void in_buf_load( 115 | bool inport, 116 | T buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], 117 | Itf* i_data, 118 | Itf* out_data, 119 | int in_offset, int n, int r, int c, int S, int K, int P, int R_IN, int C_IN, int N ) { 120 | 121 | if(inport == 0) { 122 | in_buf_load_axi(buf, i_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 123 | cout << "input data with i_data!" << endl; 124 | } else { 125 | in_buf_load_bram(buf, out_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 126 | cout << "input data with out_data!" << endl; 127 | } 128 | } 129 | 130 | // Load weight squeezed in the N dimension 131 | void w_buf_load_512(W buf[][Tm][K_max][K_max], 132 | Itf *layer_weights, 133 | int weight_offset, 134 | int n, int m, int K, int N, int M) 135 | { 136 | Itf w_tmp = 0; 137 | for (int k1 = 0; k1 < K; k1++) { 138 | for (int k2 = 0; k2 < K; k2++) { 139 | #pragma HLS PIPELINE 140 | for (int i = 0; i < Tm; i++) { // Tm greater than 32 141 | for (int j = 0; j < Tn; j += Tn) { // Tn smaller than 32 142 | #pragma HLS UNROLL 143 | w_tmp = *(layer_weights + weight_offset + ((j + n)/32)* M * K * K + (i + m) * K * K + k1*K + k2); 144 | for (int wr = 0; wr < Tn; wr++) { 145 | #pragma HLS UNROLL 146 | buf[wr][i][k1][k2].range(15,0) = w_tmp.range(((n + wr)%32 + 1) * 16 - 1, ((n+wr)%32) * 16); 147 | } 148 | } 149 | } 150 | } 151 | } 152 | } 153 | 154 | // Load weight squeezed in the M dimension 155 | void w_buf_load_512_tm(W buf[][Tm][K_max][K_max], 156 | Itf *layer_weights, 157 | int weight_offset, 158 | int n, int m, int K, int N, int M) 159 | { 160 | Itf w_tmp = 0; 161 | for (int k1 = 0; k1 < K; k1++) { 162 | for (int k2 = 0; k2 < K; k2++) { 163 | for (int j = 0; j < Tn; j++) { // Tn smaller than 32 164 | #pragma HLS PIPELINE 165 | for (int i = 0; i < Tm; i+=32) { // Tm greater than 32 166 | w_tmp = *(layer_weights + weight_offset + ((j + n)/32)* M * K * K + (i + m) * K * K + k1*K + k2); 167 | for (int wr = 0; wr < 32; wr++) { 168 | #pragma HLS UNROLL 169 | buf[j][i+wr][k1][k2].range(15,0) = w_tmp.range((wr%32 + 1) * 16 - 1, ((wr)%32) * 16); 170 | } 171 | } 172 | } 173 | } 174 | } 175 | } 176 | 177 | // Convolution computation kernel Tm, Tn based 178 | void conv_engine(T in_buf[][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max], W w_buf[][Tm][K_max][K_max], 179 | W b_buf[], G out_buf[][Tr][Tc], int S, int n, int N, int r, int c, int K, int R_OUT, int C_OUT, 180 | int w_offset, int i_offset) { 181 | if (n >= 0 && n - Tn < N) { 182 | for (int i = 0; i < K; i++) { 183 | for (int j = 0; j < K; j++) { 184 | for (int tr = 0; tr < Tr && tr < R_OUT - r; tr++) { 185 | for (int tc = 0; tc < Tc; tc++) { 186 | #pragma HLS PIPELINE 187 | for (int tm = 0; tm < Tm; tm++) { 188 | #pragma HLS UNROLL 189 | for (int tn = 0; tn < Tn; tn++) { 190 | #pragma HLS UNROLL 191 | if (i == 0 && j == 0 && tn == 0 && n == 0) 192 | out_buf[tm][tr][tc] = b_buf[tm] + w_buf[tn][tm][i + w_offset][j] * 193 | in_buf[tn][S * (tr) + i + i_offset][S * (tc) + 194 | j]; 195 | else 196 | out_buf[tm][tr][tc] = out_buf[tm][tr][tc] + w_buf[tn][tm][i + w_offset][j] * 197 | in_buf[tn][S * (tr) + i + i_offset][ 198 | S * (tc) + j]; 199 | } 200 | } 201 | } 202 | } 203 | } 204 | } 205 | } 206 | } 207 | 208 | // Ouput out_buf data to output interface 209 | void output_res_512(ap_fixed<16,10> out_buf[][Tr][Tc], 210 | ap_int<512>* out_data, 211 | int out_offset, 212 | int n, int m, int r, int c, int N, int M, 213 | int R_OUT, int C_OUT, bool act) 214 | { 215 | ap_int<512> out_tmp = 0; 
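// (Note on this routine: results are repacked 32 lanes at a time into one
//  512-bit word before the single wide store. RELU() is applied to every lane
//  in the visible code, so the 'act' argument appears to be accepted here but
//  never consulted.)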
216 | ap_fixed<16,10> tmp = 0; 217 | ap_fixed<16,10> tmp_outbuf = 0; 218 | if (n >= N - Tn) 219 | { 220 | for (int j = r; (j < r + Tr) && (j < R_OUT); j++) 221 | { 222 | for (int k = c; (k < c + Tc) && (k < C_OUT); k++) 223 | { 224 | for (int wr = 0; wr < Tm && wr < M; wr += 32) // Tm should always greater than 32, otherwise this will not work 225 | { 226 | #pragma HLS PIPELINE 227 | for (int wr_d = 0; wr_d < 32; wr_d++) 228 | { 229 | #pragma HLS UNROLL 230 | if(m + wr + wr_d < M) 231 | { 232 | tmp_outbuf = RELU(out_buf[wr + wr_d][j - r][k - c]); 233 | tmp.range(15, 0) = tmp_outbuf.range(15, 0); 234 | } 235 | else 236 | { 237 | tmp.range(15,0) = 0; 238 | } 239 | out_tmp.range(16 * (wr_d + 1) - 1, 16 * (wr_d)) = tmp.range(15,0); 240 | } 241 | *(out_data + out_offset + ((m / Tm) + (wr / 32)) * R_OUT * C_OUT + j * C_OUT + k) = out_tmp; 242 | } 243 | } 244 | } 245 | } 246 | } 247 | // + int(wr / 32) 248 | ///////////////////////------------------conv accelerator----------------////////////////////////// 249 | #if _LAYER_MODE_ // layer function with cast port 250 | void conv_layer_acc_mbuf( 251 | int N, //input feature number 252 | int K, //input kernel size 253 | int M, // output feature number 254 | int R_IN, // input Row 255 | int C_IN, // input column 256 | int R_OUT, // output Row 257 | int C_OUT, // output column 258 | int S, // stride size 259 | int P, // padding size 260 | bool act, // activation function bit (1-- with act, 0--without act) 261 | bool inport, 262 | int weight_offset, 263 | int bias_offset, 264 | int in_offset, 265 | int out_offset, 266 | ap_fixed<32,26> *layer_bias, 267 | Itf *i_weight, 268 | Itf *i_data, 269 | Itf *out_data ) { // out[M][R][C] 270 | 271 | /***************local data buffer groups******************************/ 272 | T in_buf_0[Tn][(Tr - 1) * S_max + K_max][(Tc - 1) * S_max + K_max]; 273 | W w_buf_0[Tn][Tm][K_max][K_max]; 274 | W b_buf_0[Tm]; 275 | G out_buf_0[Tm][Tr][Tc]; 276 | 277 | #pragma HLS ARRAY_PARTITION variable = in_buf_0 complete dim = 1 278 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 1 279 | #pragma HLS ARRAY_PARTITION variable = w_buf_0 complete dim = 2 280 | #pragma HLS ARRAY_PARTITION variable = b_buf_0 complete 281 | #pragma HLS ARRAY_PARTITION variable = out_buf_0 complete dim = 1 282 | 283 | //--------------------------Initial data load ---------------------------------------------// 284 | for (int r = 0; r < R_OUT; r += Tr) 285 | { 286 | for (int c = 0; c < C_OUT; c += Tc) 287 | { 288 | for (int m = 0; m < M; m += Tm) 289 | { 290 | for (int n = 0; n < N; n += Tn) 291 | { 292 | 293 | //--------------------------Load input B W D in ping-pong manner-------------------------// 294 | b_buf_load(b_buf_0, layer_bias, bias_offset, m); 295 | 296 | // w_buf_load_512_tm(w_buf_0, i_weight, weight_offset, n, m, K, N, M); 297 | 298 | in_buf_load(inport, in_buf_0, i_data, out_data, in_offset, n, r, c, S, K, P, R_IN, C_IN, N); 299 | 300 | //------------------------------compute buffered data -----------------------------------// 301 | conv_engine(in_buf_0, w_buf_0, b_buf_0, out_buf_0, S, n, N, r, c, K, R_OUT, C_OUT, 0, 0); 302 | 303 | //---------------------------transfer output data----------------------------------------// 304 | output_res_512(out_buf_0,out_data,out_offset, n, m, r, c, N, M, R_OUT, C_OUT, act); 305 | 306 | } 307 | } 308 | } 309 | } 310 | 311 | 312 | }; 313 | 314 | #endif 315 | 316 | }; 317 | 318 | #endif 319 | --------------------------------------------------------------------------------