├── soft_version ├── README.MD ├── Step03 │ ├── README.MD │ ├── main3.c │ └── yolov3.cfg └── Step02 │ ├── README.MD │ ├── main3.cpp │ ├── yolov3.cfg │ └── yolov3_acc_sim.h ├── YOLOV3实验细节.pdf ├── yolov2实验复现报告.pdf ├── yolov3_hls ├── README.md ├── simu1.png ├── sourceFile │ ├── README.md │ ├── cnn.h │ └── yolov3.cpp ├── testBench │ ├── README.md │ ├── dog.jpg │ ├── coco.names │ ├── main.cpp │ └── yolov3.cfg └── files needed.png ├── petalinux ├── README.MD └── command_petalinux ├── yolov3_elf ├── README.MD ├── main.cc └── xconv_hw.h └── README.md /soft_version/README.MD: -------------------------------------------------------------------------------- 1 | 这些文件对应陈辰大佬里的这一步的修改,第一步分离的步骤不动,第二与第三步都进行了修改。 2 | -------------------------------------------------------------------------------- /YOLOV3实验细节.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/YOLOV3实验细节.pdf -------------------------------------------------------------------------------- /soft_version/Step03/README.MD: -------------------------------------------------------------------------------- 1 | 这里是第三步的改动后的代码,基本上改动不大 2 | 3 | 4 | 缺的文件与STEP02相似,用法也和陈辰大佬的代码相似。 5 | -------------------------------------------------------------------------------- /yolov2实验复现报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov2实验复现报告.pdf -------------------------------------------------------------------------------- /yolov3_hls/README.md: -------------------------------------------------------------------------------- 1 | There are two folders: one contains the source files and the other contains the test bench. 
2 | -------------------------------------------------------------------------------- /yolov3_hls/simu1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/simu1.png -------------------------------------------------------------------------------- /yolov3_hls/sourceFile/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the source files used to generate the IP core. 2 | -------------------------------------------------------------------------------- /yolov3_hls/testBench/README.md: -------------------------------------------------------------------------------- 1 | These files are the test bench for that IP core; it runs correctly on my computer. 2 | -------------------------------------------------------------------------------- /yolov3_hls/files needed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/files needed.png -------------------------------------------------------------------------------- /petalinux/README.MD: -------------------------------------------------------------------------------- 1 | petalinux的设置与陈辰大佬提供的有些不同,我使用的全部是默认设置,也就是说存储的东西会掉电消失。因此我使用了nfs系统来弥补因设置问题造成的缺陷。这里是petalinux的命令过程。 2 | -------------------------------------------------------------------------------- /yolov3_hls/testBench/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/testBench/dog.jpg -------------------------------------------------------------------------------- /soft_version/Step02/README.MD: -------------------------------------------------------------------------------- 1 | 这里对应着第二步的具体代码 2 | 3 | 这里还缺少coco.names,dog.jpg,labels文件夹以及第一步生成好的权重和偏置。 4 | 陈辰大佬对文件的命名情况做过改动,所以对权重的名称尽可能仔细核对, 5 | 
具体则相似,通过注释控制生成reorg还是quanti 6 | -------------------------------------------------------------------------------- /yolov3_elf/README.MD: -------------------------------------------------------------------------------- 1 | This code is compiled to generate the .elf file that runs on the FPGA board. 2 | 这块我没有使用SDK,而是使用命令直接编译的,在使vivado的环境变量生效以后,`arm-linux-gnueabihf-g++ -static -O3 * -o myfile.elf`,运行此命令会生成elf文件 3 | -------------------------------------------------------------------------------- /petalinux/command_petalinux: -------------------------------------------------------------------------------- 1 | source /opt/pkg/petalinux/settings.sh 2 | source /opt/Xilinx/Vivado/2017.4/settings64.sh 3 | petalinux-create --type project --template zynq --name hdyolo 4 | cd hdyolo 5 | cp -r ../project_2.sdk ./ 6 | rm -rf ./project-spec/meta-user/recipes-bsp/device-tree/files/system-user.dtsi 7 | cp ../system-user.dtsi ./project-spec/meta-user/recipes-bsp/device-tree/files/ 8 | petalinux-config --get-hw-description ./project_2.sdk/ 9 | petalinux-config -c kernel 10 | petalinux-config -c rootfs 11 | petalinux-build 12 | petalinux-package --boot --fsbl ./images/linux/zynq_fsbl.elf --fpga --u-boot --force 13 | 14 | -------------------------------------------------------------------------------- /yolov3_hls/sourceFile/cnn.h: -------------------------------------------------------------------------------- 1 | #ifndef CNN_H 2 | 3 | #define CNN_H 4 | 5 | 6 | void YOLO2_FPGA(int *Input,int *Input1,int *Input2,int *Input3,int *Output,int *Output1,int *Weight,int *Beta,const int InFM_num,const int OutFM_num, 7 | const int Kernel_size,const int Kernel_stride, 8 | const int Input_w,const int Input_h,const int output_w,const int output_h,const int Padding,const bool IsNL,const bool IsBN, 9 | const int TM,const int TN,const int TR,const int TC, 10 | const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType, 11 | const int InputQ,const int OutputQ,const int WeightQ,const 
int BetaQ,int trow_loops); 12 | 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /yolov3_hls/testBench/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolov3_hls 2 | 这一块的代码是在把上交的陈辰大佬的yolov2代码改成yolov3的时候的一些改动,主要步骤还是参考陈辰大佬的代码。 3 | https://github.com/dhm2013724/yolov2_xilinx_fpga 4 | 但是有一些和他不一样的细节。 5 | 6 | # Tips: 7 | 我目前已经不做这个方向了,有些细节记得不是很清楚了,只能把当时的复现实验报告和做v3时候的细节报告传上来(请忽略掉一些吐槽),请分别查看v2和v3的pdf报告。 8 | 9 | # 在权重处理部分 10 | 在权重处理部分我写了好几个版本的,由于过去了一段时间,我也忘记了当初用的哪一个版本了。 11 | 但是主要的改动在于内存控制的部分,在第2步的第一小步骤:产生重组织的权重的时候,进行了修改。 12 | 在第二步的第二个小步骤,产生量化的文件只进行了偏移量的修改,此外为了和我的实验匹配还加了一些垃圾代码,你可以不看(比如固定位数的量化) 13 | 在第三步产生输入输出的特征图量化的步骤同样进行了上述步骤的修改。[但是事后才发现有很大的问题,这一块的补偿体现在了elf代码里,在对卷积层判定的时候对于层号的选择那块] 14 | 15 | # IP核部分 
16 | IP核的改动不算特别大,移除了YOLOv2的maxpool部分,增加了shortcut(add方式的)和upsample层。 17 | 同时对于upsample层的块大小做了固定[尝试增加块大小到26(最小的二倍),但是发现性能低到爆炸] 18 | 由于记性不好,我忘记了生成ipcore使用的是yolov3.cpp还是cnn.cpp了 19 | 20 | # 最终的elf部分 21 | 由于以上的种种问题,在这块的代码上缝缝补补改了很多。 22 | 首先是由于shortcut选择了对其的方式,所以会因为不同层之间的量化位数不同产生溢出,因此需要把输出较大的层和输出较小的层做同步。[比如A层有输出13.6,A+3层有输出0.01,获得A层的位数是10,A+3是15,这个时候shortcut如果向A+3层的位数同步,那么就会造成13.6*pow(2,15)产生超过16位的结果] 23 | 其次由于yolov3有比较复杂的FPN机制,所以需要保存很多层的信息,因此我的缓存区利用的并不好[虽然后面重写了一个版本的,但是在fpga上好像并不行我也不知道是什么原因,业已证明只需要5个小缓冲区就能做完yolov3,我这里相当于用了十个] 24 | 同样的,由于缓冲区设计以及IPcore的设计,在并不需要做route层的情况下就可以实现[使用地址直接拼接,反正route也是拼接] 25 | [实际上在fpga做upsample并不快,只是少了因为拷贝内存的时间,所以相对于总时间来说这一块也放fpga上了] 26 | 27 | # 最终的效果 28 | 其实并不怎么好,因为陈辰大佬使用了im2col的方式进行计算,而yolov3过深的网络结构给arm核的拷贝带来了大量的负担,为了拷贝数据造成的时延实际上相当大。 29 | 但是最终的能耗确实很低。 30 | -------------------------------------------------------------------------------- /yolov3_hls/testBench/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "yolov3.h" 10 | 11 | int main( int argc, char *argv[]) 12 | { 13 | //freopen("result.txt","w",stdout); 14 | 15 | printf("YOLOv3 TEST Begin\n"); 16 | char **names = get_labels("coco.names"); 17 | int x; 18 | for(x=0;x<80;x++)//80 classe labels 19 | { 20 | printf("[%d]%s\n",x,names[x]); 21 | } 22 | image **alphabet = load_alphabet(); 23 | 24 | network *net = load_network("yolov3.cfg", "yolov2.weights", 0); 25 | set_batch_network(net, 1); 26 | 27 | ////////////////////load img resize img begin 28 | char buff[256]; 29 | char *input_imgfn = buff; 30 | strncpy(input_imgfn, "dog.jpg", 256); 31 | printf("Input img:%s\n",input_imgfn); 32 | image im = load_image_stb(input_imgfn, 3);//3 channel img 33 | printf("img w=%d,h=%d,c=%d\n",im.w,im.h,im.c); 34 | image sized = letterbox_image(im, 416, 416); 35 | save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3 36 | 
////////////////////load img resize img end 37 | 38 | time_t first, second; 39 | 40 | layer l = net->layers[net->n-1]; 41 | float *X = sized.data; 42 | 43 | first=time(NULL); 44 | yolov2_hls_ps(net, X); 45 | second=time(NULL); 46 | printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first)); 47 | 48 | int nboxes = 0; 49 | float nms=.45; 50 | float thresh = .5; 51 | float hier_thresh = .5; 52 | detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes); 53 | printf("%d\n", nboxes); 54 | 55 | if (nms) do_nms_sort(dets, nboxes, l.classes, nms); 56 | draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes); 57 | 58 | free_detections(dets, nboxes); 59 | 60 | ///////////////////write predictions img 61 | save_image_png(im, "predictions");// output 62 | 63 | free_image(im); 64 | free_image(sized); 65 | 66 | printf("YOLOv3 TEST End\n"); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /yolov3_elf/main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Empty C++ Application 3 | */ 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "yolo3.h" 12 | 13 | 14 | #include 15 | 16 | extern inline long long getTimeStampMill(); 17 | 18 | 19 | int main( int argc, char *argv[]) 20 | { 21 | unsigned int WEIGHT_BASE = 0x10000000; 22 | unsigned int BETA_BASE = 0x1EC38000; 23 | unsigned int MEM_BASE = 0x1EC53000; 24 | 25 | printf("YOLOv3 TEST Begin\n"); 26 | printf("timestamp is %ld\n",getTimeStampMill()); 27 | char **names = get_labels((char*)"coco.names"); 28 | int x; 29 | image **alphabet = load_alphabet(); 30 | network *net = load_network((char*)"yolov3.cfg", (char*)"yolov3.weights", 0); 31 | set_batch_network(net, 1); 32 | 33 | char buff[256]; 34 | char *input_imgfn = buff; 35 | if(argc==1) 36 | { 37 | strncpy(input_imgfn, (char*)"dog.jpg", 256); 38 | } 39 | else 
40 | { 41 | strncpy(input_imgfn, argv[1], 256); 42 | } 43 | image im = load_image_stb(input_imgfn, 3); 44 | image sized = letterbox_image(im, 416,416); 45 | save_image_png(sized, "sized"); 46 | double time; 47 | layer l = net->layers[net->n-1]; 48 | float *X = sized.data; 49 | time = what_time_is_it_now(); 50 | net = yolov2_hls_ps(net, X,WEIGHT_BASE,BETA_BASE,MEM_BASE); 51 | printf("Predicted in %f seconds.\n",what_time_is_it_now()-time); 52 | 53 | int nboxes = 0; 54 | int total = 0; 55 | float nms=0.45; 56 | float thresh = .5; 57 | float hier_thresh = .5; 58 | 59 | detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes); 60 | 61 | if (nms) do_nms_sort(dets, nboxes, l.classes, nms); 62 | 63 | draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes); 64 | free_detections(dets, nboxes); 65 | 66 | save_image_png(im, "predictions");// output 67 | 68 | free_image(im); 69 | free_image(sized); 70 | 71 | printf("YOLOv3 TEST End\n"); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /soft_version/Step03/main3.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "yolov3.h" 9 | 10 | int main( int argc, char *argv[]) 11 | { 12 | //freopen("result.txt","w",stdout); 13 | 14 | printf("YOLOv3 TEST Begin\n"); 15 | char **names = get_labels("coco.names"); 16 | int x; 17 | for(x=0;x<80;x++)//80 classe labels 18 | { 19 | printf("[%d]%s\n",x,names[x]); 20 | } 21 | image **alphabet = load_alphabet(); 22 | 23 | network *net = load_network("yolov3.cfg", "yolov3.weights", 0); 24 | set_batch_network(net, 1); 25 | 26 | ////////////////////load img resize img begin 27 | char buff[256]; 28 | char *input_imgfn = buff; 29 | strncpy(input_imgfn, "dog.jpg", 256); 30 | printf("Input img:%s\n",input_imgfn); 31 | image im = load_image_stb(input_imgfn, 3);//3 
channel img 32 | printf("img w=%d,h=%d,c=%d\n",im.w,im.h,im.c); 33 | //image sized = letterbox_image(im, 416, 416); 34 | image sized = letterbox_image(im, 416, 416); 35 | 36 | save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3 37 | ////////////////////load img resize img end 38 | 39 | time_t first, second; 40 | 41 | layer l = net->layers[net->n-1]; 42 | float *X = sized.data; 43 | 44 | //char line[256]; 45 | //FILE *fp3; 46 | //char filename[256]; 47 | //sprintf(filename, "yolo_layer_input_%d.txt", 123123); 48 | //printf("YOLO_layer:intputs=%d,%s\n",416*416*3,filename); 49 | // if( (fp3 = fopen(filename, "w")) == NULL)fprintf(stderr,"CANNOT OPEN\n"); 50 | // for( x = 0; x < l.outputs; x++) 51 | //{ 52 | // sprintf(line, "%f\n", X[x]); 53 | // if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n"); 54 | // } 55 | // fclose(fp3); 56 | 57 | first=time(NULL); 58 | yolov2_hls_ps(net, X); 59 | printf("yolov2_hls_ps END!\n"); 60 | second=time(NULL); 61 | printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first)); 62 | 63 | int nboxes = 0; 64 | float nms=.45; 65 | float thresh = .5; 66 | float hier_thresh = .5; 67 | detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes); 68 | printf("%d\n", nboxes); 69 | printf("get_network_boxes END!\n"); 70 | /* 71 | for(x=0;x 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_AP_CTRL 0x000 24 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_GIE 0x004 25 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_IER 0x008 26 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISR 0x00c 27 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_R_DATA 0x010 28 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_R_DATA 32 29 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT1_DATA 0x018 30 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT1_DATA 32 31 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT2_DATA 0x020 32 | #define 
XYOLO2_FPGA_CTRL_BUS_BITS_INPUT2_DATA 32 33 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT3_DATA 0x028 34 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT3_DATA 32 35 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_R_DATA 0x030 36 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_R_DATA 32 37 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT1_DATA 0x038 38 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT1_DATA 32 39 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_WEIGHT_DATA 0x040 40 | #define XYOLO2_FPGA_CTRL_BUS_BITS_WEIGHT_DATA 32 41 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_BETA_DATA 0x048 42 | #define XYOLO2_FPGA_CTRL_BUS_BITS_BETA_DATA 32 43 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INFM_NUM_DATA 0x050 44 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INFM_NUM_DATA 32 45 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTFM_NUM_DATA 0x058 46 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTFM_NUM_DATA 32 47 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_KERNEL_SIZE_DATA 0x060 48 | #define XYOLO2_FPGA_CTRL_BUS_BITS_KERNEL_SIZE_DATA 32 49 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_KERNEL_STRIDE_DATA 0x068 50 | #define XYOLO2_FPGA_CTRL_BUS_BITS_KERNEL_STRIDE_DATA 32 51 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_W_DATA 0x070 52 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_W_DATA 32 53 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_H_DATA 0x078 54 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_H_DATA 32 55 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_W_DATA 0x080 56 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_W_DATA 32 57 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_H_DATA 0x088 58 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_H_DATA 32 59 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_PADDING_DATA 0x090 60 | #define XYOLO2_FPGA_CTRL_BUS_BITS_PADDING_DATA 32 61 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISNL_DATA 0x098 62 | #define XYOLO2_FPGA_CTRL_BUS_BITS_ISNL_DATA 1 63 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISBN_DATA 0x0a0 64 | #define XYOLO2_FPGA_CTRL_BUS_BITS_ISBN_DATA 1 65 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TM_DATA 0x0a8 66 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TM_DATA 32 67 | #define 
XYOLO2_FPGA_CTRL_BUS_ADDR_TN_DATA 0x0b0 68 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TN_DATA 32 69 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TR_DATA 0x0b8 70 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TR_DATA 32 71 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TC_DATA 0x0c0 72 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TC_DATA 32 73 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_MLOOPS_DATA 0x0c8 74 | #define XYOLO2_FPGA_CTRL_BUS_BITS_MLOOPS_DATA 32 75 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_NLOOPS_DATA 0x0d0 76 | #define XYOLO2_FPGA_CTRL_BUS_BITS_NLOOPS_DATA 32 77 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_RLOOPS_DATA 0x0d8 78 | #define XYOLO2_FPGA_CTRL_BUS_BITS_RLOOPS_DATA 32 79 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_CLOOPS_DATA 0x0e0 80 | #define XYOLO2_FPGA_CTRL_BUS_BITS_CLOOPS_DATA 32 81 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_LAYERTYPE_DATA 0x0e8 82 | #define XYOLO2_FPGA_CTRL_BUS_BITS_LAYERTYPE_DATA 32 83 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUTQ_DATA 0x0f0 84 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUTQ_DATA 32 85 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUTQ_DATA 0x0f8 86 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUTQ_DATA 32 87 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_WEIGHTQ_DATA 0x100 88 | #define XYOLO2_FPGA_CTRL_BUS_BITS_WEIGHTQ_DATA 32 89 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_BETAQ_DATA 0x108 90 | #define XYOLO2_FPGA_CTRL_BUS_BITS_BETAQ_DATA 32 91 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TROW_LOOPS_DATA 0x110 92 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TROW_LOOPS_DATA 32 93 | 94 | 95 | #define YOLO2_BASEADDR 0x43c00000 96 | 97 | #define WriteReg(BaseAddress, RegOffset, Data) *(volatile unsigned int*)((BaseAddress) + (RegOffset)) = (Data) 98 | #define ReadReg(BaseAddress, RegOffset) *(volatile unsigned int*)((BaseAddress) + (RegOffset)) 99 | 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /soft_version/Step02/main3.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 
| #include 9 | #include "yolov3.h" 10 | 11 | #define MIN_VALUE (-1024*1024*1024) 12 | #define MAX_VALUE (1024*1024*1024) 13 | 14 | #define QUANTI 15 | 16 | #ifndef QUANTI 17 | int main( int argc, char *argv[]) 18 | { 19 | //freopen("result.txt","w",stdout); 20 | printf("YOLOv3 TEST Begin\n"); 21 | char **names = get_labels("coco.names"); 22 | int x; 23 | for(x=0;x<80;x++)//80 classe labels 24 | { 25 | printf("[%d]%s\n",x,names[x]); 26 | } 27 | image **alphabet = load_alphabet(); 28 | network *net = load_network("yolov3.cfg"); 29 | set_batch_network(net, 1); 30 | 31 | ////////////////////load img resize img begin 32 | char img_buff[256]; 33 | char *input_imgfn = img_buff; 34 | if(argc==1) 35 | strncpy(input_imgfn, "./dog.jpg", 256); 36 | else 37 | strncpy(input_imgfn, argv[1], 256); 38 | image im = load_image_stb(input_imgfn, 3);//3 channel img 39 | printf("Input img:%s\n w=%d,h=%d,c=%d\n", input_imgfn, im.w, im.h, im.c); 40 | image sized = letterbox_image(im, 416, 416); 41 | save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3 42 | ////////////////////load img resize img end 43 | 44 | time_t first, second; 45 | layer l = net->layers[net->n-1]; 46 | float *X = sized.data; 47 | 48 | first=time(NULL); 49 | yolov2_hls_ps(net, X); 50 | second=time(NULL); 51 | printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first)); 52 | 53 | int nboxes = 0; 54 | float nms=.45; 55 | float thresh = .5; 56 | float hier_thresh = .5; 57 | detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes); 58 | printf("%d\n", nboxes); 59 | //for(x=0;xmax) 102 | max = tmp_in_float; 103 | } 104 | printf("float min=%.7lf,max=%.7lf ",min,max);//find float min max 105 | 106 | int k; 107 | int maxQ = -1; 108 | for(k=0;k<16;k++)//find maxQ 109 | { 110 | if(min>ap16_range[2*k]&&maxmax_error) 138 | max_error = error; 139 | 140 | out[woffset+j] = tmp_fixed; 141 | } 142 | printf("sum2_error = 
%.7lf,min_error=%.7lf,max_error=%.7lf",sum_error,min_error,max_error); 143 | printf("\n"); 144 | 145 | woffset += offset[offset_index]; 146 | offset_index++; 147 | } 148 | 149 | return 0; 150 | } 151 | 152 | int main(int argc,char *argv[]) 153 | { 154 | int i; 155 | printf("Test fixed-point\n"); 156 | /* 157 | int weight_offset[32] = {864, 18432, 73728, 8192, 73728, 158 | 294912, 32768, 294912, 1179648, 131072, 1179648, 131072, 159 | 1179648, 4718592, 524288, 4718592, 524288, 4718592, 9437184, 160 | 9437184, 32768, 11796480, 435200, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 161 | 162 | int beta_offset[32] = {32, 64, 128, 64, 128, 256, 128, 256, 512, 256, 512, 256, 512, 1024, 163 | 512, 1024, 512, 1024, 1024, 1024, 64, 1024, 425, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 164 | 165 | short *Weight_fixed_buf = (short *)calloc(203767168/4,sizeof(short)); 166 | float *Weight_buf = (float *)calloc(203767168/4,sizeof(float)); 167 | float *Beta_buf = (float *)calloc(43044/4,sizeof(float)); 168 | */ 169 | int yolo3_weight_offset[128] = {864,18432,2048,18432, 170 | 73728,8192,73728, 171 | 8192,73728, 172 | 294912,32768,294912, 173 | 32768,294912, 174 | 32768,294912, 175 | 32768,294912, 176 | 32768,294912, 177 | 32768,294912, 178 | 32768,294912, 179 | 32768,294912, 180 | 1179648,131072,1179648, 181 | 131072,1179648, 182 | 131072,1179648, 183 | 131072,1179648, 184 | 131072,1179648, 185 | 131072,1179648, 186 | 131072,1179648, 187 | 131072,1179648, 188 | 4718592,524288,4718592, 189 | 524288,4718592, 190 | 524288,4718592, 191 | 524288,4718592, 192 | 524288,4718592,524288,4718592,524288,4718592,261120, 193 | 131072, 194 | 196608,1179648,131072,1179648,131072,1179648,130560, 195 | 32768, 196 | 49152,294912,32768,294912,32768,294912,65280, 197 | 0,0,0,0,0,0,0,0,0,0, 198 | 0,0,0,0,0,0,0,0,0,0, 199 | 0,0,0,0,0,0,0,0,0,0, 200 | 0,0,0,0,0,0,0,0,0,0, 201 | 0,0,0,0,0,0,0,0,0,0, 202 | 0,0,0}; 203 | 204 | int yolo3_beta_offset[128] = {32,64,32,64, 205 | 128,64,128, 206 | 64,128, 207 | 256,128,256, 208 | 128,256, 209 | 
128,256, 210 | 128,256, 211 | 128,256, 212 | 128,256, 213 | 128,256, 214 | 128,256, 215 | 512,256,512, 216 | 256,512, 217 | 256,512, 218 | 256,512, 219 | 256,512, 220 | 256,512, 221 | 256,512, 222 | 256,512, 223 | 1024,512,1024, 224 | 512,1024, 225 | 512,1024, 226 | 512,1024, 227 | 512,1024,512,1024,512,1024,255, 228 | 256, 229 | 256,512,256,512,256,512,255, 230 | 128, 231 | 128,256,128,256,128,256,255, 232 | 0,0,0,0,0,0,0,0,0,0, 233 | 0,0,0,0,0,0,0,0,0,0, 234 | 0,0,0,0,0,0,0,0,0,0, 235 | 0,0,0,0,0,0,0,0,0,0, 236 | 0,0,0,0,0,0,0,0,0,0, 237 | 0,0,0}; 238 | 239 | short* yolo3_weight_fixed_buf = (short *)calloc(247583104/4,sizeof(short)); 240 | float* yolo3_weight_buf = (float *)calloc(247583104/4,sizeof(float)); 241 | float* yolo3_beta_buf = (float *)calloc(108276/4,sizeof(float)); 242 | 243 | 244 | FILE *fp_w = fopen("weights_reorg.bin", "rb"); 245 | if(!fp_w) printf("fopen weights_reorg.bin error\n"); 246 | FILE *fp_b = fopen("bias.bin", "rb"); 247 | if(!fp_b) printf("fopen bias.bin error\n"); 248 | /* 249 | fread(Weight_buf, sizeof(float), 203767168/4, fp_w); 250 | fread(Beta_buf, sizeof(float), 43044/4, fp_b); 251 | */ 252 | fread(yolo3_weight_buf, sizeof(float), 247583104/4, fp_w); 253 | fread(yolo3_beta_buf, sizeof(float), 108276/4, fp_b); 254 | 255 | fclose(fp_w); 256 | fclose(fp_b); 257 | //////////////////////////////// 258 | 259 | short ap16_min = 0x8000; 260 | short ap16_max = 0x7fff; 261 | printf("ap16_min = %d \nap16_max = %d\n",ap16_min,ap16_max); 262 | float ap16_range[16*2]; 263 | for(i=0;i<16;i++) 264 | { 265 | printf("Q%2d:",i); 266 | ap16_range[2*i] = (float)ap16_min*pow((float)2,-i);//min 267 | ap16_range[2*i+1] = (float)ap16_max*pow((float)2,-i);//max 268 | printf("min=%.7lf,max=%.7lf\n",ap16_range[2*i],ap16_range[2*i+1]); 269 | } 270 | //////////////////////////////// 271 | int maxQ_array[32]; 272 | int layer_num; 273 | FILE* fout; 274 | char layer_num_string[256]; 275 | char s[256]; 276 | 277 | printf("weight quantize begin\n"); 278 | layer_num 
= quantize_short16(yolo3_weight_buf,yolo3_weight_fixed_buf,yolo3_weight_offset,128,ap16_range,maxQ_array); 279 | for(i=0;i 3 | #include 4 | #include 5 | #include 6 | 
#include "cnn.h"


/*
 * (author's note) "I didn't change anything, so why is conv suddenly wrong?"
 * */
////////////////////////////////////////////20181229 n4m32 v2 without input and reorg opt ok input opt ok combine input relu comb ok // input opt ok // output opt ok //weight opt ok (5)n4m32i4o2 ok start
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))
#define S 2
#define K 3

// (author's note) not yet sure what Tn and Tm mean
// NOTE(review): presumably the input-/output-channel tile sizes -- confirm.
#define Tn 4
#define Tm 32

// Tr and Tc must be set according to the network size
#define Tr 26
#define Tc 26

#define SIZE 13
#define PARA 1

// On-chip input tile extent: (tile-1)*S+K columns / rows.
#define OnChipIB_Width ((Tc-1)*S+K)
#define OnChipIB_Height ((Tr-1)*S+K)
// MAX_BETA_LENGTH must be set according to the measured data
#define MAX_BETA_LENGTH (1024)
#define INTERWIDTH 20
/*
Plan 1: load the shortcut inputs in two passes, four at a time
        (currently being tried)
Plan 2: rewrite the load function so that it uses only two ports
        (waiting for the result of plan 1)
*/

typedef unsigned char UCHAR;

/*
 * Port 0 row fetch: when TN_MIN > 0, copies RowIntNum ints starting at
 * input + RowOffset into input_memcpy_buffer; otherwise does nothing.
 */
void mmcpy_inputport(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
{
	bool enable = TN_MIN > 0;
	if(!enable)
		return;

	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));

}

/*
 * Port 1 row fetch: same as mmcpy_inputport but only active when TN_MIN > 1.
 */
void mmcpy_inputport1(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
{
	bool enable = TN_MIN > 1;
	if(!enable)
		return;

	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));

}

/*
 * Port 2 row fetch: same as mmcpy_inputport but only active when TN_MIN > 2.
 */
void mmcpy_inputport2(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
{
	bool enable = TN_MIN > 2;
	if(!enable)
		return;

	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));


}

/*
 * Port 3 row fetch: same as mmcpy_inputport but only active when TN_MIN > 3.
 */
void mmcpy_inputport3(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
{
	bool enable = TN_MIN > 3;
	if(!enable)
		return;

	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));

}

/*
 * (author's note) Data is brought in through four ports; it mainly depends
 * on the size.
 *
 * Fetches one row for each of the Tn (=4) channels, one channel per port,
 * into the four staging buffers.
 *
 * - next_t2[0] and next_IsRowPixel[0] are always set to t2 / IsRowPixel,
 *   even when enable is false; nothing else happens in that case.
 * - tmp_inoffset is a static running element offset: it is reset to
 *   IN_OFFSET when t2 == 0 and advanced by RowIncreaseLength otherwise.
 * - For channel t1 the element offset is tmp_inoffset + t1*IHxIW_18b.
 *   It is halved to get the int index (RowOffset) and its low bit is
 *   written to RowBeginByte[t1].
 *   NOTE(review): this looks like two 16-bit values packed per int, with
 *   the low bit selecting the half of the first int -- confirm.
 * - RowIntNum is (ColIncreaseLength + low bit) rounded up to whole ints.
 * - RowSub is not used by this function.
 */
void mmcpy_inputpixel_m2b_comb(int *input,int *input1,int *input2,int *input3,
		int input_memcpy_buffer[(OnChipIB_Width+3)/2],int input_memcpy_buffer1[(OnChipIB_Width+3)/2],
		int input_memcpy_buffer2[(OnChipIB_Width+3)/2],int input_memcpy_buffer3[(OnChipIB_Width+3)/2],
		ap_uint<1> RowBeginByte[Tn],ap_uint<3> TN_MIN_3b,ap_uint<6> t2,ap_uint<1> RowSub,int IN_OFFSET,ap_uint<9> RowIncreaseLength,ap_uint<18> IHxIW_18b,ap_uint<6> ColIncreaseLength,ap_uint<6> next_t2[1],bool next_IsRowPixel[1],bool IsRowPixel,bool enable)
{
	static int tmp_inoffset;

	/* Forward the row index and row-valid flag to the consumer stage. */
	next_t2[0] = t2;
	next_IsRowPixel[0] = IsRowPixel;

	if(!enable)
		return;

	bool init = (t2==0);
	if(init)
	{
		tmp_inoffset = IN_OFFSET;
	}else
	{
		tmp_inoffset += RowIncreaseLength;
	}

	int InOffset[Tn];
#pragma HLS ARRAY_PARTITION variable=InOffset complete dim=1
	int RowOffset[Tn];
#pragma HLS ARRAY_PARTITION variable=RowOffset complete dim=1
	ap_uint<1> LowBit[Tn];
#pragma HLS ARRAY_PARTITION variable=LowBit complete dim=1
	UCHAR BeginByteNum[Tn];
#pragma HLS ARRAY_PARTITION variable=BeginByteNum complete dim=1
	UCHAR RowIntNum[Tn];
#pragma HLS ARRAY_PARTITION variable=RowIntNum complete dim=1

	int t1;
	for(t1 = 0;t1 < Tn;t1++)
	{
#pragma HLS UNROLL
		InOffset[t1] = tmp_inoffset + t1*IHxIW_18b;
		RowOffset[t1] = InOffset[t1] >> 1;
		LowBit[t1] = InOffset[t1]&0x1;
		RowBeginByte[t1] = LowBit[t1];
		BeginByteNum[t1] = ColIncreaseLength + LowBit[t1];

//		assert((BeginByteNum[t1] > 0)&&(BeginByteNum[t1] < 256));

		/* Round the element count up to a whole number of ints. */
		RowIntNum[t1] = BeginByteNum[t1] >> 1;
		if(BeginByteNum[t1]&0x1)
			RowIntNum[t1]++;

//		assert((RowIntNum[t1] > 0)&&(RowIntNum[t1] < 256));
	}

	mmcpy_inputport(input,input_memcpy_buffer, TN_MIN_3b,RowOffset[0],RowIntNum[0]);
	mmcpy_inputport1(input1,input_memcpy_buffer1, TN_MIN_3b,RowOffset[1],RowIntNum[1]);
	mmcpy_inputport2(input2,input_memcpy_buffer2, TN_MIN_3b,RowOffset[2],RowIntNum[2]);
	mmcpy_inputport3(input3,input_memcpy_buffer3, TN_MIN_3b,RowOffset[3],RowIntNum[3]);
}

/*
 * Unpacks the four staging buffers into rows of input_buffer.
 *
 * - Returns immediately when enable is false.
 * - t2_local is a static row cursor; it is reset to 0 when next_t2[0] == 0
 *   and advanced by T2R at the end of every enabled call.
 * - T2R (rows handled this call) is T2Rate when next_enable[0] is true and
 *   T2Rate + 1 otherwise; the row loop is additionally bounded by
 *   OnChipIB_Height.
 * - A position (t2r, t3) receives data when RowSub <= t2r < RowSub+row_len
 *   and ColSub <= t3 < ColSub+col_len; every other position is written
 *   with pad_value (0).
 * - Each staged int supplies two shorts: the low 16 bits first, then the
 *   value shifted right by 16. cnt[] selects the half to emit and starts
 *   from RowBeginByte[] on the first in-window pixel at column ColSub;
 *   that initialisation happens at most once per call (IsRowInit_flag).
 * - TRow is unused; LayerType is referenced only by commented-out code.
 */
void copy_input2buf_row(short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],ap_uint<6> row_len,ap_uint<6> col_len,ap_uint<1> RowSub,ap_uint<1> ColSub,
		int input_memcpy_buffer[(OnChipIB_Width+3)/2],int input_memcpy_buffer1[(OnChipIB_Width+3)/2],
		int input_memcpy_buffer2[(OnChipIB_Width+3)/2],int input_memcpy_buffer3[(OnChipIB_Width+3)/2],
		ap_uint<1> RowBeginByte[Tn],UCHAR TRow,UCHAR TCol,int LayerType,ap_uint<6> next_t2[1],bool next_enable[1],bool enable,ap_uint<3> T2Rate)
{

	if(!enable)
		return;

	static ap_uint<6> t2_local = 0;
	ap_uint<6> t2 = next_t2[0];
	bool IsRowPixel = next_enable[0];
	int t1,t3;
	ap_uint<6> t2r;
	ap_uint<3> T2R;

	bool initial = (t2==0);
	if(initial)
	{
		t2_local = 0;
	}

	short pad_value = 0;
/*
	if(LayerType==1)
		pad_value = 0x8001;
*/
	int input_mmcpy_offset[Tn];
#pragma HLS ARRAY_PARTITION variable=input_mmcpy_offset complete dim=1
	bool NextInputFlag[Tn];
#pragma HLS ARRAY_PARTITION variable=NextInputFlag complete dim=1
	ap_uint<1> cnt[Tn];
#pragma HLS ARRAY_PARTITION variable=cnt complete dim=1
	short input_array[Tn][2];
#pragma HLS ARRAY_PARTITION variable=input_array complete dim=1

	/* Every call starts reading each staging buffer from index 0. */
	for(t1 = 0;t1 < Tn; t1++)
	{
#pragma HLS UNROLL
		input_mmcpy_offset[t1] = 0;
	}

	if(!IsRowPixel)
	{
		T2R = T2Rate + 1;
	}else
	{
		T2R = T2Rate;
	}
//	ap_uint<6> T2R_bound = MIN(t2_local + T2R,OnChipIB_Height);
	unsigned char tmp_min = t2_local + T2R;
	ap_uint<6> T2R_bound = MIN(tmp_min, OnChipIB_Height);

	bool IsRowInit_flag = true;

	for(t2r = t2_local;t2r < T2R_bound; t2r++)
		for(t3 = 0;t3 < TCol; t3++)
		{
#pragma HLS PIPELINE
			bool IsRowPixel_t2r = (t2r >= RowSub)&&(t2r < (row_len + RowSub));
			bool IsColPixel = (t3 >= ColSub)&&(t3 < (col_len + ColSub));
			bool IsRowInit = (t3==ColSub)&&IsRowInit_flag;

			if(IsRowPixel_t2r&&IsColPixel)
			{
				if(IsRowInit)
				{
					/* First data pixel of this call: pick the starting
					 * half of the first int and force a fetch. */
					IsRowInit_flag = false;
					cnt[0] = RowBeginByte[0];
					cnt[1] = RowBeginByte[1];
					cnt[2] = RowBeginByte[2];
					cnt[3] = RowBeginByte[3];
					NextInputFlag[0] = true;
					NextInputFlag[1] = true;
					NextInputFlag[2] = true;
					NextInputFlag[3] = true;
				}

				/* Fetch the next int of each channel and split it into
				 * its low ([0]) and high ([1]) 16-bit halves. */
				if(NextInputFlag[0])
				{
					input_array[0][0] = input_memcpy_buffer[input_mmcpy_offset[0]];
					input_array[0][1] = input_memcpy_buffer[input_mmcpy_offset[0]] >> 16;
					input_mmcpy_offset[0]++;
					NextInputFlag[0] = false;
				}

				if(NextInputFlag[1])
				{
					input_array[1][0] = input_memcpy_buffer1[input_mmcpy_offset[1]];
					input_array[1][1] = input_memcpy_buffer1[input_mmcpy_offset[1]] >> 16;
					input_mmcpy_offset[1]++;
					NextInputFlag[1] = false;
				}

				if(NextInputFlag[2])
				{
					input_array[2][0] = input_memcpy_buffer2[input_mmcpy_offset[2]];
					input_array[2][1] = input_memcpy_buffer2[input_mmcpy_offset[2]] >> 16;
					input_mmcpy_offset[2]++;
					NextInputFlag[2] = false;
				}

				if(NextInputFlag[3])
				{
					input_array[3][0] = input_memcpy_buffer3[input_mmcpy_offset[3]];
					input_array[3][1] = input_memcpy_buffer3[input_mmcpy_offset[3]] >> 16;
					input_mmcpy_offset[3]++;
					NextInputFlag[3] = false;
				}

				input_buffer[0][t2r][t3] = input_array[0][cnt[0]];
				input_buffer[1][t2r][t3] = input_array[1][cnt[1]];
				input_buffer[2][t2r][t3] = input_array[2][cnt[2]];
				input_buffer[3][t2r][t3] = input_array[3][cnt[3]];

				/* After the high half has been emitted, request the next
				 * int; otherwise move on to the high half. */
				if(cnt[0]==1)
				{
					NextInputFlag[0] = true;
					cnt[0] = 0;
				}else
				{
					cnt[0] = 1;
				}

				if(cnt[1]==1)
				{
					NextInputFlag[1] = true;
					cnt[1] = 0;
				}else
				{
					cnt[1] = 1;
				}

				if(cnt[2]==1)
				{
					NextInputFlag[2] = true;
					cnt[2] = 0;
				}else
				{
					cnt[2] = 1;
				}

				if(cnt[3]==1)
				{
					NextInputFlag[3] = true;
					cnt[3] = 0;
				}else
				{
					cnt[3] = 1;
				}
			}else
			{
				/* Outside the valid window: write padding. */
				input_buffer[0][t2r][t3] = pad_value;
				input_buffer[1][t2r][t3] = pad_value;
				input_buffer[2][t2r][t3] = pad_value;
				input_buffer[3][t2r][t3] = pad_value;
			}

		}

	t2_local += T2R;
}


// (author's open question) Does this actually make a difference or not?
311 | /* 312 | mmcpy_inputpixel_m2b_comb pingpong了这个函数 313 | */ 314 | void input_load(int *input,int *input1,int *input2,int *input3, 315 | short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int r,int c,int n,int Kernel_stride,int Padding,UCHAR TRow,UCHAR TCol,int Input_w,int Input_h,int TN_MIN,int IHxIW,int LayerType,ap_uint<6> trow_loops) 316 | { 317 | static int input_memcpy_buffer0[(OnChipIB_Width+3)/2]; 318 | static int input_memcpy_buffer1[(OnChipIB_Width+3)/2]; 319 | static int input_memcpy_buffer2[(OnChipIB_Width+3)/2]; 320 | static int input_memcpy_buffer3[(OnChipIB_Width+3)/2]; 321 | ap_uint<1> RowBeginByte[Tn]; 322 | #pragma HLS ARRAY_PARTITION variable=RowBeginByte complete dim=1//0 ro 1 323 | 324 | static int input_memcpy_buffer02[(OnChipIB_Width+3)/2]; 325 | static int input_memcpy_buffer12[(OnChipIB_Width+3)/2]; 326 | static int input_memcpy_buffer22[(OnChipIB_Width+3)/2]; 327 | static int input_memcpy_buffer32[(OnChipIB_Width+3)/2]; 328 | ap_uint<1> RowBeginByte2[Tn];//0 ro 1 329 | #pragma HLS ARRAY_PARTITION variable=RowBeginByte2 complete dim=1//0 ro 1 330 | 331 | ap_uint<1> RowSub,ColSub; 332 | 333 | ap_uint<6> t2; 334 | 335 | ap_uint<9> r_9b = r; 336 | // assert(r < 512); 337 | ap_uint<9> c_9b = c; 338 | // assert(c < 512); 339 | // assert(n < 2048); 340 | ap_uint<11> n_11b = n; 341 | // assert(Kernel_stride < 4); 342 | ap_uint<2> Kernel_stride_2b = Kernel_stride; 343 | // assert(Padding < 2); 344 | ap_uint<1> Padding_1b = Padding; 345 | // assert(Input_w < 512); 346 | ap_uint<9> Input_w_9b = Input_w; 347 | ap_uint<10> Input_h_10b = Input_h; 348 | // assert(Input_h < 1024); 349 | // assert(TN_MIN < 8);//xx8 350 | ap_uint<3> TN_MIN_3b = TN_MIN; 351 | ap_uint<18> IHxIW_18b = IHxIW; 352 | // assert(IHxIW < 512*512); 353 | 354 | ap_int<12> Coffset; 355 | ap_int<12> Roffset; 356 | // Coffset = c_9b*Kernel_stride_2b - Padding_1b; 357 | // Roffset = r_9b*Kernel_stride_2b - Padding_1b; 358 | if(LayerType == 2){ 359 | Coffset = c_9b; 360 | Roffset = 
r_9b; 361 | } else { 362 | Coffset = c_9b*Kernel_stride_2b - Padding_1b; 363 | Roffset = r_9b*Kernel_stride_2b - Padding_1b; 364 | } 365 | 366 | ap_uint<12> TCol_right,TRow_bottom; 367 | ap_uint<10> TRow_top,TCol_left; 368 | ap_uint<6> row_len,col_len; 369 | 370 | if(Coffset > 0) 371 | TCol_left = Coffset; 372 | else 373 | TCol_left = 0; 374 | 375 | if((Coffset + TCol-1) 0) 383 | TRow_top = Roffset; 384 | else 385 | TRow_top = 0; 386 | 387 | if((Roffset + TRow-1) RowIncreaseLength; 397 | ap_uint<6> ColIncreaseLength; 398 | ap_uint<3> T2Rate; 399 | switch(Input_w_9b) 400 | { 401 | case Tr: 402 | RowIncreaseLength = 2*Tr; 403 | ColIncreaseLength = 2*Tr; 404 | T2Rate = 2; 405 | break; 406 | case SIZE: 407 | RowIncreaseLength = 4*SIZE; 408 | ColIncreaseLength = 4*SIZE; 409 | T2Rate = 4; 410 | break; 411 | default: 412 | RowIncreaseLength = Input_w_9b; 413 | ColIncreaseLength = col_len; 414 | T2Rate = 1; 415 | break; 416 | } 417 | 418 | //assert(ColNum < 64*64); 419 | //assert(RowNum < 64); 420 | RowSub = TRow_top - Roffset; 421 | ColSub = TCol_left - Coffset; 422 | 423 | bool pingpong = 1; 424 | ap_uint<6> next_t2[1]; 425 | bool next_IsRowPixel[1]; 426 | ap_uint<6> next_t22[1]; 427 | bool next_IsRowPixel2[1]; 428 | 429 | // ap_uint<6> trow_loops = (int)ceil(((float)TRow/T2Rate)); 430 | ap_uint<6> TMP_t2; 431 | for(TMP_t2 = 0,t2 = 0;TMP_t2 < trow_loops + 1; t2 += T2Rate,TMP_t2++) 432 | { 433 | bool IsRowPixel = (t2 >= RowSub)&&(t2 < (row_len + RowSub)); 434 | 435 | if(pingpong == 1) 436 | { 437 | mmcpy_inputpixel_m2b_comb(input,input1,input2,input3, 438 | input_memcpy_buffer0, input_memcpy_buffer1, 439 | input_memcpy_buffer2, input_memcpy_buffer3, 440 | RowBeginByte, TN_MIN_3b, t2, RowSub, IN_OFFSET, RowIncreaseLength, IHxIW_18b, ColIncreaseLength, next_t2,next_IsRowPixel,IsRowPixel,TMP_t2!=trow_loops); 441 | 442 | copy_input2buf_row( input_buffer, row_len, col_len, RowSub, ColSub, 443 | input_memcpy_buffer02, input_memcpy_buffer12,input_memcpy_buffer22, 
input_memcpy_buffer32, 444 | RowBeginByte2, TRow, TCol,LayerType,next_t22,next_IsRowPixel2,TMP_t2!=0,T2Rate); 445 | pingpong = 0; 446 | }else 447 | { 448 | mmcpy_inputpixel_m2b_comb(input,input1,input2,input3, 449 | input_memcpy_buffer02, input_memcpy_buffer12, 450 | input_memcpy_buffer22, input_memcpy_buffer32, 451 | RowBeginByte2, TN_MIN_3b, t2, RowSub, IN_OFFSET, RowIncreaseLength, IHxIW_18b, ColIncreaseLength, next_t22,next_IsRowPixel2,IsRowPixel,TMP_t2!=trow_loops); 452 | 453 | copy_input2buf_row( input_buffer, row_len, col_len, RowSub, ColSub, 454 | input_memcpy_buffer0, input_memcpy_buffer1,input_memcpy_buffer2, input_memcpy_buffer3, 455 | RowBeginByte, TRow, TCol,LayerType,next_t2,next_IsRowPixel,TMP_t2!=0,T2Rate); 456 | pingpong = 1; 457 | } 458 | } 459 | 460 | // assert(TRow_top < 1024); 461 | // assert(TCol_left < 1024); 462 | 463 | } 464 | 465 | void weight_mmcpy_everyKxK(int *Weight,int weight_memcpy_buffer[Tm*Tn/2],ap_uint<3> t3,ap_uint<3> t4,ap_uint<3> next_t3[1],ap_uint<3> next_t4[1],unsigned int ReadLength,bool init_enable,bool enable) 466 | { 467 | if(!enable) 468 | return; 469 | 470 | static int Woffset; 471 | next_t3[0] = t3; 472 | next_t4[0] = t4; 473 | 474 | if(init_enable) 475 | { 476 | Woffset = 0; 477 | } 478 | 479 | memcpy(weight_memcpy_buffer,(int *)(Weight + Woffset),ReadLength*sizeof(int)); 480 | Woffset += ReadLength; 481 | } 482 | 483 | void load_weight2buf_everyKxK(int weight_memcpy_buffer[Tm*Tn/2],short weight_buffer[Tm][Tn][K][K],ap_uint<3> t3,ap_uint<3> t4,ap_uint<6> TM_MIN,ap_uint<3> TN_MIN,bool enable) 484 | { 485 | 486 | if(!enable) 487 | return; 488 | 489 | ap_uint<6> t1; 490 | ap_uint<3> t2; 491 | ap_uint<8> weight_memcpy_offset = 0; 492 | ap_uint<2> cnt = 0; 493 | short input_array[2]; 494 | #pragma HLS ARRAY_PARTITION variable=input_array complete dim=1 495 | short input_value; 496 | 497 | for(t1 = 0;t1 < Tm; t1++) 498 | for(t2 = 0;t2 < Tn; t2++) 499 | { 500 | #pragma HLS PIPELINE 501 | bool Enable = (t1 < TM_MIN)&&(t2 < 
TN_MIN); 502 | if(Enable) 503 | { 504 | if(cnt==0) 505 | { 506 | input_array[0] = weight_memcpy_buffer[weight_memcpy_offset]; 507 | input_array[1] = weight_memcpy_buffer[weight_memcpy_offset] >> 16; 508 | weight_memcpy_offset++; 509 | } 510 | input_value = input_array[cnt]; 511 | 512 | cnt++; 513 | if(cnt==2) 514 | cnt = 0; 515 | } 516 | else 517 | input_value = 0; 518 | 519 | weight_buffer[t1][t2][t3][t4] = input_value; 520 | } 521 | } 522 | 523 | void weight_load_reorg(int *Weight,short weight_buffer[Tm][Tn][K][K],bool weight_load_enable,int m,int n,int IFM_numxKxK,int KxK,int Kernel_size,int TM_MIN,int TN_MIN) 524 | { 525 | /*int t1,t2,t3,t4;*/ 526 | static int weight_memcpy_buffer[Tm*Tn/2]; 527 | static int weight_memcpy_buffer1[Tm*Tn/2]; 528 | 529 | if(!weight_load_enable) 530 | return; 531 | 532 | // assert(m < 1024); 533 | // assert(n < 2048);//gg2048 534 | // assert(IFM_numxKxK < 1024*16); 535 | // assert(Kernel_size < 4); 536 | // assert(TM_MIN < 64); 537 | // assert(TN_MIN < 8);//xx8 538 | 539 | ap_uint<2> Kernel_size_2b = Kernel_size; 540 | ap_uint<6> TM_MIN_6b = TM_MIN; 541 | ap_uint<3> TN_MIN_3b = TN_MIN; 542 | 543 | ap_uint<10> m_10b = m; 544 | ap_uint<11> n_11b = n; 545 | 546 | bool Me0aNe0 = (m_10b==0)&&(n_11b==0); 547 | unsigned int ReadLength = (TM_MIN_6b*TN_MIN_3b)>>1; 548 | 549 | // if((TM_MIN*TN_MIN)%2) 550 | // printf("weight % error\n"); 551 | 552 | ap_uint<3> t3,t4; 553 | ap_uint<3> next_t3[1]; 554 | ap_uint<3> next_t4[1]; 555 | ap_uint<3> next_t31[1]; 556 | ap_uint<3> next_t41[1]; 557 | 558 | bool pingpong = true; 559 | 560 | for(t3 = 0;t3 < Kernel_size_2b;t3++) 561 | for(t4 = 0;t4 < Kernel_size_2b + 1;t4++) 562 | { 563 | if(pingpong) 564 | { 565 | weight_mmcpy_everyKxK(Weight, weight_memcpy_buffer, t3, t4,next_t3,next_t4, ReadLength,Me0aNe0&&(t3==0)&&(t4==0),t4!=Kernel_size_2b); 566 | load_weight2buf_everyKxK(weight_memcpy_buffer1, weight_buffer, next_t31[0], next_t41[0], TM_MIN, TN_MIN,t4!=0); 567 | pingpong = false; 568 | }else 569 | { 
570 | weight_mmcpy_everyKxK(Weight, weight_memcpy_buffer1, t3, t4,next_t31,next_t41, ReadLength,Me0aNe0&&(t3==0)&&(t4==0),t4!=Kernel_size_2b); 571 | load_weight2buf_everyKxK(weight_memcpy_buffer, weight_buffer, next_t3[0], next_t4[0], TM_MIN, TN_MIN,t4!=0); 572 | pingpong = true; 573 | } 574 | } 575 | } 576 | 577 | /* copy_input_weight: load stage of the tile pipeline. Does nothing when enable is false. Otherwise it computes TN_MIN = MIN(TN, InFM_num - n), records n in TMP_N_next[0], and then calls input_load (fills input_buffer) followed by weight_load_reorg (fills weight_buffer; that call is gated by weight_load_enable). The initialize parameter is not referenced in the body. */ 578 | void copy_input_weight(int *input,int *input1,int *input2,int *input3,int *Weight,int InFM_num,int Input_w,int Input_h,int Kernel_size,int Kernel_stride,int r,int c,int m,int n, 579 | int TM_MIN,int TN,UCHAR TRow,UCHAR TCol,int Padding,short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],short weight_buffer[Tm][Tn][K][K],int TMP_N_next[1], 580 | bool enable,bool weight_load_enable,bool initialize,const int IHxIW,const int KxK,const int IFM_numxKxK,const int LayerType,ap_uint<6> trow_loops) 581 | { 582 | if(!enable) 583 | return ; 584 | 585 | const int TN_MIN = MIN(TN,InFM_num - n); 586 | TMP_N_next[0] = n; 587 | 588 | input_load(input,input1,input2,input3, input_buffer, r, c, n, Kernel_stride, Padding, TRow, TCol, Input_w, Input_h, TN_MIN, IHxIW, LayerType,trow_loops); 589 | weight_load_reorg(Weight,weight_buffer,weight_load_enable,m,n,IFM_numxKxK,KxK,Kernel_size,TM_MIN,TN_MIN); 590 | 591 | } 592 | 593 | //////////////////////////////////////////////////T3 end 594 | /* copy_local_beta: for tm in [0, TM_MIN) writes local_beta_buffer[tm] = beta_buffer[m + tm] << InterSubBeta, where the shift amount is first truncated to 4 bits. NOTE(review): presumably this rescales the bias to the intermediate fixed-point format; confirm against the caller. */ 595 | void copy_local_beta(short beta_buffer[MAX_BETA_LENGTH],int local_beta_buffer[MAX_BETA_LENGTH],const int TM_MIN,int m,UCHAR InterSubBeta) 596 | { 597 | ap_uint<4> InterSubBeta_4b = InterSubBeta; 598 | int offset; 599 | int tm; 600 | for(tm = 0,offset = m;tm < TM_MIN;tm++) 601 | { 602 | #pragma HLS PIPELINE 603 | local_beta_buffer[tm] = beta_buffer[offset] << InterSubBeta_4b; 604 | offset++; 605 | } 606 | } 607 | 608 | ///####################################################################################################################### 609 | //Bit-width (Q-format) alignment is still missing here 610 | /* 611 | The input size is unchanged, so it needs no handling; the output, however, does have to be changed 612 | */ 613 | //buffer1 needs to be aligned with buffer0 614 | //In1Sub corresponds to buffer1, In0Sub corresponds to buffer0 615 | 
//In1Sub is the value that gets processed. shortcut(): element-wise sum of two input tiles; for every (tm,tr,tc) below the 5-bit-truncated TM_MIN/TR_MIN/TC_MIN it stores ((in0 << In0Sub) + (in1 << (INTERWIDTH - In1Sub))) >> In0Sub into output_buffer, with both shift amounts truncated to 4 bits. Returns without touching output_buffer when enable is false. 616 | void shortcut(short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width], 617 | int output_buffer[Tm][Tr*PARA][Tc*PARA],const int TM_MIN,const int TR_MIN, 618 | const int TC_MIN,UCHAR In0Sub,UCHAR In1Sub,bool enable){ 619 | //For shortcut only one of the two inputs needs to be aligned with the output; the open question is whether to shift left or right 620 | ap_uint<5> tr,tc,tm; 621 | if(!enable){ 622 | return; 623 | } 624 | 625 | ap_uint<5> TM_MIN_5b = TM_MIN; 626 | ap_uint<5> TR_MIN_5b = TR_MIN; 627 | ap_uint<5> TC_MIN_5b = TC_MIN; 628 | 629 | ap_uint<4> In0Sub_4b = In0Sub; 630 | ap_uint<4> In1Sub_4b = INTERWIDTH - In1Sub; 631 | //The relative magnitude of the two formats is not known, so the difference cannot simply be obtained by subtraction; both operands are first brought to the same position (INTERWIDTH bits), added, and converted back afterwards 632 | 633 | for(tm = 0; tm < TM_MIN_5b;tm++){ 634 | for(tr = 0; tr < TR_MIN_5b;tr++){ 635 | for(tc = 0; tc < TC_MIN_5b; tc++){ 636 | #pragma HLS PIPELINE 637 | int tempt_add0 = input_buffer0[tm][tr][tc] << In0Sub_4b; 638 | int tempt_add1 = input_buffer1[tm][tr][tc] << In1Sub_4b; 639 | //A 32-bit int holds a value of fewer than 20 bits, so no precision is lost; then do the addition 640 | int tempt_out = (tempt_add0 + tempt_add1) >> In0Sub_4b; 641 | //output_buffer[tm][tr][tc] = input_buffer0[tm][tr][tc] + input_buffer1[tm][tr][tc]; 642 | output_buffer[tm][tr][tc] = tempt_out; 643 | //Can an int be assigned to a short? 644 | //So numerically the value is an int 645 | //That leaves one open question: is the right shift needed at all? 
646 | //Try the variant with the right shift first 647 | } 648 | } 649 | } 650 | 651 | } 652 | ///####################################################################################################################### upsample(): replicates every input element input_bufferInput[tm][tr][tc] into an upsample_size x upsample_size patch of output_buffer starting at [tr*upsample_size][tc*upsample_size]; upsample_size is truncated to 2 bits and the TM/TR/TC bounds to 5 bits. Returns immediately when enable is false. 653 | void upsample(short input_bufferInput[Tn][OnChipIB_Height][OnChipIB_Width],int output_buffer[Tm*PARA][Tr*PARA][Tc*PARA], 654 | const int upsample_size,const int TM_MIN,const int TR_MIN, 655 | const int TC_MIN,bool enable){ 656 | ap_uint<5> tr,tc,tm,i,j; 657 | if(!enable){ 658 | return; 659 | } 660 | 661 | ap_uint<2> upsample_size_2b = upsample_size; 662 | 663 | ap_uint<5> TM_MIN_5b = TM_MIN; 664 | ap_uint<5> TR_MIN_5b = TR_MIN; 665 | ap_uint<5> TC_MIN_5b = TC_MIN; 666 | 667 | for(tm = 0; tm < TM_MIN_5b;tm++){ 668 | for(tr = 0; tr < TR_MIN_5b;tr++){ 669 | for(tc = 0; tc < TC_MIN_5b; tc++){ 670 | //#pragma HLS PIPELINE 671 | for(i = 0; i < upsample_size_2b;i++){ 672 | #pragma HLS UNROLL 673 | for(j = 0;j < upsample_size_2b;j++){ 674 | #pragma HLS UNROLL 675 | output_buffer[tm][tr*upsample_size_2b+i][tc*upsample_size_2b+j] = input_bufferInput[tm][tr][tc]; 676 | } 677 | } 678 | } 679 | } 680 | } 681 | } 682 | /* compute: convolution stage. With enable false it only refreshes its static local_beta_buffer through copy_local_beta and returns; otherwise it accumulates the four input-channel products of the current tile into output_buffer for every kernel position. IsNL, reluenable and InterSubOutput are accepted here; any use would have to be in the part of the body that follows. */ 683 | ///####################################################################################################################### 684 | void compute(short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int output_buffer[Tm][Tr*PARA][Tc*PARA], 685 | short weight_buffer[Tm][Tn][K][K],short beta_buffer[MAX_BETA_LENGTH],int TMP_N_next[1], 686 | const int Kernel_size,const int Kernel_stride,int TMP_M, 687 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable,const bool IsNL,const bool reluenable, 688 | UCHAR InterSubBeta,UCHAR WeightAddInputSubInter,UCHAR InterSubOutput) 689 | { 690 | //InterSubOutput does not seem to be used at all in this function? 691 | //The product is shifted right immediately after the multiply; why by this number of bits? 
692 | //Because the result of multiplying the two operands would otherwise exceed the bit width 693 | static int local_beta_buffer[Tm]; 694 | #pragma HLS ARRAY_PARTITION variable=local_beta_buffer complete dim=1 695 | 696 | // static int compute_buffer[Tm][Tr][Tc]; 697 | //#pragma HLS ARRAY_PARTITION variable=compute_buffer complete dim=1 698 | /* Disabled call: only (re)load the shifted biases for output channels starting at TMP_M, then return. */ 699 | if(!enable) 700 | { 701 | copy_local_beta(beta_buffer,local_beta_buffer,TM_MIN,TMP_M,InterSubBeta); 702 | return; 703 | } 704 | 705 | int partial_mul[Tm][Tn]; 706 | #pragma HLS ARRAY_PARTITION variable=partial_mul complete dim=1 707 | #pragma HLS ARRAY_PARTITION variable=partial_mul complete dim=2 708 | 709 | ap_uint<2> i,j; 710 | UCHAR tm,tn; 711 | ap_uint<5> tr,tc; 712 | ap_uint<2> Kernel_size_2b = Kernel_size; 713 | ap_uint<2> Kernel_stride_2b = Kernel_stride; 714 | 715 | ap_uint<5> TR_MIN_5b = TR_MIN; 716 | ap_uint<5> TC_MIN_5b = TC_MIN; 717 | 718 | // ap_uint<4> InterSubBeta_4b = InterSubBeta; 719 | ap_uint<4> WeightAddInputSubInter_4b = WeightAddInputSubInter; 720 | 721 | // assert(InterSubBeta < 16); 722 | // assert(WeightAddInputSubInter < 16); 723 | // assert(InterSubOutput < 16); 724 | 725 | // assert(Kernel_size < 4); 726 | // assert(TR_MIN < 32); 727 | // assert(TC_MIN < 32); 728 | 729 | ap_uint<11> n = TMP_N_next[0]; 730 | // assert(n < 2048); 731 | /* For each kernel tap (i,j) and output pixel (tr,tc), all Tm output channels are updated. The accumulator is seeded from local_beta_buffer on the very first tap of the first input-channel group (i==0, j==0, n==0) and from output_buffer otherwise. The four products are written out explicitly for input channels 0..3. */ 732 | for(i = 0;i < Kernel_size_2b; i++) 733 | for(j = 0;j < Kernel_size_2b; j++) 734 | for(tr = 0;tr < TR_MIN_5b;tr++) 735 | for(tc = 0;tc < TC_MIN_5b;tc++) 736 | { 737 | #pragma HLS PIPELINE 738 | for(tm = 0;tm < Tm;tm++) 739 | { 740 | #pragma HLS DEPENDENCE variable=output_buffer inter false 741 | int tmp_add_result; 742 | if(i==0&&j==0&&n==0) 743 | { 744 | tmp_add_result = local_beta_buffer[tm]; 745 | } 746 | else 747 | tmp_add_result = output_buffer[tm][tr][tc]; 748 | 749 | partial_mul[tm][0] = (weight_buffer[tm][0][i][j]*input_buffer[0][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 750 | partial_mul[tm][1] = 
(weight_buffer[tm][1][i][j]*input_buffer[1][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 751 | partial_mul[tm][2] = (weight_buffer[tm][2][i][j]*input_buffer[2][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 752 | partial_mul[tm][3] = (weight_buffer[tm][3][i][j]*input_buffer[3][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 753 | 754 | int tmp_add1 = partial_mul[tm][0] + partial_mul[tm][1]; 755 | int tmp_add2 = partial_mul[tm][2] + partial_mul[tm][3]; 756 | int tmp_add12 = tmp_add1 + tmp_add2; 757 | output_buffer[tm][tr][tc] = tmp_add_result + tmp_add12; 758 | 759 | // partial_mul[tm][0] = (weight_buffer[tm][0][i][j]*input_buffer[0][tr+i][tc+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 760 | // partial_mul[tm][1] = (weight_buffer[tm][1][i][j]*input_buffer[1][tr+i][tc+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3 761 | // 762 | // compute_buffer[tm][tr][tc] = tmp_add_result + partial_mul[tm][0] + partial_mul[tm][1]; 763 | } 764 | } 765 | } 766 | /* mmcpy_outputport: when tm < mLoop, copies OutputLength ints from output_tmp to Output + OutputOffset with memcpy; otherwise does nothing. */ 767 | //////////////version-0.2 start 768 | void mmcpy_outputport(int *Output,int output_tmp[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop,int OutputOffset,int OutputLength) 769 | { 770 | bool enable = tm < mLoop; 771 | if(!enable) 772 | return; 773 | 774 | memcpy((int *)(Output + OutputOffset),(int *)(output_tmp),OutputLength*sizeof(int)); 775 | } 776 | /* mmcpy_outputport1: same body as mmcpy_outputport. NOTE(review): presumably kept as a separate function so the second output port gets its own hardware instance; confirm. */ 777 | void mmcpy_outputport1(int *Output,int output_tmp[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop,int OutputOffset,int OutputLength) 778 | { 779 | bool enable = tm < mLoop; 780 | if(!enable) 781 | return; 782 | 783 | memcpy((int *)(Output + OutputOffset),(int *)(output_tmp),OutputLength*sizeof(int)); 784 | } 785 | 786 | 787 | /* 788 | Calls the two port-copy functions above, one per output port (intended to run in parallel) 789 | */ 790 | void mmcpy_outputpixel(int *Output,int *Output1,int output_tmp[Tr*PARA*Tc*PARA/4],int output_tmp1[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop1,ap_uint<6> mLoop2,int 
outputoffsetarray[2],int OutputLength,int OutputLength1,bool enable) 791 | { 792 | if(!enable) 793 | { 794 | return; 795 | } 796 | mmcpy_outputport(Output ,output_tmp ,tm,mLoop1,outputoffsetarray[0],OutputLength ); 797 | mmcpy_outputport1(Output1,output_tmp1,tm,mLoop2,outputoffsetarray[1],OutputLength1); 798 | } 799 | 800 | /* 801 | 不知道是干嘛的 802 | */ 803 | void outputpixel2buf(int output_buffer[Tm][Tr*PARA][Tc*PARA],int output_tmp[Tr*PARA*Tc*PARA/4],int output_tmp1[Tr*PARA*Tc*PARA/4],bool IsNL,int InterSubOutput,int LayerType,bool TC_MINe26,int TR_MIN,int TC_MIN,int mLoop,int rLoop, bool init, 804 | int outputoffsetarray[2],int OutputOffset1_sum,int OutputOffset1_sum1,int OutputOffset2_sum,ap_uint<6> tm_next[1],bool enable) 805 | { 806 | if(!enable) 807 | { 808 | return; 809 | } 810 | 811 | tm_next[0] = mLoop; 812 | 813 | ap_uint<4> InterSubOutput_4b = InterSubOutput; 814 | int tmp_output; 815 | int tmp_output_1; 816 | short tmp_output2; 817 | short tmp_output2_1; 818 | int tmp_output3; 819 | int tmp_output3_1; 820 | ap_uint<2> cnt = 0; 821 | short ouput_array[2]; 822 | #pragma HLS ARRAY_PARTITION variable=ouput_array complete dim=1 823 | short ouput_array1[2]; 824 | #pragma HLS ARRAY_PARTITION variable=ouput_array1 complete dim=1 825 | ap_uint<5> tr; 826 | static ap_uint<6> tm; 827 | 828 | ap_uint<5> TC_MIN_5b = TC_MIN; 829 | ap_uint<5> tc; 830 | ap_uint<2> TM_LOOP,tm_count; 831 | ap_uint<4> TR_LOOP,tr_count; 832 | 833 | 834 | //const int para = (LayerType==2) ? 
1:1; 835 | if(init) 836 | { 837 | tm = 0; 838 | } 839 | 840 | if(TC_MINe26) 841 | { 842 | tm = mLoop; 843 | tr = rLoop; 844 | TM_LOOP = 1; 845 | TR_LOOP = 1; 846 | }else 847 | { 848 | tr = 0; 849 | TM_LOOP = 2; 850 | TR_LOOP = SIZE; 851 | } 852 | 853 | ap_uint<8> outputoffset = 0; 854 | ap_uint<8> outputoffset1 = 0; 855 | 856 | for(tm_count = 0;tm_count < TM_LOOP;tm_count++,tm++,tr = 0) 857 | for(tr_count = 0;tr_count < TR_LOOP;tr_count++,tr++) 858 | for(tc = 0;tc < TC_MIN_5b;tc++) 859 | { 860 | #pragma HLS PIPELINE 861 | int tmp = output_buffer[tm][tr][tc]; 862 | int tmp1 = output_buffer[tm + Tm/2][tr][tc]; 863 | if(IsNL&&tmp<0) 864 | { 865 | tmp_output = ((long long)tmp*0xccc)>>15; 866 | }else 867 | { 868 | tmp_output = tmp; 869 | } 870 | 871 | if(IsNL&&tmp1<0) 872 | { 873 | tmp_output_1 = ((long long)tmp1*0xccc)>>15; 874 | }else 875 | { 876 | tmp_output_1 = tmp1; 877 | } 878 | 879 | if(LayerType==0) 880 | { 881 | tmp_output2 = tmp_output >> InterSubOutput_4b; 882 | tmp_output2_1 = tmp_output_1 >> InterSubOutput_4b; 883 | } 884 | else 885 | { 886 | tmp_output2 = tmp_output; 887 | tmp_output2_1 = tmp_output_1; 888 | } 889 | ouput_array[cnt] = tmp_output2; 890 | ouput_array1[cnt] = tmp_output2_1; 891 | cnt++; 892 | if(cnt==2) 893 | { 894 | tmp_output3 = (ouput_array[0] &0x0000FFFF) | 895 | ((ouput_array[1] << 16 )&0xFFFF0000); 896 | tmp_output3_1 = (ouput_array1[0] &0x0000FFFF) | 897 | ((ouput_array1[1] << 16 )&0xFFFF0000); 898 | 899 | output_tmp[outputoffset] = tmp_output3; 900 | outputoffset++; 901 | 902 | output_tmp1[outputoffset1] = tmp_output3_1; 903 | outputoffset1++; 904 | cnt = 0; 905 | } 906 | } 907 | 908 | outputoffsetarray[0] = (OutputOffset1_sum + OutputOffset2_sum)>>1; 909 | outputoffsetarray[1] = (OutputOffset1_sum1 + OutputOffset2_sum)>>1; 910 | 911 | } 912 | 913 | 914 | //para 还没有设置好 915 | void write_back_output_reorg(int output_buffer[Tm][Tr*PARA][Tc*PARA],int *Output,int *Output1,int r,int c,int m,const int Output_w,const int Output_h, 916 | const 
int TM_MIN,const int TR_MIN,const int TC_MIN,const int OHxOW,bool write_flag,const int OutputQ,bool IsNL,int InterSubOutput,int LayerType) 917 | { 918 | static int output_tmp00[Tr*PARA*Tc*PARA/4]; 919 | static int output_tmp01[Tr*PARA*Tc*PARA/4]; 920 | 921 | static int output_tmp10[Tr*PARA*Tc*PARA/4]; 922 | static int output_tmp11[Tr*PARA*Tc*PARA/4]; 923 | 924 | //const int para = (LayerType==2) ? 2:1; 925 | 926 | int tr,tm,tc; 927 | int OutputLength,OutputLength1; 928 | int mLoopc,mLoop,rLoop; 929 | ap_uint<6> mLoop1,mLoop2; 930 | 931 | if(!write_flag) 932 | return; 933 | 934 | // assert(TM_MIN < 64); 935 | assert(TR_MIN < 32); 936 | assert(TC_MIN < 32); 937 | 938 | ap_uint<6> TM_MIN_6b = TM_MIN; 939 | ap_uint<18> OHxOW_18b = OHxOW; 940 | ap_uint<9> Output_w_9b = Output_w; 941 | ap_uint<10> m_10b = m; 942 | ap_uint<9> r_9b = r; 943 | ap_uint<9> c_9b = c; 944 | 945 | // assert(m < 1024); 946 | // assert(r < 512); 947 | // assert(c < 512); 948 | // assert(OHxOW < 512*512); 949 | // assert(Output_w < 512); 950 | 951 | ap_uint<6> TM_MIN_g; 952 | if(TM_MIN_6b==9) 953 | TM_MIN_g = 12; 954 | else 955 | TM_MIN_g = TM_MIN_6b; 956 | 957 | // const int offset = m_10b*OHxOW_18b + r_9b*Output_w_9b + c_9b; 958 | int tempoff; 959 | if(LayerType == 2){ 960 | tempoff = m*OHxOW_18b + r*Output_w_9b*2 + c_9b*2; 961 | } else { 962 | tempoff = m*OHxOW_18b + r*Output_w_9b + c_9b; 963 | } 964 | //const int offset = m_10b*OHxOW_18b + r_9b*Output_w_9b + c_9b; 965 | const int offset = tempoff; 966 | 967 | 968 | bool TM_MINaboveTmdiv2 = TM_MIN_g > Tm/2; 969 | bool TC_MINe26 = TC_MIN == Tr; 970 | 971 | if(TM_MINaboveTmdiv2) 972 | { 973 | mLoop = Tm/2; 974 | mLoop1 = Tm/2; 975 | mLoop2 = TM_MIN_g - Tm/2; 976 | }else 977 | { 978 | mLoop = TM_MIN_g; 979 | mLoop1 = TM_MIN_g; 980 | mLoop2 = 0; 981 | } 982 | mLoopc = mLoop; 983 | 984 | int offset1 = offset + mLoop1*OHxOW_18b; 985 | 986 | int OutputOffset1,OutputOffset2; 987 | int OutputOffset1_sum,OutputOffset1_sum1; 988 | int OutputOffset2_sum; 
989 | 990 | // when TC_MIN==26,burstlength = 13*2/2=13,else 13*13*2/2=169 991 | if(TC_MINe26) 992 | { 993 | OutputLength = Tr/2; 994 | OutputLength1 = Tr/2; 995 | OutputOffset1 = OHxOW_18b; 996 | OutputOffset2 = Output_w_9b; 997 | rLoop = 26; 998 | }else//TMxTRxTC TMx13x13 continues 999 | { 1000 | OutputLength = SIZE*SIZE; 1001 | OutputLength1 = SIZE*SIZE; 1002 | rLoop = 1; 1003 | mLoop = mLoop/2; 1004 | OutputOffset1 = SIZE*SIZE*2; 1005 | OutputOffset2 = 0; 1006 | } 1007 | 1008 | bool pingpong = true; 1009 | int outputoffsetarray[2]; 1010 | #pragma HLS ARRAY_PARTITION variable=outputoffsetarray complete dim=1 1011 | int outputoffsetarray1[2]; 1012 | #pragma HLS ARRAY_PARTITION variable=outputoffsetarray1 complete dim=1 1013 | ap_uint<6> tm_next[1]; 1014 | ap_uint<6> tm_next1[1]; 1015 | bool wb_start_flag = true; 1016 | for(tm = 0,OutputOffset1_sum = offset,OutputOffset1_sum1 = offset1;tm < mLoop;tm++,OutputOffset1_sum += OutputOffset1,OutputOffset1_sum1 += OutputOffset1) 1017 | for(tr = 0,OutputOffset2_sum = 0;tr < rLoop + 1;tr++,OutputOffset2_sum += OutputOffset2,wb_start_flag = false) 1018 | { 1019 | if(pingpong) 1020 | { 1021 | outputpixel2buf( output_buffer, output_tmp00, output_tmp01, IsNL, InterSubOutput, LayerType, TC_MINe26, TR_MIN, TC_MIN, tm, tr,wb_start_flag, 1022 | outputoffsetarray, OutputOffset1_sum, OutputOffset1_sum1, OutputOffset2_sum,tm_next,tr != rLoop); 1023 | mmcpy_outputpixel(Output,Output1, output_tmp10, output_tmp11, tm_next1[0], mLoop1, mLoop2, outputoffsetarray1, OutputLength, OutputLength1,tr != 0); 1024 | pingpong = false; 1025 | }else 1026 | { 1027 | outputpixel2buf( output_buffer, output_tmp10, output_tmp11, IsNL, InterSubOutput, LayerType, TC_MINe26, TR_MIN, TC_MIN, tm, tr,wb_start_flag, 1028 | outputoffsetarray1, OutputOffset1_sum, OutputOffset1_sum1, OutputOffset2_sum,tm_next1,tr != rLoop); 1029 | mmcpy_outputpixel(Output,Output1, output_tmp00, output_tmp01, tm_next[0], mLoop1, mLoop2, outputoffsetarray, OutputLength, 
OutputLength1,tr != 0); 1030 | pingpong = true; 1031 | } 1032 | } 1033 | 1034 | } 1035 | /* 1036 | void pool_yolo2(short Input[Tn][OnChipIB_Height][OnChipIB_Width],int Output[Tm][Tr][Tc], 1037 | const int Kernel_size,const int Kernel_stride, 1038 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable) 1039 | { 1040 | 1041 | if(!enable) 1042 | return; 1043 | 1044 | ap_uint<5> TR_MIN_5b = TR_MIN; 1045 | ap_uint<5> TC_MIN_5b = TC_MIN; 1046 | ap_uint<2> Kernel_stride_2b = Kernel_stride; 1047 | 1048 | // assert(TR_MIN < 32); 1049 | // assert(TC_MIN < 32); 1050 | // assert(Kernel_stride < 4); 1051 | 1052 | ap_uint<2> i,j; 1053 | ap_uint<5> tr,tc; 1054 | // ap_uint<8> i,j,tr,tc; 1055 | int of; 1056 | short tmp[Tn]; 1057 | #pragma HLS ARRAY_PARTITION variable=tmp complete dim=1 1058 | short input_short[Tn]; 1059 | #pragma HLS ARRAY_PARTITION variable=input_short complete dim=1 1060 | 1061 | for(tr = 0;tr < TR_MIN_5b;tr++) 1062 | for(tc = 0;tc < TC_MIN_5b;tc++) 1063 | for(i =0;i < 2; i++) 1064 | for(j = 0;j < 2; j++) 1065 | { 1066 | #pragma HLS PIPELINE 1067 | for( of = 0; of < Tn; of++) 1068 | { 1069 | if(i==0&&j==0) 1070 | tmp[of] = 0x8001; 1071 | input_short[of] = Input[of][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]; 1072 | if(input_short[of] > tmp[of]) 1073 | tmp[of] = input_short[of]; 1074 | 1075 | if(i==1&&j==1) 1076 | Output[of][tr][tc] = tmp[of]; 1077 | } 1078 | } 1079 | } 1080 | 1081 | void reorg_yolo2(short Input[Tn][OnChipIB_Height][OnChipIB_Width],int Output[Tm][Tr][Tc], 1082 | const int Kernel_size,const int Kernel_stride, 1083 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable) 1084 | { 1085 | int x, y,kx,ky; 1086 | unsigned char Yoffset; 1087 | unsigned char Xoffset; 1088 | 1089 | if(!enable) 1090 | return; 1091 | 1092 | // ap_uint<5> TR_MIN_5b = TR_MIN; 1093 | // ap_uint<5> TC_MIN_5b = TC_MIN; 1094 | 1095 | assert(TR_MIN < 32); 1096 | assert(TC_MIN < 32); 1097 | 1098 | for( y = 0; y < TR_MIN; y++) 1099 | for( x = 0; x < TC_MIN; 
x++) 1100 | for(ky= 0;ky < 2; ky++) 1101 | for(kx = 0;kx < 2; kx++) 1102 | { 1103 | #pragma HLS PIPELINE 1104 | Yoffset = (y << 1) + ky; 1105 | Xoffset = (x << 1) + kx; 1106 | 1107 | int in_index = (ky << 1) + kx; 1108 | Output[in_index][y][x] = Input[0][Yoffset][Xoffset]; 1109 | } 1110 | } 1111 | */ /* intra_pingpong_wrapper: per-tile dispatcher on LayerType. LayerType 0 (convolution): returns at once when input_flag is false; otherwise publishes TMP_M and TM_MIN through TMP_X_next and TX_MIN_next and runs nLoops+1 iterations in which copy_input_weight fills one (input_buffer, weight_buffer) pair while compute consumes the other, swapping pairs each iteration; the load is disabled on the last iteration and compute on the first. LayerType 1 (shortcut) and 2 (upsample): publish the tmp_x and tmp_tx_min values saved by the previous call, save the current TMP_M and TM_MIN, load into the buffer set selected by pingpongx (gated by input_flag) and process the other set (gated by process_flag). The static locals keep their values between calls. */ 1112 | void intra_pingpong_wrapper(int *Input,int *Input1,int *Input2,int *Input3,int *Weight, int output_buffer[Tm][Tr*PARA][Tc*PARA],short beta_buffer[MAX_BETA_LENGTH], 1113 | short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width], 1114 | short input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width], 1115 | int InFM_num,int Input_w,int Input_h,int Kernel_size,int Kernel_stride, 1116 | int TMP_R,int TMP_C,int TMP_M,int m,int TM_MIN,int TR_MIN,int TC_MIN,int TN,UCHAR TRow,UCHAR TCol,int Padding, 1117 | int IHxIW,int KxK,int IFM_numxKxK,int nLoops,bool IsNL,int LayerType,int TM,int TMP_X_next[1],int TX_MIN_next[1],bool pingpongx,bool input_flag,bool process_flag, 1118 | UCHAR InterSubBeta,UCHAR WeightAddInputSubInter,UCHAR InterSubOutput,ap_uint<6> trow_loops) 1119 | { 1120 | 1121 | static short weight_buffer0[Tm][Tn][K][K]; 1122 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=1 1123 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=2 1124 | 1125 | static short weight_buffer1[Tm][Tn][K][K]; 1126 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=1 1127 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=2 1128 | 1129 | static int NOP[1]; 1130 | static int tmp_x; 1131 | static int tmp_tx_min; 1132 | 1133 | if(LayerType==0) 1134 | { 1135 | //The conv path needs no change; it was already adapted earlier 1136 | if(!input_flag) 1137 | return; 1138 | TMP_X_next[0] = TMP_M;//consider by the inner-out loop 1139 | TX_MIN_next[0] = TM_MIN;// like above 1140 | 1141 | bool pingpong = 0; 1142 | int TMP_N_next0[1]; 1143 | int TMP_N_next1[1]; 1144 | int 
n; 1145 | int TMP_N; 1146 | for(TMP_N = 0,n = 0;n < nLoops+1; n++,TMP_N += TN) 1147 | { 1148 | if(pingpong == 1) 1149 | { 1150 | copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N, 1151 | TM_MIN,TN,TRow,TCol,Padding,input_buffer1,weight_buffer1,TMP_N_next1,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1152 | compute(input_buffer0,output_buffer,weight_buffer0,beta_buffer,TMP_N_next0,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops, 1153 | InterSubBeta,WeightAddInputSubInter,InterSubOutput); 1154 | pingpong = 0; 1155 | }else 1156 | { 1157 | copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N, 1158 | TM_MIN,TN,TRow,TCol,Padding,input_buffer0,weight_buffer0,TMP_N_next0,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1159 | compute(input_buffer1,output_buffer,weight_buffer1,beta_buffer,TMP_N_next1,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops, 1160 | InterSubBeta,WeightAddInputSubInter,InterSubOutput); 1161 | pingpong = 1; 1162 | } 1163 | } 1164 | } 1165 | else if(LayerType==1) 1166 | { 1167 | //shortcut: note that stride and kernel size must be set to 1; this was validated experimentally on a PC 1168 | /* 1169 | There are two options for aligning the bit widths of shortcut. The first is to align the data when it is written to file and do nothing in this step, but that is too crude 1170 | So option two adds the handling inside the function; fortunately this function already has enough parameters to use, so option two is chosen 1171 | Option two: so how should these values be used? 
1172 | */ 1173 | if(pingpongx==0) 1174 | { 1175 | TMP_X_next[0] = tmp_x; 1176 | TX_MIN_next[0] = tmp_tx_min; 1177 | tmp_x = TMP_M; 1178 | tmp_tx_min = TM_MIN; 1179 | 1180 | //copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1181 | // TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1182 | copy_input_weight(Input,Input1,Input,Input1,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1183 | TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1184 | copy_input_weight(Input2,Input3,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1185 | TM_MIN,TM,TRow,TCol,0,input_buffer00,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1186 | //pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 1187 | //The bit widths are evidently used inside the compute function; first work out what each of them means, then see how it uses them 1188 | /* 1189 | void shortcut(short input_buffer[Tm][Tr][Tc],int output_buffer[Tm][Tr][Tc], 1190 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable) 1191 | */ 1192 | shortcut(input_buffer1,input_buffer10,output_buffer,TM_MIN,TR_MIN,TC_MIN,InterSubOutput,WeightAddInputSubInter,process_flag); 1193 | }else 1194 | { 1195 | TMP_X_next[0] = tmp_x; 1196 | TX_MIN_next[0] = tmp_tx_min; 1197 | tmp_x = TMP_M; 1198 | tmp_tx_min = TM_MIN; 1199 | 1200 | //copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1201 | // TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1202 | copy_input_weight(Input,Input1,Input,Input1,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1203 | 
TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1204 | copy_input_weight(Input2,Input3,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1205 | TM_MIN,TM,TRow,TCol,0,input_buffer10,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1206 | //pool_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 1207 | shortcut(input_buffer0,input_buffer00,output_buffer,TM_MIN,TR_MIN,TC_MIN,InterSubOutput,WeightAddInputSubInter,process_flag); 1208 | } 1209 | 1210 | } 1211 | else if(LayerType==2) 1212 | { 1213 | //upsample needs no bit-width handling 1214 | if(pingpongx==0) 1215 | { 1216 | TMP_X_next[0] = tmp_x; 1217 | TX_MIN_next[0] = tmp_tx_min; 1218 | tmp_x = TMP_M; 1219 | tmp_tx_min = TM_MIN; 1220 | 1221 | copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1222 | TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1223 | //reorg_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 1224 | upsample(input_buffer1,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag); 1225 | }else 1226 | { 1227 | TMP_X_next[0] = tmp_x; 1228 | TX_MIN_next[0] = tmp_tx_min; 1229 | tmp_x = TMP_M; 1230 | tmp_tx_min = TM_MIN; 1231 | 1232 | copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 1233 | TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 1234 | //reorg_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 1235 | upsample(input_buffer0,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag); 1236 | } 1237 | 1238 | } 1239 | 1240 | } 
1241 | 1242 | void copy_beta(short beta_buffer[MAX_BETA_LENGTH],int *Beta,const int OFM_NUM,const int BetaQ) 1243 | { 1244 | static int beta_tmp[MAX_BETA_LENGTH/2]; 1245 | int NUM = (OFM_NUM+1)>>1; 1246 | memcpy(beta_tmp,(int *)Beta,NUM*sizeof(int)); 1247 | int x; 1248 | for(x = 0;x < NUM;x++) 1249 | { 1250 | #pragma HLS PIPELINE 1251 | beta_buffer[2*x] = beta_tmp[x]; 1252 | beta_buffer[2*x+1] = beta_tmp[x]>>16; 1253 | } 1254 | } 1255 | 1256 | /* 1257 | void YOLO2_FPGA( 1258 | int *Input,int *Input1,int *Input2,int *Input3, //Input Address,Using paralized input,four ports 1259 | ,int *Output,int *Output1, //two out Address paralized 1260 | int *Weight,int *Beta //weight and bias Address 1261 | const int InFM_num,const int OutFM_num, //Input size and out size 1262 | const int Kernel_size,const int Kernel_stride, //kernel size and stride 1263 | const int Input_w,const int Input_h,//Input winth and height 1264 | const int output_w,const int output_h, //Out width and height 1265 | const int Padding,//padding value 1266 | ,const bool IsNL,const bool IsBN, //is leaky ReLu,is batch normalization 1267 | const int TM,const int TN,const int TR,const int TC, //accelerate configuration,TM,TN,TR,TC,is setted,but i do not know why set like this 1268 | const int mLoops,const int nLoops,const int rLoops,const int cLoops,//do not know 1269 | const int LayerType,//layertype ,defferent layertype do different operate 1270 | const int InputQ,const int OutputQ,const int WeightQ,const int BetaQ,//quanti weishu 1271 | int trow_loops//do not know,seems not used 1272 | ){ 1273 | 1274 | } 1275 | */ 1276 | 1277 | /* 1278 | 就差shortcut和upsample两层位数对齐了 1279 | upsample不需要对其 1280 | 也就是只差shortcut对其了 1281 | */ 1282 | void YOLO2_FPGA(int *Input,int *Input1,int *Input2,int *Input3,int *Output,int *Output1,int *Weight,int *Beta,const int InFM_num,const int OutFM_num, 1283 | const int Kernel_size,const int Kernel_stride, 1284 | const int Input_w,const int Input_h,const int output_w,const int 
output_h,const int Padding,const bool IsNL,const bool IsBN, 1285 | const int TM,const int TN,const int TR,const int TC, 1286 | const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType, 1287 | const int InputQ,const int OutputQ,const int WeightQ,const int BetaQ,int trow_loops) 1288 | { 1289 | 1290 | #pragma HLS INTERFACE m_axi depth=512 port=Input offset=slave bundle=DATA_BUS1 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64 1291 | #pragma HLS INTERFACE m_axi depth=512 port=Input1 offset=slave bundle=DATA_BUS2 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64 1292 | #pragma HLS INTERFACE m_axi depth=512 port=Input2 offset=slave bundle=DATA_BUS3 num_read_outstanding=1 max_read_burst_length=64 1293 | #pragma HLS INTERFACE m_axi depth=512 port=Input3 offset=slave bundle=DATA_BUS4 num_read_outstanding=1 max_read_burst_length=64 1294 | #pragma HLS INTERFACE m_axi depth=512 port=Output offset=slave bundle=DATA_BUS1 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64 1295 | #pragma HLS INTERFACE m_axi depth=512 port=Output1 offset=slave bundle=DATA_BUS2 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64 1296 | #pragma HLS INTERFACE m_axi depth=512 port=Weight offset=slave bundle=DATA_BUS5 num_read_outstanding=1 max_read_burst_length=128 1297 | #pragma HLS INTERFACE m_axi depth=512 port=Beta offset=slave bundle=DATA_BUS5 num_read_outstanding=1 max_read_burst_length=128 1298 | 1299 | #pragma HLS INTERFACE s_axilite register port=return bundle=CTRL_BUS 1300 | #pragma HLS INTERFACE s_axilite register port=InFM_num bundle=CTRL_BUS 1301 | #pragma HLS INTERFACE s_axilite register port=OutFM_num bundle=CTRL_BUS 1302 | #pragma HLS INTERFACE s_axilite register port=Kernel_size bundle=CTRL_BUS 1303 | #pragma HLS INTERFACE s_axilite register port=Kernel_stride 
bundle=CTRL_BUS 1304 | #pragma HLS INTERFACE s_axilite register port=Input_w bundle=CTRL_BUS 1305 | #pragma HLS INTERFACE s_axilite register port=Input_h bundle=CTRL_BUS 1306 | #pragma HLS INTERFACE s_axilite register port=output_w bundle=CTRL_BUS 1307 | #pragma HLS INTERFACE s_axilite register port=output_h bundle=CTRL_BUS 1308 | #pragma HLS INTERFACE s_axilite register port=Padding bundle=CTRL_BUS 1309 | #pragma HLS INTERFACE s_axilite register port=IsNL bundle=CTRL_BUS 1310 | #pragma HLS INTERFACE s_axilite register port=IsBN bundle=CTRL_BUS 1311 | #pragma HLS INTERFACE s_axilite register port=TM bundle=CTRL_BUS 1312 | #pragma HLS INTERFACE s_axilite register port=TN bundle=CTRL_BUS 1313 | #pragma HLS INTERFACE s_axilite register port=TR bundle=CTRL_BUS 1314 | #pragma HLS INTERFACE s_axilite register port=TC bundle=CTRL_BUS 1315 | #pragma HLS INTERFACE s_axilite register port=mLoops bundle=CTRL_BUS 1316 | #pragma HLS INTERFACE s_axilite register port=nLoops bundle=CTRL_BUS 1317 | #pragma HLS INTERFACE s_axilite register port=rLoops bundle=CTRL_BUS 1318 | #pragma HLS INTERFACE s_axilite register port=cLoops bundle=CTRL_BUS 1319 | #pragma HLS INTERFACE s_axilite register port=LayerType bundle=CTRL_BUS 1320 | #pragma HLS INTERFACE s_axilite register port=InputQ bundle=CTRL_BUS 1321 | #pragma HLS INTERFACE s_axilite register port=OutputQ bundle=CTRL_BUS 1322 | #pragma HLS INTERFACE s_axilite register port=WeightQ bundle=CTRL_BUS 1323 | #pragma HLS INTERFACE s_axilite register port=BetaQ bundle=CTRL_BUS 1324 | #pragma HLS INTERFACE s_axilite register port=trow_loops bundle=CTRL_BUS 1325 | 1326 | #pragma HLS INTERFACE s_axilite register port=Input bundle=CTRL_BUS 1327 | #pragma HLS INTERFACE s_axilite register port=Output bundle=CTRL_BUS 1328 | #pragma HLS INTERFACE s_axilite register port=Weight bundle=CTRL_BUS 1329 | #pragma HLS INTERFACE s_axilite register port=Beta bundle=CTRL_BUS 1330 | 1331 | assert(Kernel_size < 4); 1332 | assert(Kernel_stride < 4); 1333 | 
assert(TR < 32); 1334 | assert(TC < 32); 1335 | assert(InFM_num < 2048); 1336 | assert(OutFM_num < 2048); 1337 | assert(output_h < 512); 1338 | assert(output_w < 512); 1339 | assert(Input_h < 1024);//gg///?????? 1340 | assert(Input_w < 512); 1341 | 1342 | assert(WeightQ < 32); 1343 | assert(InputQ < 32); 1344 | assert(OutputQ < 32); 1345 | assert(BetaQ < 32); 1346 | 1347 | ap_uint<9> output_h_9b = output_h; 1348 | ap_uint<9> output_w_9b = output_w; 1349 | ap_uint<5> TR_5b = TR; 1350 | ap_uint<5> TC_5b = TC; 1351 | ap_uint<2> Kernel_stride_2b = Kernel_stride; 1352 | ap_uint<2> Kernel_size_2b = Kernel_size; 1353 | ap_uint<11> InFM_num_11b = InFM_num; 1354 | ap_uint<10> Input_h_10b = Input_h; 1355 | ap_uint<9> Input_w_9b = Input_w; 1356 | ap_uint<6> trow_loops_6b = trow_loops; 1357 | 1358 | UCHAR tmprow,tmpcol; 1359 | if(LayerType==2){ 1360 | tmprow = TR_5b; 1361 | tmpcol = TC_5b; 1362 | } else { 1363 | tmprow = (TR_5b-1)*Kernel_stride_2b+Kernel_size_2b; 1364 | tmpcol = (TC_5b-1)*Kernel_stride_2b+Kernel_size_2b; 1365 | } 1366 | const UCHAR TRow = tmprow; 1367 | const UCHAR TCol = tmpcol; 1368 | 1369 | 1370 | const int OHxOW = output_h_9b*output_w_9b; 1371 | //const UCHAR TRow = (TR_5b-1)*Kernel_stride_2b+Kernel_size_2b; 1372 | //const UCHAR TCol = (TC_5b-1)*Kernel_stride_2b+Kernel_size_2b; 1373 | const int IHxIW = Input_h_10b*Input_w_9b; 1374 | const int KxK = Kernel_size_2b*Kernel_size_2b; 1375 | assert(KxK < 10); 1376 | ap_uint<4> KxK_4b = KxK; 1377 | const int IFM_numxKxK = InFM_num_11b*KxK_4b; 1378 | const int mLoops_add1 = mLoops + 1; 1379 | const int mLoops_add2 = mLoops + 2; 1380 | const int mLoops_bound = LayerType ? 
mLoops_add2: mLoops_add1; 1381 | //temp_inputQ[l.index+1], temp_inputQ[i], INTERWIDTH, INTERWIDTH, trow_loops 1382 | //const int InputQ, const int OutputQ, const int WeightQ, const int BetaQ, int trow_loops 1383 | const UCHAR InterSubBeta = INTERWIDTH - BetaQ;//总的与bias值的偏移量 1384 | const UCHAR WeightAddInputSubInter = WeightQ + InputQ - INTERWIDTH;//输入与权重相乘后大于差值的偏移量 1385 | const UCHAR InterSubOutput = INTERWIDTH - OutputQ;//超过输出的偏移量,虽然处理与写回看起来都用到了,但是实际上只有写回能用到 1386 | /* 1387 | 在shortcut里面 InterSubBeta = INTERWIDTH-INTERWIDTH = 0; 1388 | WeightAddInputSubInter = temp_inputQ[l.index+1];//这个可以直接利用起来,用INTERWIDTH-它 定义成一个值 1389 | InterSubOutput = INTERWIDTH-temp_inputQ[i];;直接利用 1390 | 那么这样子就是两者距离20位的差距,都左移,按放大处理,加法以后再处理成输出值???//需要在这里输出吗 1391 | 1392 | 在conv内 InterSubBeta > 0 1393 | WeightAddInputSubInter 较小 1394 | InterSubOutput 与上相等 1395 | */ 1396 | 1397 | 1398 | assert(InterSubBeta < 16); 1399 | assert(WeightAddInputSubInter < 16); 1400 | assert(InterSubOutput < 16); 1401 | 1402 | //assert(TRow < 256); 1403 | //assert(TCol < 256); 1404 | 1405 | static short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width]; 1406 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1 1407 | 1408 | static short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width]; 1409 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1 1410 | 1411 | //mycode 1412 | static short input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width]; 1413 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1 1414 | 1415 | static short input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width]; 1416 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1 1417 | //end 1418 | static int output_buffer0[Tm][Tr*PARA][Tc*PARA]; 1419 | #pragma HLS ARRAY_PARTITION variable=output_buffer0 complete dim=1 1420 | 1421 | static int output_buffer1[Tm][Tr*PARA][Tc*PARA]; 1422 | #pragma HLS ARRAY_PARTITION variable=output_buffer1 complete dim=1 1423 | 1424 | static short 
beta_buffer[MAX_BETA_LENGTH]; 1425 | 1426 | int r,c,m; 1427 | /////////////////////////////////param 1428 | int TMP_R,TMP_C,TMP_M; 1429 | int TM_MIN,TR_MIN,TC_MIN; 1430 | /////////////////////////////////////// 1431 | 1432 | int TMP_M_next0[1]; 1433 | int TMP_M_next1[1]; 1434 | int TM_MIN_next0[1]; 1435 | int TM_MIN_next1[1]; 1436 | bool pingpongm; 1437 | 1438 | const int para = (LayerType==2) ? 2 : 1; 1439 | 1440 | if(LayerType==0) 1441 | copy_beta(beta_buffer,Beta,OutFM_num,BetaQ); 1442 | 1443 | for(TMP_R = 0,r = 0; r < rLoops; r++, TMP_R += TR) 1444 | { 1445 | TR_MIN = MIN(TR,output_h -TMP_R); 1446 | for(TMP_C = 0,c = 0; c < cLoops; c++,TMP_C += TC) 1447 | { 1448 | TC_MIN = MIN(TC,output_w -TMP_C); 1449 | pingpongm = 0; 1450 | for(TMP_M = 0, m = 0; m < mLoops_bound; m++,TMP_M += TM) 1451 | { 1452 | TM_MIN = MIN(TM,OutFM_num-TMP_M); 1453 | if(LayerType!=0) TM_MIN = Tn; 1454 | 1455 | bool MneZero = (m!=0); 1456 | bool MneOne = (m!=1); 1457 | bool MnemLoops = (m!=mLoops); 1458 | bool MneMLoopsaddOne = (m!=mLoops_add1); 1459 | bool input_flag = LayerType ? MnemLoops&&MneMLoopsaddOne: MnemLoops; 1460 | bool process_flag = LayerType ? MneZero&&MneMLoopsaddOne : MnemLoops; 1461 | bool write_flag = LayerType ? 
MneZero&&MneOne : MneZero; 1462 | 1463 | if(pingpongm==0) 1464 | { 1465 | intra_pingpong_wrapper(Input,Input1,Input2,Input3,Weight,output_buffer1,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10, 1466 | InFM_num, Input_w, Input_h, Kernel_size, Kernel_stride, 1467 | TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next1,TM_MIN_next1, pingpongm, input_flag, 1468 | process_flag,InterSubBeta,WeightAddInputSubInter,InterSubOutput,trow_loops_6b); 1469 | 1470 | write_back_output_reorg(output_buffer0,Output,Output1,TMP_R,TMP_C,TMP_M_next0[0],output_w,output_h,TM_MIN_next0[0],TR_MIN*para,TC_MIN*para,OHxOW,write_flag,OutputQ, IsNL, InterSubOutput, LayerType); 1471 | pingpongm = 1; 1472 | }else 1473 | { 1474 | intra_pingpong_wrapper(Input,Input1,Input2,Input3,Weight,output_buffer0,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10, 1475 | InFM_num, Input_w, Input_h, Kernel_size, Kernel_stride, 1476 | TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next0,TM_MIN_next0, pingpongm, input_flag, 1477 | process_flag,InterSubBeta,WeightAddInputSubInter,InterSubOutput,trow_loops_6b); 1478 | 1479 | write_back_output_reorg(output_buffer1,Output,Output1,TMP_R,TMP_C,TMP_M_next1[0],output_w,output_h,TM_MIN_next1[0],TR_MIN*para,TC_MIN*para,OHxOW,write_flag,OutputQ, IsNL, InterSubOutput, LayerType); 1480 | pingpongm = 0; 1481 | } 1482 | 1483 | } 1484 | } 1485 | } 1486 | } 1487 | ////////////////////////////////////////////20181229 n4m32 v2 without input and reorg opt end input opt ok relu comb ok // input opt ok //output opt ok //weight opt ok (5)n4m32i4o2 ok end 1488 | -------------------------------------------------------------------------------- /soft_version/Step02/yolov3_acc_sim.h: -------------------------------------------------------------------------------- 1 | 
///////////////////////////////////////////////////////////////////////20181229 anti-reorg start => KxKxTmxTn
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))
#define S 2
#define K 3

#define HALFWID 208
#define ATOMWID 13
#define BLOCK 512

#define Tn 4
#define Tm 32
#define Tr 26
//#define Tr 38
#define Tc 26
//#define Tc 38
#define OnChipIB_Width  ((Tc-1)*S+K)
#define OnChipIB_Height ((Tr-1)*S+K)
#define MAX_BETA_LENGTH (1024)
#define PARA 1

#define REORG_GEN
//#define REORG_TEST

//#define UPSAMPLE_TEST

/*
 About TR versus the on-chip input tile height (author's design notes):
 TR is smaller than the on-chip tile because every YOLOv2 layer keeps or
 shrinks the spatial size and there is no upsample layer. YOLOv3 does have
 upsample, which enlarges the map, so for upsample the output tile would have
 to be 2x2 times the input tile. To stay compatible with the rest of the code
 the buffer sizes are left alone: the output tile is filled completely and the
 input loading and loop control are adjusted instead.

 Shortcut fits the same scheme: apart from being an addition it keeps the size
 unchanged, so the buffers can be reused and only the input loading needs some
 setup.
*/

//////////////////////////////////////////////////T3 start

/*
 Load one input tile of Tn x TRow x TCol values into the on-chip buffer.

 input, input_buffer   flat source feature maps / destination tile
 r, c, n               row, column and channel position of the current tile
 Kernel_stride, Padding kernel stride and padding amount (both ignored for
                       LayerType 2, where the tile origin is simply (r, c))
 TRow, TCol            number of tile rows / columns to fill
 Input_w, Input_h      input width and height
 TN_MIN                number of valid channels in this tile
 IHxIW                 size of one channel plane (Input_h * Input_w)
 LayerType             2 selects the unscaled origin; per the author it has
                       no other use in the v3 code

 Every position that falls outside the image, and every channel at or beyond
 TN_MIN, is written as 0. Entries beyond TRow/TCol are left untouched.

 The previous implementation staged whole rows with memcpy before masking,
 which read before the start / past the end of `input` whenever the tile
 touched a padded border. Each element is now read only after its coordinates
 have been validated; the values written are unchanged.
*/
void input_load(float *input,float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int r,int c,int n,int Kernel_stride,int Padding,int TRow,int TCol,int Input_w,int Input_h,int TN_MIN,int IHxIW,int LayerType)
{
	int ch,row,col;
	int Coffset;
	int Roffset;

	if(LayerType == 2){
		Coffset = c;
		Roffset = r;
	} else {
		Coffset = c*Kernel_stride - Padding;
		Roffset = r*Kernel_stride - Padding;
	}

	const float pad_value = 0;

#ifdef UPSAMPLE_TEST
	if(LayerType==2){
		const int CurrentOffset = n*IHxIW + Roffset*Input_w + Coffset;
		printf("r = %d,c = %d,n = %d,Coffset = %d,Roffset = %d,CurrentOffset = %d,TRow = %d\n",r,c,n,Coffset,Roffset,CurrentOffset,TRow);
	}
#endif

	for(ch = 0;ch < Tn; ch++)
	{
		const int channel_valid = (ch < TN_MIN);
		const int plane_base = (n + ch)*IHxIW;

		for(row = 0;row < TRow; row++)
		{
			const int yoffset = Roffset + row;
			const int row_valid = (yoffset >= 0)&&(yoffset < Input_h);

			for(col = 0;col < TCol; col++)
			{
				const int xoffset = Coffset + col;
				const int col_valid = (xoffset >= 0)&&(xoffset < Input_w);

				if(channel_valid && row_valid && col_valid)
					input_buffer[ch][row][col] = input[plane_base + yoffset*Input_w + xoffset];
				else
					input_buffer[ch][row][col] = pad_value;
			}
		}
	}
}
if(!weight_load_enable) 125 | return; 126 | 127 | const int Woffset = m*IFM_numxKxK + n*KxK; 128 | 129 | int weight_memcpy_offset = 0; 130 | for(t1 = 0;t1 < TM_MIN; t1++) 131 | for(t2 = 0;t2 < TN_MIN; t2++) 132 | { 133 | memcpy((float *)(weight_memcpy_buffer + weight_memcpy_offset),(float *)(Weight + Woffset + t1*IFM_numxKxK + t2*KxK),KxK*sizeof(float)); 134 | weight_memcpy_offset += KxK; 135 | } 136 | 137 | weight_memcpy_offset = 0; 138 | for(t1 = 0;t1 < Tm; t1++) 139 | for(t2 = 0;t2 < Tn; t2++) 140 | for(t3 = 0;t3 tmp[of]) 461 | tmp[of] = Input[of][tr*Kernel_stride+i][tc*Kernel_stride+j]; 462 | 463 | if(i==1&&j==1) 464 | Output[of][tr][tc] = tmp[of]; 465 | } 466 | } 467 | 468 | } 469 | 470 | void reorg_yolo2(float Input[Tn][OnChipIB_Height][OnChipIB_Width],float Output[Tm][Tr][Tc], 471 | const int Kernel_size,const int Kernel_stride, 472 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable) 473 | { 474 | int x, y,kx,ky; 475 | unsigned char Yoffset; 476 | unsigned char Xoffset; 477 | 478 | if(!enable) 479 | return; 480 | 481 | for( y = 0; y < TR_MIN; y++) 482 | for( x = 0; x < TC_MIN; x++) 483 | for(ky= 0;ky < 2; ky++) 484 | for(kx = 0;kx < 2; kx++) 485 | { 486 | #pragma HLS PIPELINE 487 | Yoffset = (y << 1) + ky; 488 | Xoffset = (x << 1) + kx; 489 | 490 | int in_index = (ky << 1) + kx; 491 | Output[in_index][y][x] = Input[0][Yoffset][Xoffset]; 492 | } 493 | } 494 | */ 495 | void intra_pingpong_wrapper(float *Input0,float *Input1,float *Weight, float output_buffer[Tm][Tr*PARA][Tc*PARA],float beta_buffer[MAX_BETA_LENGTH], 496 | float input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],float input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width], 497 | float input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width],float input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width], 498 | int InFM_num,int Input_w,int Input_h,int OutFM_num,int Kernel_size,int Kernel_stride, 499 | int TMP_R,int TMP_C,int TMP_M,int m,int TM_MIN,int TR_MIN,int TC_MIN,int TN,int TRow,int TCol,int 
Padding, 500 | int IHxIW,int KxK,int IFM_numxKxK,int nLoops,bool IsNL,int LayerType,int TM,int TMP_X_next[1],int TX_MIN_next[1],bool pingpongx,bool input_flag,bool process_flag) 501 | { 502 | static float weight_buffer0[Tm][Tn][K][K]; 503 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=1 504 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=2 505 | 506 | static float weight_buffer1[Tm][Tn][K][K]; 507 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=1 508 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=2 509 | 510 | static int NOP[1]; 511 | static int tmp_x; 512 | static int tmp_tx_min; 513 | //printf("TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",TM_MIN,TR_MIN,TC_MIN); 514 | //printf("intra OK 2\n"); 515 | if(LayerType==0) 516 | { 517 | 518 | if(!input_flag) 519 | return; 520 | TMP_X_next[0] = TMP_M;//consider by the inner-out loop 521 | TX_MIN_next[0] = TM_MIN;// like above 522 | 523 | 524 | bool pingpong = 0; 525 | int TMP_N_next0[1]; 526 | int TMP_N_next1[1]; 527 | int n; 528 | int TMP_N; 529 | for(TMP_N = 0,n = 0;n < nLoops+1; n++,TMP_N += TN) 530 | { 531 | if(pingpong == 1) 532 | { 533 | copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N, 534 | TM_MIN,TN,TRow,TCol,Padding,input_buffer1,weight_buffer1,TMP_N_next1,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType); 535 | compute(input_buffer0,output_buffer,weight_buffer0,beta_buffer,TMP_N_next0,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops); 536 | pingpong = 0; 537 | }else 538 | { 539 | copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N, 540 | TM_MIN,TN,TRow,TCol,Padding,input_buffer0,weight_buffer0,TMP_N_next0,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType); 541 | 
compute(input_buffer1,output_buffer,weight_buffer1,beta_buffer,TMP_N_next1,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops); 542 | pingpong = 1; 543 | } 544 | } 545 | } 546 | else if(LayerType==1) 547 | { 548 | if(pingpongx==0) 549 | { 550 | TMP_X_next[0] = tmp_x; 551 | TX_MIN_next[0] = tmp_tx_min; 552 | tmp_x = TMP_M; 553 | tmp_tx_min = TM_MIN; 554 | 555 | //copy_input_weight(Input,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 556 | // TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 557 | copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 558 | TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 559 | copy_input_weight(Input1,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 560 | TM_MIN,TM,TRow,TCol,0,input_buffer00,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 561 | //pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 562 | /* 563 | void shortcut(short input_buffer[Tm][Tr][Tc],int output_buffer[Tm][Tr][Tc], 564 | const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable) 565 | */ 566 | shortcut(input_buffer1,input_buffer10,output_buffer,TM_MIN,TR_MIN,TC_MIN,process_flag); 567 | //pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 568 | }else 569 | { 570 | TMP_X_next[0] = tmp_x; 571 | TX_MIN_next[0] = tmp_tx_min; 572 | tmp_x = TMP_M; 573 | tmp_tx_min = TM_MIN; 574 | 575 | //copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 576 | // TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops); 577 | 
copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 578 | TM_MIN,TM,TRow,TCol,0,input_buffer1,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 579 | copy_input_weight(Input1,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 580 | TM_MIN,TM,TRow,TCol,0,input_buffer10,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 581 | //pool_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 582 | shortcut(input_buffer0,input_buffer00,output_buffer,TM_MIN,TR_MIN,TC_MIN,process_flag); 583 | } 584 | 585 | } 586 | else if(LayerType==2) 587 | { 588 | if(pingpongx==0) 589 | { 590 | TMP_X_next[0] = tmp_x; 591 | TX_MIN_next[0] = tmp_tx_min; 592 | tmp_x = TMP_M; 593 | tmp_tx_min = TM_MIN; 594 | 595 | copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 596 | TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 597 | //reorg_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 598 | #ifdef UPSAMPLE_TEST 599 | for(int i = 0; i < 4; i++){ 600 | for(int j = 0; j < 13; j ++){ 601 | for(int k = 0;k < 13;k++){ 602 | printf("%f,",input_buffer0[i][j][k]); 603 | } 604 | printf("\n"); 605 | } 606 | printf("\n"); 607 | } 608 | printf("\n\n"); 609 | #endif 610 | upsample(input_buffer1,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag); 611 | }else 612 | { 613 | TMP_X_next[0] = tmp_x; 614 | TX_MIN_next[0] = tmp_tx_min; 615 | tmp_x = TMP_M; 616 | tmp_tx_min = TM_MIN; 617 | 618 | copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M, 619 | TM_MIN,TM,TRow,TCol,0,input_buffer1,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType); 620 | 
//reorg_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag); 621 | #ifdef UPSAMPLE_TEST 622 | for(int i = 0; i < 4; i++){ 623 | for(int j = 0; j < 13; j ++){ 624 | for(int k = 0;k < 13;k++){ 625 | printf("%f,",input_buffer1[i][j][k]); 626 | } 627 | printf("\n"); 628 | } 629 | printf("\n"); 630 | } 631 | printf("\n\n"); 632 | #endif 633 | upsample(input_buffer0,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag); 634 | 635 | } 636 | printf("TRow = %d,TMP_R = %d,TR_MIN = %d,TC_MIN = %d,Kernel_stride = %d\n",TRow,TMP_R,TR_MIN,TC_MIN,Kernel_stride); 637 | //printf("TM_MIN = %d,TR_MIN = %d,TC_MIN = %d\n",TM_MIN,TR_MIN,TC_MIN); 638 | // for(int i = 0;i < 26*26;i++){ 639 | // printf("output_buffer[%d][%d][%d] = %f\n",(int)i/(26*26),(int)(i/26),(int)(i%26),output_buffer[0][(int)(i/26)][(int)(i%26)]); 640 | // } 641 | 642 | } 643 | 644 | } 645 | /* 646 | float *Input0,float *Input1,float *Output,float *Weight,float *Beta,输入输出权值偏移数组地址 647 | ,const int InFM_num,const int OutFM_num, 输入的通道总数,输出通道总数,在v2代码里reorg的输入整个是按照一个channel处理的 648 | const int Kernel_size,const int Kernel_stride,卷积核大小步长 649 | const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN,输入长和宽,是否填充,是否relu是否批量正则化 650 | const int TM,const int TN,const int TR,const int TC,计算好的参数 651 | const int mLoops,const int nLoops,const int rLoops,const int cLoops,同上,也就是加速器的设计 652 | ,const int LayerType 层类型 653 | */ 654 | 655 | /* 656 | 为什么需要Loop bound?为什么又和类型有关系? 
657 | shortcut内KxK=0是不行的,方法1:在输入的时候设置,使用的时候设置为1 658 | const int TRow = (TR-1)*Kernel_stride+Kernel_size; const int TCol = (TC-1)*Kernel_stride+Kernel_size; 659 | 这两个在shortcut也成了0;,这里我也把kernel_stride设置成了1,原因是为了保持不变; 660 | 661 | 接下来就是LoopBound的设置了,我个人觉得这个和conv差不多所以对bound设置成1,他是什么含义咱们先不管 662 | 验一波输出 663 | 664 | */ 665 | void YOLO2_FPGA(float *Input0,float *Input1,float *Output,float *Weight,float *Beta,const int InFM_num,const int OutFM_num, 666 | const int Kernel_size,const int Kernel_stride, 667 | const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN, 668 | const int TM,const int TN,const int TR,const int TC, 669 | const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType) 670 | { 671 | //const int output_w = (Input_w - Kernel_size + 2*Padding)/Kernel_stride + 1 ; 672 | //const int output_h = (Input_h - Kernel_size + 2*Padding)/Kernel_stride + 1 ; 673 | int output_w; 674 | int output_h; 675 | int temptrow,temptcol; 676 | if(LayerType==0){ 677 | output_w = (Input_w - Kernel_size + (Padding << 1))/Kernel_stride + 1 ; 678 | output_h = (Input_h - Kernel_size + (Padding << 1))/Kernel_stride + 1 ; 679 | } else if(LayerType==1) 680 | { 681 | output_w = Input_w; 682 | output_h = Input_h; 683 | } else if(LayerType == 2){ 684 | //you dian wen ti 685 | output_w = Input_w*Kernel_stride; 686 | output_h = Input_h*Kernel_stride; 687 | } 688 | //This is ok! 
689 | //printf("output_w=%d,output_h=%d\n\n",output_w,output_h); 690 | 691 | const int OHxOW = output_h*output_w; 692 | if(LayerType==2){ 693 | //temptrow = (TR+1-Kernel_size)/Kernel_stride; 694 | temptrow = TR; 695 | temptcol = TC; 696 | //temptcol = (TC+1-Kernel_size)/Kernel_stride; 697 | } else { 698 | temptrow = (TR-1)*Kernel_stride+Kernel_size; 699 | temptcol = (TC-1)*Kernel_stride+Kernel_size; 700 | } 701 | const int TRow = temptrow; 702 | const int TCol = temptcol; 703 | const int IHxIW = Input_h*Input_w; 704 | const int KxK = Kernel_size*Kernel_size; 705 | const int IFM_numxKxK = InFM_num*KxK; 706 | const int mLoops_bound = (LayerType) ? (mLoops +2): (mLoops + 1); 707 | //const int mLoops_bound = (mLoops + 1); 708 | 709 | 710 | printf("output_w=%d,output_h=%d,TRow=%d,TCol=%d,IHxIW=%d,KxK=%d,IFM_numxKxK=%d,mLoops_bound=%d\n\n",output_w,output_h,TRow,TCol,IHxIW,KxK,IFM_numxKxK,mLoops_bound); 711 | 712 | static float input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width]; 713 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1 714 | 715 | static float input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width]; 716 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1 717 | 718 | static float input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width]; 719 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1 720 | 721 | static float input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width]; 722 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1 723 | 724 | static float output_buffer[Tm][Tr*PARA][Tc*PARA]; 725 | #pragma HLS ARRAY_PARTITION variable=output_buffer complete dim=1 726 | 727 | static float output_buffer1[Tm][Tr*PARA][Tc*PARA]; 728 | #pragma HLS ARRAY_PARTITION variable=output_buffer1 complete dim=1 729 | 730 | static float beta_buffer[MAX_BETA_LENGTH]; 731 | 732 | int r,c,m; 733 | /////////////////////////////////param 734 | int TMP_R,TMP_C,TMP_M; 735 | int TM_MIN,TR_MIN,TC_MIN; 736 | 
/////////////////////////////////////// 737 | 738 | int TMP_M_next0[1]; 739 | int TMP_M_next1[1]; 740 | int TM_MIN_next0[1]; 741 | int TM_MIN_next1[1]; 742 | bool pingpongm; 743 | 744 | if(LayerType==0) 745 | memcpy(beta_buffer,Beta,OutFM_num*sizeof(float)); 746 | 747 | /* 748 | Loops都是设置好的,那么究竟是哪里出了问题呢 749 | */ 750 | printf("rLoops*cLoops*mLoops_bound = %d\n",rLoops*cLoops*mLoops_bound); 751 | for(TMP_R = 0,r = 0; r < rLoops; r++, TMP_R += TR) 752 | { 753 | TR_MIN = MIN(TR,output_h -TMP_R); 754 | for(TMP_C = 0,c = 0; c < cLoops; c++,TMP_C += TC) 755 | { 756 | TC_MIN = MIN(TC,output_w -TMP_C); 757 | pingpongm = 0; 758 | for(TMP_M = 0, m = 0; m < mLoops_bound; m++,TMP_M += TM) 759 | { 760 | TM_MIN = MIN(TM,OutFM_num-TMP_M); 761 | //if(LayerType == 1) 762 | if(LayerType!=0) TM_MIN = Tn; 763 | //if(LayerType==2) printf("TMP_R=%d,output_h=%d,output_h -TMP_R=%d,TR_MIN=%d\t TMP_C=%d,output_w=%d,output_w -TMP_C=%d,TC_MIN=%d\t TMP_M=%d,OutFM_num=%d,OutFM_num-TMP_M=%d,TM_MIN=%d\n",TMP_R,output_h,output_h-TMP_R,TR_MIN, TMP_C,output_w,output_w-TMP_C,TC_MIN, TMP_M,OutFM_num,OutFM_num-TMP_M,TM_MIN); 764 | bool MneZero = (m!=0); 765 | bool MneOne = (m!=1); 766 | bool MnemLoops = (m!=mLoops); 767 | bool MneMLoopsaddOne = (m!=(mLoops+1)); 768 | bool input_flag = LayerType ? MnemLoops&&MneMLoopsaddOne: MnemLoops; 769 | bool process_flag = LayerType ? MneZero&&MneMLoopsaddOne : MnemLoops; 770 | bool write_flag = LayerType ? 
MneZero&&MneOne : MneZero; 771 | //printf("FPGA OK 1\n"); 772 | if(pingpongm==0) 773 | { 774 | intra_pingpong_wrapper(Input0,Input1,Weight,output_buffer1,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10, 775 | InFM_num, Input_w, Input_h, OutFM_num, Kernel_size, Kernel_stride, 776 | TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next1,TM_MIN_next1, pingpongm, input_flag, process_flag); 777 | 778 | write_back_output_reorg(output_buffer,Output,TMP_R,TMP_C,TMP_M_next0[0],output_w,output_h,TM_MIN_next0[0],TR_MIN,TC_MIN,OHxOW,LayerType,write_flag); 779 | pingpongm = 1; 780 | }else 781 | { 782 | intra_pingpong_wrapper(Input0,Input1,Weight,output_buffer,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10, 783 | InFM_num, Input_w, Input_h, OutFM_num, Kernel_size, Kernel_stride, 784 | TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next0,TM_MIN_next0, pingpongm, input_flag, process_flag); 785 | 786 | write_back_output_reorg(output_buffer1,Output,TMP_R,TMP_C,TMP_M_next1[0],output_w,output_h,TM_MIN_next1[0],TR_MIN,TC_MIN,OHxOW,LayerType,write_flag); 787 | pingpongm = 0; 788 | } 789 | 790 | } 791 | } 792 | } 793 | } 794 | 795 | int Weight_reorgnaization_anti(float *Weight,float *Weight_reorg,float* Alpha,int IFM_NUM,int OFM_NUM,int Kernel_size,int TM,int TN,const bool IsBN) 796 | { 797 | const int KxK = Kernel_size*Kernel_size; 798 | const int IFM_NUMxKxK = IFM_NUM*KxK; 799 | 800 | int m,n; 801 | int tm,tn,tk; 802 | 803 | float weight_buffer[Tm*Tn*K*K]; 804 | float weight_buffer2[Tm*Tn*K*K]; 805 | 806 | int TM_MIN,TN_MIN; 807 | int offset = 0; 808 | 809 | for( m = 0; m < OFM_NUM; m += TM) 810 | { 811 | TM_MIN = MIN(TM,OFM_NUM - m); 812 | 813 | for(n = 0;n < IFM_NUM; n += TN) 814 | { 815 | TN_MIN = MIN(TN,IFM_NUM - n); 816 | 817 | int Woffset = m*IFM_NUMxKxK + n*KxK; 818 | 819 | for(tm 
= 0;tm < TM_MIN; tm++) 820 | { 821 | memcpy((float *)(weight_buffer + tm*TN_MIN*KxK), 822 | (float *)(Weight + tm*IFM_NUMxKxK + Woffset),TN_MIN*KxK*sizeof(float)); 823 | } 824 | 825 | int TN_MINxTM_MIN = TN_MIN*TM_MIN; 826 | 827 | for(tk = 0;tk < KxK; tk++) 828 | for(tm = 0;tm < TM_MIN; tm++) 829 | for(tn = 0;tn < TN_MIN;tn++) 830 | { 831 | weight_buffer2[tk*TN_MINxTM_MIN + tm*TN_MIN + tn] = weight_buffer[tm*TN_MIN*KxK + tn*KxK + tk]; 832 | } 833 | 834 | 835 | 836 | memcpy((float *)(Weight_reorg+offset),weight_buffer2,TM_MIN*TN_MIN*KxK*sizeof(float)); 837 | offset += TM_MIN*TN_MIN*KxK; 838 | } 839 | } 840 | 841 | return 0; 842 | } 843 | 844 | /* 845 | TM * mLoops 是计算以后的通道数量 等于 l.n 846 | TN * nLoops 是计算以前的通道数量 等于 l.c 847 | TR * rLoops 是计算的行数量 等于 l.h 848 | TC * cLoops 是计算的列数量 等于 l.w 849 | */ 850 | 851 | 852 | void yolov2_hls_ps(network *net, float *input) 853 | { 854 | int x; 855 | 856 | network orig = *net; 857 | net->input = input; 858 | int weight_offset[128] = {864,18432,2048,18432, 859 | 73728,8192,73728, 860 | 8192,73728, 861 | 294912,32768,294912, 862 | 32768,294912, 863 | 32768,294912, 864 | 32768,294912, 865 | 32768,294912, 866 | 32768,294912, 867 | 32768,294912, 868 | 32768,294912, 869 | 1179648,131072,1179648, 870 | 131072,1179648, 871 | 131072,1179648, 872 | 131072,1179648, 873 | 131072,1179648, 874 | 131072,1179648, 875 | 131072,1179648, 876 | 131072,1179648, 877 | 4718592,524288,4718592, 878 | 524288,4718592, 879 | 524288,4718592, 880 | 524288,4718592, 881 | 524288,4718592,524288,4718592,524288,4718592,261120, 882 | 131072, 883 | 196608,1179648,131072,1179648,131072,1179648,130560, 884 | 32768, 885 | 49152,294912,32768,294912,32768,294912,65280, 886 | 0,0,0,0,0,0,0,0,0,0, 887 | 0,0,0,0,0,0,0,0,0,0, 888 | 0,0,0,0,0,0,0,0,0,0, 889 | 0,0,0,0,0,0,0,0,0,0, 890 | 0,0,0,0,0,0,0,0,0,0, 891 | 0,0,0}; 892 | int beta_offset[128] = {32,64,32,64, 893 | 128,64,128, 894 | 64,128, 895 | 256,128,256, 896 | 128,256, 897 | 128,256, 898 | 128,256, 899 | 128,256, 900 | 
128,256, 901 | 128,256, 902 | 128,256, 903 | 512,256,512, 904 | 256,512, 905 | 256,512, 906 | 256,512, 907 | 256,512, 908 | 256,512, 909 | 256,512, 910 | 256,512, 911 | 1024,512,1024, 912 | 512,1024, 913 | 512,1024, 914 | 512,1024, 915 | 512,1024,512,1024,512,1024,255, 916 | 256, 917 | 256,512,256,512,256,512,255, 918 | 128, 919 | 128,256,128,256,128,256,255, 920 | 0,0,0,0,0,0,0,0,0,0, 921 | 0,0,0,0,0,0,0,0,0,0, 922 | 0,0,0,0,0,0,0,0,0,0, 923 | 0,0,0,0,0,0,0,0,0,0, 924 | 0,0,0,0,0,0,0,0,0,0, 925 | 0,0,0}; 926 | 927 | int offset_index = 0; 928 | 929 | //float *Weight_buf = (float *)calloc(203767168/4,sizeof(float)); 930 | //float *Beta_buf = (float *)calloc(43044/4,sizeof(float)); 931 | float *Weight_buf = (float *)calloc(247583104/4,sizeof(float)); 932 | float *Beta_buf = (float *)calloc(108276/4,sizeof(float)); 933 | 934 | #ifdef REORG_TEST 935 | FILE *fp_w = fopen("weights_reorg.bin", "rb"); 936 | if(!fp_w) file_error("weights_reorg.bin"); 937 | #else 938 | FILE *fp_w = fopen("weights.bin", "rb"); 939 | if(!fp_w) file_error("weights.bin"); 940 | #endif 941 | 942 | #ifdef REORG_GEN 943 | //float *Weight_reorg_buf = (float *)calloc(203767168/4,sizeof(float)); 944 | float *Weight_reorg_buf = (float *)calloc(247583104/4,sizeof(float)); 945 | FILE *fp_w_reorg = fopen("weights_reorg.bin", "wb"); 946 | if(!fp_w_reorg) file_error("weights_reorg.bin"); 947 | #endif 948 | 949 | FILE *fp_b = fopen("bias.bin", "rb"); 950 | if(!fp_b) file_error("bias.bin"); 951 | 952 | //fread(Weight_buf, sizeof(float), 203767168/4, fp_w); 953 | //fread(Beta_buf, sizeof(float), 43044/4, fp_b); 954 | fread(Weight_buf, sizeof(float), 247583104/4, fp_w); 955 | fread(Beta_buf, sizeof(float), 108276/4, fp_b); 956 | 957 | fclose(fp_w); 958 | fclose(fp_b); 959 | 960 | 961 | //#define MEM_LEN (416*416*32+208*208*32) 962 | //#define MEM_LEN (608*608*32) 963 | /* 964 | float *Memory_buf = (float*)calloc(MEM_LEN+1024*2,sizeof(float));//leave some memories for overflow 965 | float *Memory_top = 
Memory_buf+1024; 966 | float *Memory_bottom = Memory_top + MEM_LEN; 967 | memcpy(Memory_top,input,416*416*3*sizeof(float));//416x416x3 input_pic 968 | */ 969 | 970 | #define MEM_LEN (HALFWID*HALFWID*64) 971 | 972 | float* Memory_top = (float*)calloc(MEM_LEN*6+BLOCK*6,sizeof(float));/*为什么加1024?*/ 973 | 974 | float* Memory_top1 = Memory_top+BLOCK; 975 | float* Memory_top2 = Memory_top1+MEM_LEN*2+BLOCK; 976 | float* Memory_top3 = Memory_top2+MEM_LEN+BLOCK; 977 | float* Memory_top4 = Memory_top3+MEM_LEN+BLOCK; 978 | float* Memory_top5 = Memory_top4+MEM_LEN+BLOCK; 979 | float* Memory_bot = Memory_top5+MEM_LEN+BLOCK; 980 | 981 | memcpy(Memory_top2,input,HALFWID*HALFWID*4*3*sizeof(float)); 982 | 983 | float* in_ptr[107]; 984 | float* out_ptr[107]; 985 | 986 | #define ROUTE85_LEN (ATOMWID*ATOMWID*1024) 987 | #define ROUTE97_LEN (ATOMWID*ATOMWID*2048) 988 | 989 | //float* yolo_buf = (float *)calloc(HALFWID*HALFWID*64,sizeof(float)); 990 | 991 | 992 | /* 993 | 重写了内存管理部分,只需要五个小的buff就可以做完 994 | 但是不知道为什么,buf1一直不对,所以用了六个 995 | 修改后的代码不再需要route层,相对于之前的更快一些 996 | 997 | 似乎懂了为什么有问题,因为36层依赖于33,而33存在buf1,所以需要换个位置 998 | 给33层换个位置以后就好了。确实只需要五个小buf 999 | 1000 | */ 1001 | int i = 0; 1002 | 1003 | in_ptr[0] = Memory_top2; 1004 | out_ptr[0] = Memory_top1; 1005 | in_ptr[1] = out_ptr[0]; 1006 | out_ptr[1] = Memory_top2; 1007 | in_ptr[2] = Memory_top2; 1008 | out_ptr[2] = Memory_top1; 1009 | 1010 | for(i=3;i<6;i++){ 1011 | if(i%2==0){ 1012 | in_ptr[i] = Memory_top3; 1013 | out_ptr[i] = Memory_top1; 1014 | } else { 1015 | 1016 | in_ptr[i] = out_ptr[i-1]; 1017 | out_ptr[i] = Memory_top3; 1018 | } 1019 | } 1020 | 1021 | for(i=6;i<10;i++){ 1022 | if(i%2==0){ 1023 | in_ptr[i] = out_ptr[i-1]; 1024 | out_ptr[i] = Memory_top2; 1025 | } else { 1026 | in_ptr[i] = Memory_top2; 1027 | out_ptr[i] = Memory_top1; 1028 | } 1029 | } 1030 | 1031 | for(i = 10;i < 14;i++){ 1032 | if(i%2==0){ 1033 | in_ptr[i] = Memory_top1; 1034 | out_ptr[i] = Memory_top3; 1035 | } else { 1036 | in_ptr[i] = out_ptr[i-1]; 1037 | 
out_ptr[i] = Memory_top1; 1038 | } 1039 | } 1040 | 1041 | for(i = 14;i< 17;i++){ 1042 | if(i%2==0){ 1043 | in_ptr[i] = Memory_top1; 1044 | out_ptr[i] = Memory_top2; 1045 | } else { 1046 | in_ptr[i] = out_ptr[i-1]; 1047 | out_ptr[i] = Memory_top1; 1048 | } 1049 | } 1050 | for(i=17;i<20;i++){ 1051 | if(i%2==0){ 1052 | in_ptr[i] = Memory_top3; 1053 | out_ptr[i] = Memory_top2; 1054 | } else { 1055 | in_ptr[i] = out_ptr[i-1]; 1056 | out_ptr[i] = Memory_top3; 1057 | } 1058 | } 1059 | for(i=20;i<23;i++){ 1060 | in_ptr[i] = out_ptr[i-1]; 1061 | if(i%2==0) 1062 | out_ptr[i] = Memory_top1; 1063 | else 1064 | out_ptr[i] = Memory_top3; 1065 | } 1066 | for(i=23;i<26;i++){ 1067 | in_ptr[i] = out_ptr[i-1]; 1068 | if(i%2==0) 1069 | out_ptr[i] = Memory_top1; 1070 | else 1071 | out_ptr[i] = Memory_top2; 1072 | } 1073 | for(i=26;i<29;i++){ 1074 | in_ptr[i] = out_ptr[i-1]; 1075 | if(i%2==0) 1076 | out_ptr[i] = Memory_top3; 1077 | else 1078 | out_ptr[i] = Memory_top2; 1079 | } 1080 | for(i=29;i<32;i++){ 1081 | in_ptr[i] = out_ptr[i-1]; 1082 | if(i%2==0) 1083 | out_ptr[i] = Memory_top3; 1084 | else 1085 | out_ptr[i] = Memory_top1; 1086 | } 1087 | for(i=32;i<35;i++){ 1088 | in_ptr[i] = out_ptr[i-1]; 1089 | if(i%2==0) 1090 | out_ptr[i] = Memory_top2; 1091 | else{ 1092 | out_ptr[i] = Memory_top4; 1093 | } 1094 | } 1095 | 1096 | in_ptr[35] = Memory_top2; 1097 | out_ptr[35] = Memory_top3; 1098 | 1099 | in_ptr[36] = Memory_top3; 1100 | out_ptr[36] = Memory_top1+ROUTE97_LEN; 1101 | /*0~36 现在都没有问题了*/ 1102 | in_ptr[37] = out_ptr[36]; 1103 | out_ptr[37] = Memory_top2; 1104 | 1105 | in_ptr[38] = Memory_top2; 1106 | out_ptr[38] = Memory_top3; 1107 | 1108 | for(i=39;i<42;i++){ 1109 | in_ptr[i] = out_ptr[i-1]; 1110 | if(i%2==0) 1111 | out_ptr[i] = Memory_top3; 1112 | else 1113 | out_ptr[i] = Memory_top4; 1114 | } 1115 | 1116 | for(i=42;i<45;i++){ 1117 | in_ptr[i] = out_ptr[i-1]; 1118 | if(i%2==0) 1119 | out_ptr[i] = Memory_top2; 1120 | else 1121 | out_ptr[i] = Memory_top4; 1122 | } 1123 | 1124 | 
for(i=45;i<48;i++){ 1125 | in_ptr[i] = out_ptr[i-1]; 1126 | if(i%2==0) 1127 | out_ptr[i] = Memory_top2; 1128 | else 1129 | out_ptr[i] = Memory_top3; 1130 | } 1131 | 1132 | for(i=48;i<51;i++){ 1133 | in_ptr[i] = out_ptr[i-1]; 1134 | if(i%2==0) 1135 | out_ptr[i] = Memory_top4; 1136 | else 1137 | out_ptr[i] = Memory_top3; 1138 | } 1139 | for(i=51;i<54;i++){ 1140 | in_ptr[i] = out_ptr[i-1]; 1141 | if(i%2==0) 1142 | out_ptr[i] = Memory_top4; 1143 | else 1144 | out_ptr[i] = Memory_top2; 1145 | } 1146 | for(i=54;i<57;i++){ 1147 | in_ptr[i] = out_ptr[i-1]; 1148 | if(i%2==0) 1149 | out_ptr[i] = Memory_top3; 1150 | else 1151 | out_ptr[i] = Memory_top2; 1152 | } 1153 | for(i=57;i<60;i++){ 1154 | in_ptr[i] = out_ptr[i-1]; 1155 | if(i%2==0) 1156 | out_ptr[i] = Memory_top3; 1157 | else 1158 | out_ptr[i] = Memory_top4; 1159 | } 1160 | 1161 | in_ptr[60] = Memory_top4; 1162 | out_ptr[60] = Memory_top5; 1163 | in_ptr[61] = Memory_top5; 1164 | out_ptr[61] = Memory_top2+ROUTE85_LEN; 1165 | in_ptr[62] = Memory_top2+ROUTE85_LEN; 1166 | out_ptr[62] = Memory_top3; 1167 | in_ptr[63] = Memory_top3; 1168 | out_ptr[63] = Memory_top4; 1169 | 1170 | for(i=64;i<67;i++){ 1171 | in_ptr[i] = out_ptr[i-1]; 1172 | if(i%2==0) 1173 | out_ptr[i] = Memory_top5; 1174 | else 1175 | out_ptr[i] = Memory_top4; 1176 | } 1177 | for(i=67;i<70;i++){ 1178 | in_ptr[i] = out_ptr[i-1]; 1179 | if(i%2==0) 1180 | out_ptr[i] = Memory_top5; 1181 | else 1182 | out_ptr[i] = Memory_top3; 1183 | } 1184 | 1185 | for(i=70;i<73;i++){ 1186 | in_ptr[i] = out_ptr[i-1]; 1187 | if(i%2==0) 1188 | out_ptr[i] = Memory_top4; 1189 | else 1190 | out_ptr[i] = Memory_top3; 1191 | } 1192 | for(i=73;i<81;i++){ 1193 | in_ptr[i] = out_ptr[i-1]; 1194 | if(i%2==0) 1195 | out_ptr[i] = Memory_top4; 1196 | else 1197 | out_ptr[i] = Memory_top5; 1198 | } 1199 | 1200 | for(i=81;i<83;i++){ 1201 | in_ptr[i] = out_ptr[i-1]; 1202 | if(i%2==0) 1203 | out_ptr[i] = Memory_top4; 1204 | else 1205 | out_ptr[i] = Memory_top3; 1206 | } 1207 | in_ptr[83] = 
out_ptr[79]; 1208 | out_ptr[83] = out_ptr[79]; 1209 | in_ptr[84] = out_ptr[83]; 1210 | out_ptr[84] = Memory_top4; 1211 | 1212 | in_ptr[85] = Memory_top4; 1213 | out_ptr[85] = Memory_top2; 1214 | in_ptr[86] = Memory_top2; 1215 | out_ptr[86] = Memory_top2; 1216 | 1217 | for(i = 87;i<93;i++){ 1218 | in_ptr[i] = out_ptr[i-1]; 1219 | if(i%2==0) 1220 | out_ptr[i] = Memory_top2; 1221 | else 1222 | out_ptr[i] = Memory_top3; 1223 | } 1224 | 1225 | in_ptr[93] = Memory_top2; 1226 | out_ptr[93] = Memory_top4; 1227 | in_ptr[94] = Memory_top4; 1228 | out_ptr[94] = Memory_top2; 1229 | in_ptr[95] = Memory_top3; 1230 | out_ptr[95] = Memory_top3; 1231 | in_ptr[96] = Memory_top3; 1232 | out_ptr[96] = Memory_top2; 1233 | in_ptr[97] = Memory_top2; 1234 | out_ptr[97] = Memory_top1; 1235 | in_ptr[98] = Memory_top1; 1236 | out_ptr[98] = Memory_top1; 1237 | 1238 | for(i = 99;i<107;i++){ 1239 | in_ptr[i] = out_ptr[i-1]; 1240 | if(i%2==0) 1241 | out_ptr[i] = Memory_top1; 1242 | else 1243 | out_ptr[i] = Memory_top2; 1244 | } 1245 | 1246 | network netp = *net; 1247 | //int i; 1248 | int woffset = 0; 1249 | int aoffset = 0; 1250 | int boffset = 0; 1251 | int TR,TC,TM,TN; 1252 | int output_w,output_h; 1253 | int rLoops,cLoops,mLoops,nLoops; 1254 | double sum_gop = 0.0; 1255 | 1256 | int T2Rate; 1257 | int TRow; 1258 | int trow_loops; 1259 | 1260 | for(i = 0; i < netp.n; ++i) 1261 | { 1262 | netp.index = i; 1263 | layer l = netp.layers[i]; 1264 | printf("Layer[%2d]: ",i); 1265 | switch(l.type) 1266 | { 1267 | case CONVOLUTIONAL:{ 1268 | printf("outputMemory:%8d;BN=%d;Activation=%d;conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n",l.outputs,l.batch_normalize,l.activation, l.n, l.size, l.size, l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.); 1269 | sum_gop += (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.; 1270 | output_w = (l.w - l.size + 2*l.pad)/l.stride + 1 ; 1271 | output_h = 
(l.h - l.size + 2*l.pad)/l.stride + 1 ; 1272 | 1273 | TR = MIN(((OnChipIB_Height-l.size)/l.stride+1),Tr);//keep Kernel_stride>=1 1274 | TR = MIN(output_h,TR); 1275 | TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc); 1276 | TC = MIN(output_w,TC); 1277 | TM = MIN(l.n,Tm); 1278 | TN = MIN(l.c,Tn); 1279 | 1280 | rLoops = (int)ceil(((float)output_h)/TR); 1281 | cLoops = (int)ceil(((float)output_w)/TC); 1282 | mLoops = (int)ceil(((float)l.n)/TM); 1283 | nLoops = (int)ceil(((float)l.c)/TN); 1284 | 1285 | switch(l.w) 1286 | { 1287 | case 26: 1288 | T2Rate = 2; 1289 | break; 1290 | case 13: 1291 | T2Rate = 4; 1292 | break; 1293 | default: 1294 | T2Rate = 1; 1295 | break; 1296 | } 1297 | TRow = (TR-1)*l.stride+l.size; 1298 | trow_loops = (int)ceil(((float)TRow/T2Rate)); 1299 | 1300 | //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops); 1301 | printf("TRow = %d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h); 1302 | 1303 | YOLO2_FPGA(in_ptr[i],NULL,out_ptr[i],Weight_buf+woffset,Beta_buf+boffset, 1304 | l.c,l.n,l.size, 1305 | l.stride,l.w,l.h,l.pad,l.activation==LEAKY?1:0,l.batch_normalize?1:0, 1306 | TM,TN,TR,TC, 1307 | mLoops,nLoops,rLoops,cLoops,0); 1308 | #ifdef REORG_GEN 1309 | Weight_reorgnaization_anti(Weight_buf + woffset,Weight_reorg_buf + woffset,NULL,l.c,l.n,l.size,TM,TN,0); 1310 | #endif 1311 | 1312 | woffset += weight_offset[offset_index]; 1313 | boffset += beta_offset[offset_index]; 1314 | offset_index++; 1315 | 1316 | break; 1317 | } 1318 | /* 1319 | case MAXPOOL: 1320 | printf("outputMemory:%8d;max %d x %d / %d %4d x%4d x%4d -> %4d x%4d x%4d\n",l.outputs, l.size, l.size, l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c); 1321 | //output_w = (l.w - l.size)/l.stride + 1 ; 1322 | //output_h = (l.h - l.size)/l.stride + 1 ; 1323 | output_w = 
l.out_h; 1324 | output_h = l.out_w; 1325 | 1326 | TR = MIN(((OnChipIB_Height-l.size)/l.stride+1),Tr);//keep Kernel_stride>=1 1327 | TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc); 1328 | 1329 | TR = MIN(output_h,TR); 1330 | TC = MIN(output_w,TC); 1331 | TM = MIN(Tm,Tn); 1332 | TM = MIN(l.c,TM); 1333 | 1334 | rLoops = (int)ceil(((float)output_h)/TR); 1335 | cLoops = (int)ceil(((float)output_w)/TC); 1336 | mLoops = (int)ceil(((float)l.c)/TM); 1337 | 1338 | YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c, 1339 | l.size,l.stride,l.w,l.h,l.pad,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,1); 1340 | 1341 | break; 1342 | case REORG: 1343 | printf("outputMemory:%8d;reorg /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n",l.outputs, l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c); 1344 | output_w = 26; 1345 | output_h = 32*13; 1346 | 1347 | TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1 1348 | TR = MIN(output_h,TR); 1349 | TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc); 1350 | TC = MIN(output_w,TC); 1351 | TM = 4; 1352 | 1353 | rLoops = (int)ceil(((float)output_h)/TR); 1354 | cLoops = (int)ceil(((float)output_w)/TC); 1355 | mLoops = 1; 1356 | 1357 | YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,1,4, 1358 | l.stride,l.stride,52,32*26,0,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,2); 1359 | 1360 | break; 1361 | case ROUTE: 1362 | printf("outputMemory:%8d;route ",l.outputs); 1363 | int j; 1364 | for(j = 0; j < l.n; ++j){ 1365 | printf(" %d", l.input_layers[j]); 1366 | } 1367 | printf("\n"); 1368 | break; 1369 | case REGION: 1370 | printf("outputMemory:%8d;Detection\n",l.outputs); 1371 | forward_region_layer(l, in_ptr[i]); 1372 | break; 1373 | } 1374 | */ 1375 | case ROUTE:{ 1376 | printf("outputMemory:%8d;route ",l.outputs); 1377 | int j; 1378 | for(j = 0; j < l.n; ++j){ 1379 | printf(" %d", l.input_layers[j]); 1380 | } 1381 | printf("\n"); 1382 | //forward_route_layer(l,netp); 1383 | break; 1384 | } 1385 | case SHORTCUT:{ 1386 | //as same as reorg 1387 | 
printf("res %3d %4d x%4d x%4d -> %4d x%4d x%4d\n",l.index, netp.layers[i-1].w,netp.layers[i-1].h,netp.layers[i-1].n, l.w,l.h,l.c); 1388 | /* 1389 | output_w = l.out_h; 1390 | output_h = l.out_w; 1391 | 1392 | TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1 1393 | TR = MIN(output_h,TR); 1394 | TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc); 1395 | TC = MIN(output_w,TC); 1396 | TM = 4; 1397 | TN = TM; 1398 | 1399 | rLoops = (int)ceil(((float)output_h)/TR); 1400 | cLoops = (int)ceil(((float)output_w)/TC); 1401 | mLoops = 1; 1402 | */ 1403 | //TM=TN; 1404 | //mLoops=nLoops; 1405 | //TN=TM; 1406 | //nLoops=mLoops; 1407 | output_w = l.out_h; 1408 | output_h = l.out_w; 1409 | 1410 | //TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1 1411 | TR = MIN(output_h,Tr); 1412 | //TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc); 1413 | TC = MIN(output_w,Tc); 1414 | //TM = 4; 1415 | //TN = TM; 1416 | //TM = MIN(l.n,Tm); 1417 | TN = MIN(l.c,Tn); 1418 | TM = TN; 1419 | 1420 | rLoops = (int)ceil(((float)output_h)/TR); 1421 | cLoops = (int)ceil(((float)output_w)/TC); 1422 | //mLoops = (int)ceil(((float)l.n)/TM); 1423 | nLoops = (int)ceil(((float)l.c)/TN); 1424 | mLoops = nLoops; 1425 | //mLoops = 1; 1426 | 1427 | switch(l.w) 1428 | { 1429 | case 26: 1430 | T2Rate = 2; 1431 | break; 1432 | case 13: 1433 | T2Rate = 4; 1434 | break; 1435 | default: 1436 | T2Rate = 1; 1437 | break; 1438 | } 1439 | TRow = TR; 1440 | trow_loops = (int)ceil(((float)TRow/T2Rate)); 1441 | 1442 | //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops); 1443 | printf("TRow = %d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h); 1444 | /* 1445 | switch(52) 1446 | { 1447 | case 26: 1448 | T2Rate = 2; 1449 | break; 1450 | case 13: 1451 
| T2Rate = 4; 1452 | break; 1453 | default: 1454 | T2Rate = 1; 1455 | break; 1456 | } 1457 | TRow = (TR-1)*l.stride+l.stride; 1458 | trow_loops = (int)ceil(((float)TRow/T2Rate)); 1459 | */ 1460 | /* 1461 | YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c, 1462 | l.size,l.stride,l.w,l.h,l.pad,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,1); 1463 | */ 1464 | YOLO2_FPGA(out_ptr[i-1],out_ptr[l.index],out_ptr[i],NULL,NULL,l.c,l.c, 1465 | 1,1,l.w,l.h,l.pad,0,0,TM,TN,TR,TC,mLoops,nLoops,rLoops,cLoops,1); 1466 | //inputQ[offset_index],inputQ[offset_index],INTERWIDTH,INTERWIDTH,trow_loops); 1467 | break; 1468 | } 1469 | case UPSAMPLE:{ 1470 | //as same as pool 1471 | printf("upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c); 1472 | output_w = l.out_w; 1473 | output_h = l.out_h; 1474 | 1475 | //TR TC 到底怎么选择?是按照最小的还是只按输出? 1476 | //是按照输出来的,因为loop除以的是输出宽度,但是事实只需要一遍 1477 | //所以是按照输入计算 1478 | //但是好像都可以,看后期实现哪个方便吧 1479 | //TR = MIN(output_h,Tr); 1480 | TR = 13; 1481 | //TR = MIN(TR,l.h); 1482 | //TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc); 1483 | //TC = MIN(output_w,Tc); 1484 | TC = 13; 1485 | //TC = MIN(TC,l.w); 1486 | //TM = 4; 1487 | //TN = TM; 1488 | //TM = MIN(l.n,Tm); 1489 | TN = MIN(l.c,Tn); 1490 | TM = TN; 1491 | 1492 | rLoops = (int)ceil(((float)l.h)/TR); 1493 | cLoops = (int)ceil(((float)l.w)/TC); 1494 | //mLoops = (int)ceil(((float)l.n)/TM); 1495 | nLoops = (int)ceil(((float)l.c)/TN); 1496 | mLoops = nLoops; 1497 | 1498 | switch(l.w) 1499 | { 1500 | case 13: 1501 | T2Rate = 1; 1502 | break; 1503 | default: 1504 | T2Rate = 1; 1505 | break; 1506 | } 1507 | TRow = TR; 1508 | trow_loops = (int)ceil(((float)TRow/T2Rate)); 1509 | 1510 | //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops); 1511 | printf("l.w = %d,l.stride = %d,TRow = 
%d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",l.w,l.stride,TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h); 1512 | /* 1513 | switch(52) 1514 | { 1515 | case 26: 1516 | T2Rate = 2; 1517 | break; 1518 | case 13: 1519 | T2Rate = 4; 1520 | break; 1521 | default: 1522 | T2Rate = 1; 1523 | break; 1524 | } 1525 | TRow = (TR-1)*l.stride+l.stride; 1526 | trow_loops = (int)ceil(((float)TRow/T2Rate)); 1527 | */ 1528 | // YOLO2_FPGA(in_ptr[i],in_ptr[i],in_ptr[i],in_ptr[i],out_ptr[i],out_ptr[i],NULL,NULL,1,4, 1529 | // l.stride,l.stride,52,32*26,output_w,output_h, 1530 | // 0,0,0,TM,TN,TR,TC,mLoops,1,rLoops,cLoops,2); 1531 | //(float *Input0,float *Input1,float *Output,float *Weight,float *Beta,const int InFM_num,const int OutFM_num, 1532 | // const int Kernel_size,const int Kernel_stride, 1533 | // const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN, 1534 | // const int TM,const int TN,const int TR,const int TC, 1535 | // const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType) 1536 | YOLO2_FPGA(in_ptr[i],in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c, 1537 | 1,l.stride, 1538 | l.w,l.h,l.pad,0,0, 1539 | TM,TN,TR,TC, 1540 | mLoops,nLoops,rLoops,cLoops,2); 1541 | //inputQ[offset_index],inputQ[offset_index],INTERWIDTH,INTERWIDTH,trow_loops); 1542 | 1543 | break; 1544 | } 1545 | case YOLO:{ 1546 | //YOLO as same as REGION 1547 | printf("outputMemory:%8d;yolo ",l.outputs); 1548 | /* 1549 | double OutputPara = pow(2.0,-inputQ[offset_index]); 1550 | bool NextPixelFlag = true; 1551 | int OutputPixelOffset = 0; 1552 | short current_p,next_p,output_p; 1553 | int *Output_ptr = (int *)(in_ptr[i]); 1554 | for(j=0;j> 16; 1562 | output_p = current_p; 1563 | NextPixelFlag = false; 1564 | }else 1565 | { 1566 | output_p = next_p; 1567 | NextPixelFlag = true; 1568 | } 1569 | yolo_buf[j] = output_p*OutputPara; 1570 
| }*/ 1571 | netp.layers[i].output = forward_yolo_array(l,out_ptr[i-1]); 1572 | break; 1573 | } 1574 | 1575 | } 1576 | 1577 | netp.input = l.output; 1578 | /* 1579 | for( x = 0; x < 500; x++) 1580 | { 1581 | //sprintf(line, "%f\n", out_ptr[i][x]); 1582 | printf("%f,",out_ptr[i][x]); 1583 | if((x+1)%10==0){ 1584 | printf("\n"); 1585 | } 1586 | //if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n"); 1587 | } 1588 | */ 1589 | /* 1590 | //mycode 1591 | //if(i==84||i==85){ 1592 | char line[256]; 1593 | FILE *fp3; 1594 | char filename[256]; 1595 | sprintf(filename, "fpga_net_layer_%d.txt", i); 1596 | if( (fp3 = fopen(filename, "w")) == NULL)fprintf(stderr,"CANNOT OPEN\n"); 1597 | for( x = 0; x < l.outputs; x++) 1598 | { 1599 | sprintf(line, "%f\n", out_ptr[i][x]); 1600 | if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n"); 1601 | } 1602 | fclose(fp3); 1603 | printf("layer[%d]:Write END!\n\n",i); 1604 | */ 1605 | //} 1606 | 1607 | } 1608 | printf("SUM_GOP=%g\n",sum_gop); 1609 | *net = orig; 1610 | 1611 | #ifdef REORG_GEN 1612 | //fwrite(Weight_reorg_buf, sizeof(float), 203767168/4, fp_w_reorg); 1613 | fwrite(Weight_reorg_buf, sizeof(float), 247583104/4, fp_w_reorg); 1614 | fclose(fp_w_reorg); 1615 | free(Weight_reorg_buf); 1616 | #endif 1617 | free(Memory_top); 1618 | free(Weight_buf); 1619 | free(Beta_buf); 1620 | 1621 | } 1622 | ///////////////////////////////////////////////////////////////////////20181229 anti-reorg ok end n4m32 1623 | --------------------------------------------------------------------------------