├── soft_version
    ├── README.MD
    ├── Step03
    │   ├── README.MD
    │   ├── main3.c
    │   └── yolov3.cfg
    └── Step02
    │   ├── README.MD
    │   ├── main3.cpp
    │   ├── yolov3.cfg
    │   └── yolov3_acc_sim.h
├── YOLOV3实验细节.pdf
├── yolov2实验复现报告.pdf
├── yolov3_hls
    ├── README.md
    ├── simu1.png
    ├── sourceFile
    │   ├── README.md
    │   ├── cnn.h
    │   └── yolov3.cpp
    ├── testBench
    │   ├── README.md
    │   ├── dog.jpg
    │   ├── coco.names
    │   ├── main.cpp
    │   └── yolov3.cfg
    └── files needed.png
├── petalinux
    ├── README.MD
    └── command_petalinux
├── yolov3_elf
    ├── README.MD
    ├── main.cc
    └── xconv_hw.h
└── README.md


/soft_version/README.MD:
--------------------------------------------------------------------------------
1 | 这些文件对应陈辰大佬代码里相应步骤的修改：第一步（分离）保持不变，第二步与第三步都进行了修改。
2 | 


--------------------------------------------------------------------------------
/YOLOV3实验细节.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/YOLOV3实验细节.pdf


--------------------------------------------------------------------------------
/soft_version/Step03/README.MD:
--------------------------------------------------------------------------------
1 | 这里是第三步的改动后的代码，基本上改动不大
2 | 
3 | 
4 | 缺的文件与STEP02相似，用法也和陈辰大佬的代码相似。
5 | 


--------------------------------------------------------------------------------
/yolov2实验复现报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov2实验复现报告.pdf


--------------------------------------------------------------------------------
/yolov3_hls/README.md:
--------------------------------------------------------------------------------
1 | There are two folders: one contains the source files and the other contains the test bench.
2 | 


--------------------------------------------------------------------------------
/yolov3_hls/simu1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/simu1.png


--------------------------------------------------------------------------------
/yolov3_hls/sourceFile/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the source files used to generate the IP core.
2 | 


--------------------------------------------------------------------------------
/yolov3_hls/testBench/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the test bench for that IP core; it runs correctly on my computer.
2 | 


--------------------------------------------------------------------------------
/yolov3_hls/files needed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/files needed.png


--------------------------------------------------------------------------------
/petalinux/README.MD:
--------------------------------------------------------------------------------
1 | petalinux的设置与陈辰大佬提供的有些不同，我使用的全部是默认设置，也就是说存储的东西会掉电消失。因此我使用了nfs系统来弥补因设置问题造成的缺陷。这里是petalinux的命令过程。
2 | 


--------------------------------------------------------------------------------
/yolov3_hls/testBench/dog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbdxwyh/yolov3_fpga_project/HEAD/yolov3_hls/testBench/dog.jpg


--------------------------------------------------------------------------------
/soft_version/Step02/README.MD:
--------------------------------------------------------------------------------
1 | 这里对应着第二步的具体代码
2 | 
3 | 这里还缺少coco.names,dog.jpg,labels文件夹以及第一步生成好的权重和偏置。
4 | 陈辰大佬对文件的命名情况做过改动，所以对权重的名称尽可能仔细核对，
5 | 具体用法与陈辰大佬的代码相似，通过注释掉 main3.cpp 中的 `#define QUANTI` 来控制生成 reorg 还是 quanti
6 | 


--------------------------------------------------------------------------------
/yolov3_elf/README.MD:
--------------------------------------------------------------------------------
1 | This code is compiled into the .elf file that runs on the FPGA.
2 | 这块我没有使用SDK，而是使用命令直接编译的，在使vivado的环境变量生效以后，`arm-linux-gnueabihf-g++ -static -O3  * -o myfile.elf`,运行此命令会生成elf文件
3 | 


--------------------------------------------------------------------------------
/petalinux/command_petalinux:
--------------------------------------------------------------------------------
 1 | source /opt/pkg/petalinux/settings.sh 
 2 | source /opt/Xilinx/Vivado/2017.4/settings64.sh
 3 | petalinux-create --type project --template zynq --name hdyolo
 4 | cd hdyolo
 5 | cp -r ../project_2.sdk ./
 6 | rm -rf ./project-spec/meta-user/recipes-bsp/device-tree/files/system-user.dtsi
 7 | cp ../system-user.dtsi ./project-spec/meta-user/recipes-bsp/device-tree/files/
 8 | petalinux-config --get-hw-description ./project_2.sdk/
 9 | petalinux-config -c kernel
10 | petalinux-config -c rootfs
11 | petalinux-build
12 | petalinux-package --boot --fsbl ./images/linux/zynq_fsbl.elf --fpga --u-boot --force
13 | 
14 | 


--------------------------------------------------------------------------------
/yolov3_hls/sourceFile/cnn.h:
--------------------------------------------------------------------------------
 1 | #ifndef CNN_H
 2 | 
 3 | #define CNN_H
 4 | 
 5 | 
 6 | void YOLO2_FPGA(int *Input,int *Input1,int *Input2,int *Input3,int *Output,int *Output1,int *Weight,int *Beta,const int InFM_num,const int OutFM_num,
 7 | 							  const int Kernel_size,const int Kernel_stride,
 8 | 							  const int Input_w,const int Input_h,const int output_w,const int output_h,const int Padding,const bool IsNL,const bool IsBN,
 9 | 							  const int TM,const int TN,const int TR,const int TC,
10 | 							  const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType,
11 | 							  const int InputQ,const int OutputQ,const int WeightQ,const int BetaQ,int trow_loops);
12 | 
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/yolov3_hls/testBench/coco.names:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorbike
 5 | aeroplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # yolov3_hls
 2 | 这一块的代码是在把上交的陈辰大佬的yolov2代码改成yolov3的时候的一些改动，主要步骤还是参考陈辰大佬的代码。
 3 | https://github.com/dhm2013724/yolov2_xilinx_fpga
 4 | 但是有一些和他不一样的细节。
 5 | 
 6 | # Tips：
 7 | 我目前已经不做这个方向了，有些细节记得不是很清楚了，只能把当时的复现实验报告和做v3时候的细节报告传上来（请忽略掉一些吐槽），请分别查看v2和v3的pdf报告。
 8 | 
 9 | # 在权重处理部分
10 | 在权重处理部分我写了好几个版本的，由于过去了一段时间，我也忘记了当初用的哪一个版本了。
11 | 但是主要的改动在于内存控制的部分，在第2步的第一小步骤：产生重组织的权重的时候，进行了修改。
12 | 在第二步的第二个小步骤，产生量化的文件只进行了偏移量的修改，此外为了和我的实验匹配还加了一些垃圾代码，你可以不看（比如固定位数的量化）
13 | 在第三步产生输入输出的特征图量化的步骤同样进行了上述步骤的修改。[但是事后才发现有很大的问题，这一块的补偿体现在了elf代码里，在对卷积层判定的时候对于层号的选择那块]
14 | 
15 | # IP核部分
16 | IP核的改动不算特别大，移除了YOLOv2的maxpool部分，增加了shortcut（add方式的）和upsample层。
17 | 同时对于upsample层的块大小做了固定[尝试增加块大小到26（最小的二倍），但是发现性能低到爆炸]
18 | 由于记性不好，我忘记了生成ipcore使用的是yolov3.cpp还是cnn.cpp了
19 | 
20 | # 最终的elf部分
21 | 由于以上的种种问题，在这块的代码上缝缝补补改了很多。
22 | 首先是由于shortcut选择了对齐的方式，所以会因为不同层之间的量化位数不同产生溢出，因此需要把输出较大的层和输出较小的层做同步。[比如A层有输出13.6，A+3层有输出0.01,获得A层的位数是10，A+3是15,这个时候shortcut如果向A+3层的位数同步，那么就会造成13.6*pow(2,15)产生超过16位的结果]
23 | 其次由于yolov3有比较复杂的FPN机制，所以需要保存很多层的信息，因此我的缓存区利用的并不好[虽然后面重写了一个版本的，但是在fpga上好像并不行我也不知道是什么原因，业已证明只需要5个小缓冲区就能做完yolov3，我这里相当于用了十个]
24 | 同样的，由于缓冲区设计以及IPcore的设计，在并不需要做route层的情况下就可以实现[使用地址直接拼接，反正route也是拼接]
25 | [实际上在fpga做upsample并不快，只是少了因为拷贝内存的时间，所以相对于总时间来说这一块也放fpga上了]
26 | 
27 | # 最终的效果
28 | 其实并不怎么好，因为陈辰大佬使用了im2col的方式进行计算，而yolov3过深的网络结构给arm核的拷贝带来了大量的负担，为了拷贝数据造成的时延实际上相当大。
29 | 但是最终的能耗确实很低。
30 | 


--------------------------------------------------------------------------------
/yolov3_hls/testBench/main.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <iostream>
 5 | #include <math.h>
 6 | #include <fcntl.h>
 7 | #include <string.h>
 8 | #include <time.h>
 9 | #include "yolov3.h"
10 | 
11 | int main( int argc, char *argv[])
12 | {
13 | 	//freopen("result.txt","w",stdout);
14 | 
15 | 	printf("YOLOv3 TEST Begin\n");
16 |     char **names = get_labels("coco.names");
17 | 	int x;
18 | 	for(x=0;x<80;x++)//80 classe labels
19 | 	{
20 | 		printf("[%d]%s\n",x,names[x]);
21 | 	}
22 |     image **alphabet = load_alphabet();
23 | 
24 |     network *net = load_network("yolov3.cfg", "yolov2.weights", 0);
25 | 	set_batch_network(net, 1);
26 | 
27 | ////////////////////load img resize img begin
28 | 	char buff[256];
29 |     char *input_imgfn = buff;
30 | 	strncpy(input_imgfn, "dog.jpg", 256);
31 | 	printf("Input img:%s\n",input_imgfn);
32 | 	image im = load_image_stb(input_imgfn, 3);//3 channel img
33 | 	printf("img w=%d,h=%d,c=%d\n",im.w,im.h,im.c);
34 | 	image sized = letterbox_image(im, 416, 416);
35 | 	save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3
36 | ////////////////////load img resize img end
37 | 
38 | 	time_t first, second;
39 | 
40 | 	layer l = net->layers[net->n-1];
41 |     float *X = sized.data;
42 | 
43 | 	first=time(NULL);
44 | 	yolov2_hls_ps(net, X);
45 | 	second=time(NULL);
46 | 	printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first));
47 | 
48 |     int nboxes = 0;
49 |     float nms=.45;
50 | 	float thresh = .5;
51 | 	float hier_thresh = .5;
52 |     detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
53 |     printf("%d\n", nboxes);
54 | 
55 |     if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
56 |     draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
57 | 
58 |     free_detections(dets, nboxes);
59 | 	
60 | ///////////////////write predictions img
61 | 	save_image_png(im, "predictions");// output
62 | 
63 | 	free_image(im);
64 |     free_image(sized);
65 | 
66 | 	printf("YOLOv3 TEST End\n");
67 | 
68 |     return 0;
69 | }
70 | 


--------------------------------------------------------------------------------
/yolov3_elf/main.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Empty C++ Application
 3 |  */
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <iostream>
 7 | #include <math.h>
 8 | #include <fcntl.h>
 9 | #include <string.h>
10 | #include <time.h>
11 | #include "yolo3.h"
12 | 
13 | 
14 | #include <chrono>
15 | 
16 | extern inline long long getTimeStampMill();
17 | 
18 | 
19 | int main( int argc, char *argv[])
20 | {
21 | 	unsigned int WEIGHT_BASE = 0x10000000;
22 |     unsigned int BETA_BASE = 0x1EC38000;
23 |     unsigned int MEM_BASE  = 0x1EC53000;
24 | 
25 | 	printf("YOLOv3 TEST Begin\n");
26 | 	printf("timestamp is %ld\n",getTimeStampMill());
27 |     char **names = get_labels((char*)"coco.names");
28 | 	int x;
29 |     image **alphabet = load_alphabet();
30 |     network *net = load_network((char*)"yolov3.cfg", (char*)"yolov3.weights", 0);
31 |     set_batch_network(net, 1);
32 | 
33 | 	char buff[256];
34 |     char *input_imgfn = buff;
35 |     if(argc==1)
36 |     {
37 |     	strncpy(input_imgfn, (char*)"dog.jpg", 256);
38 |     }
39 |     else
40 |     {
41 |     	strncpy(input_imgfn, argv[1], 256);
42 |     }
43 | 	image im = load_image_stb(input_imgfn, 3);
44 | 	image sized = letterbox_image(im, 416,416);
45 | 	save_image_png(sized, "sized");
46 | 	double time;
47 | 	layer l = net->layers[net->n-1];
48 |     float *X = sized.data;
49 |     time = what_time_is_it_now();
50 | 	net = yolov2_hls_ps(net, X,WEIGHT_BASE,BETA_BASE,MEM_BASE);
51 | 	printf("Predicted in %f seconds.\n",what_time_is_it_now()-time);
52 | 
53 |     int nboxes = 0;
54 | 	int total = 0;
55 |     float nms=0.45;
56 | 	float thresh = .5;
57 | 	float hier_thresh = .5;
58 | 
59 |     detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
60 | 
61 |     if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
62 | 
63 |     draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
64 |     free_detections(dets, nboxes);
65 | 
66 | 	save_image_png(im, "predictions");// output
67 | 
68 | 	free_image(im);
69 |     free_image(sized);
70 | 
71 | 	printf("YOLOv3 TEST End\n");
72 | 
73 |     return 0;
74 | }
75 | 


--------------------------------------------------------------------------------
/soft_version/Step03/main3.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <iostream>
 4 | #include <math.h>
 5 | #include <fcntl.h>
 6 | #include <string.h>
 7 | #include <time.h>
 8 | #include "yolov3.h"
 9 | 
10 | int main( int argc, char *argv[])
11 | {
12 | 	//freopen("result.txt","w",stdout);
13 | 
14 | 	printf("YOLOv3 TEST Begin\n");
15 |     char **names = get_labels("coco.names");
16 | 	int x;
17 | 	for(x=0;x<80;x++)//80 classe labels
18 | 	{
19 | 		printf("[%d]%s\n",x,names[x]);
20 | 	}
21 |     image **alphabet = load_alphabet();
22 | 
23 |     network *net = load_network("yolov3.cfg", "yolov3.weights", 0);
24 | 	set_batch_network(net, 1);
25 | 
26 | ////////////////////load img resize img begin
27 | 	char buff[256];
28 |     char *input_imgfn = buff;
29 | 	strncpy(input_imgfn, "dog.jpg", 256);
30 | 	printf("Input img:%s\n",input_imgfn);
31 | 	image im = load_image_stb(input_imgfn, 3);//3 channel img
32 | 	printf("img w=%d,h=%d,c=%d\n",im.w,im.h,im.c);
33 | 	//image sized = letterbox_image(im, 416, 416);
34 | 	image sized = letterbox_image(im, 416, 416);
35 | 
36 | 	save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3
37 | ////////////////////load img resize img end
38 | 
39 | 	time_t first, second;       
40 | 	
41 | 	layer l = net->layers[net->n-1];
42 |     float *X = sized.data;
43 | 
44 | 	//char line[256];
45 | 	//FILE *fp3;
46 | 	//char filename[256];
47 | 	//sprintf(filename, "yolo_layer_input_%d.txt", 123123);
48 | 	//printf("YOLO_layer:intputs=%d,%s\n",416*416*3,filename);
49 |  //   if( (fp3 = fopen(filename, "w")) == NULL)fprintf(stderr,"CANNOT OPEN\n");
50 |  //   for( x = 0; x < l.outputs; x++)
51 | 	//{
52 | 	//	sprintf(line, "%f\n", X[x]);
53 | 	//	if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n");
54 |  //   }
55 |  //   fclose(fp3);
56 | 
57 | 	first=time(NULL); 
58 | 	yolov2_hls_ps(net, X);
59 | 	printf("yolov2_hls_ps END!\n");
60 | 	second=time(NULL); 
61 | 	printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first));
62 | 
63 |     int nboxes = 0;
64 |     float nms=.45;
65 | 	float thresh = .5;
66 | 	float hier_thresh = .5;
67 |     detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
68 |     printf("%d\n", nboxes);
69 | 	printf("get_network_boxes END!\n");
70 | 	/*
71 | 	for(x=0;x<nboxes;x++)
72 | 	{
73 | 		printf("[%3d]:h=%f,w=%f,x=%f,y=%f,objectness=%f\n",x,dets[x].bbox.h,dets[x].bbox.w,dets[x].bbox.x,dets[x].bbox.y,dets[x].objectness);
74 | 	}*/
75 |     if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
76 |     draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
77 | 
78 |     free_detections(dets, nboxes);
79 | 	
80 | ///////////////////write predictions img
81 | 	save_image_png(im, "predictions");// output
82 | 
83 | 	free_image(im);
84 |     free_image(sized);
85 | 
86 | 	printf("YOLOv3 TEST End\n");
87 | 
88 |     return 0;
89 | }
90 | 
91 | 


--------------------------------------------------------------------------------
/yolov3_elf/xconv_hw.h:
--------------------------------------------------------------------------------
  1 | // ==============================================================
  2 | // File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
  3 | // Version: 2017.3
  4 | // Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
  5 | // 
  6 | // ==============================================================
  7 | 
  8 | #ifndef _LENET5_HW_H
  9 | #define _LENET5_HW_H
 10 | 
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <sys/mman.h>
 14 | #include <fcntl.h>
 15 | #include <sys/ioctl.h>
 16 | #include <unistd.h>
 17 | #include <linux/fb.h>
 18 | #include <string.h>
 19 | #include <time.h>
 20 | #include <stdint.h>
 21 | #include <sys/time.h>
 22 | 
 23 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_AP_CTRL            0x000
 24 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_GIE                0x004
 25 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_IER                0x008
 26 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISR                0x00c
 27 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_R_DATA       0x010
 28 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_R_DATA       32
 29 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT1_DATA        0x018
 30 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT1_DATA        32
 31 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT2_DATA        0x020
 32 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT2_DATA        32
 33 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT3_DATA        0x028
 34 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT3_DATA        32
 35 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_R_DATA      0x030
 36 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_R_DATA      32
 37 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT1_DATA       0x038
 38 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT1_DATA       32
 39 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_WEIGHT_DATA        0x040
 40 | #define XYOLO2_FPGA_CTRL_BUS_BITS_WEIGHT_DATA        32
 41 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_BETA_DATA          0x048
 42 | #define XYOLO2_FPGA_CTRL_BUS_BITS_BETA_DATA          32
 43 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INFM_NUM_DATA      0x050
 44 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INFM_NUM_DATA      32
 45 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTFM_NUM_DATA     0x058
 46 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTFM_NUM_DATA     32
 47 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_KERNEL_SIZE_DATA   0x060
 48 | #define XYOLO2_FPGA_CTRL_BUS_BITS_KERNEL_SIZE_DATA   32
 49 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_KERNEL_STRIDE_DATA 0x068
 50 | #define XYOLO2_FPGA_CTRL_BUS_BITS_KERNEL_STRIDE_DATA 32
 51 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_W_DATA       0x070
 52 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_W_DATA       32
 53 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUT_H_DATA       0x078
 54 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUT_H_DATA       32
 55 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_W_DATA      0x080
 56 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_W_DATA      32
 57 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUT_H_DATA      0x088
 58 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUT_H_DATA      32
 59 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_PADDING_DATA       0x090
 60 | #define XYOLO2_FPGA_CTRL_BUS_BITS_PADDING_DATA       32
 61 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISNL_DATA          0x098
 62 | #define XYOLO2_FPGA_CTRL_BUS_BITS_ISNL_DATA          1
 63 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_ISBN_DATA          0x0a0
 64 | #define XYOLO2_FPGA_CTRL_BUS_BITS_ISBN_DATA          1
 65 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TM_DATA            0x0a8
 66 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TM_DATA            32
 67 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TN_DATA            0x0b0
 68 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TN_DATA            32
 69 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TR_DATA            0x0b8
 70 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TR_DATA            32
 71 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TC_DATA            0x0c0
 72 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TC_DATA            32
 73 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_MLOOPS_DATA        0x0c8
 74 | #define XYOLO2_FPGA_CTRL_BUS_BITS_MLOOPS_DATA        32
 75 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_NLOOPS_DATA        0x0d0
 76 | #define XYOLO2_FPGA_CTRL_BUS_BITS_NLOOPS_DATA        32
 77 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_RLOOPS_DATA        0x0d8
 78 | #define XYOLO2_FPGA_CTRL_BUS_BITS_RLOOPS_DATA        32
 79 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_CLOOPS_DATA        0x0e0
 80 | #define XYOLO2_FPGA_CTRL_BUS_BITS_CLOOPS_DATA        32
 81 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_LAYERTYPE_DATA     0x0e8
 82 | #define XYOLO2_FPGA_CTRL_BUS_BITS_LAYERTYPE_DATA     32
 83 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_INPUTQ_DATA        0x0f0
 84 | #define XYOLO2_FPGA_CTRL_BUS_BITS_INPUTQ_DATA        32
 85 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_OUTPUTQ_DATA       0x0f8
 86 | #define XYOLO2_FPGA_CTRL_BUS_BITS_OUTPUTQ_DATA       32
 87 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_WEIGHTQ_DATA       0x100
 88 | #define XYOLO2_FPGA_CTRL_BUS_BITS_WEIGHTQ_DATA       32
 89 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_BETAQ_DATA         0x108
 90 | #define XYOLO2_FPGA_CTRL_BUS_BITS_BETAQ_DATA         32
 91 | #define XYOLO2_FPGA_CTRL_BUS_ADDR_TROW_LOOPS_DATA    0x110
 92 | #define XYOLO2_FPGA_CTRL_BUS_BITS_TROW_LOOPS_DATA    32
 93 | 
 94 | 
 95 | #define YOLO2_BASEADDR 0x43c00000
 96 | 
 97 | #define WriteReg(BaseAddress, RegOffset, Data) *(volatile unsigned int*)((BaseAddress) + (RegOffset)) = (Data)
 98 | #define ReadReg(BaseAddress, RegOffset) *(volatile unsigned int*)((BaseAddress) + (RegOffset))
 99 | 
100 | 
101 | #endif
102 | 


--------------------------------------------------------------------------------
/soft_version/Step02/main3.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <iostream>
  5 | #include <math.h>
  6 | #include <fcntl.h>
  7 | #include <string.h>
  8 | #include <time.h>
  9 | #include "yolov3.h"
 10 | 
 11 | #define MIN_VALUE (-1024*1024*1024)
 12 | #define MAX_VALUE (1024*1024*1024)
 13 | 
 14 | #define QUANTI
 15 | 
 16 | #ifndef QUANTI
 17 | int main( int argc, char *argv[])
 18 | {
 19 | 	//freopen("result.txt","w",stdout);
 20 | 	printf("YOLOv3 TEST Begin\n");
 21 |     	char **names = get_labels("coco.names");
 22 | 	int x;
 23 | 	for(x=0;x<80;x++)//80 classe labels
 24 | 	{
 25 | 		printf("[%d]%s\n",x,names[x]);
 26 | 	}
 27 |     	image **alphabet = load_alphabet();
 28 |     	network *net = load_network("yolov3.cfg");
 29 | 	set_batch_network(net, 1);
 30 | 
 31 | ////////////////////load img resize img begin
 32 | 	char img_buff[256];
 33 | 	char *input_imgfn = img_buff;
 34 | 	if(argc==1)
 35 | 		strncpy(input_imgfn, "./dog.jpg", 256);
 36 | 	else
 37 | 		strncpy(input_imgfn, argv[1], 256);
 38 | 	image im = load_image_stb(input_imgfn, 3);//3 channel img
 39 | 	printf("Input img:%s\n w=%d,h=%d,c=%d\n", input_imgfn, im.w, im.h, im.c);
 40 | 	image sized = letterbox_image(im, 416, 416);
 41 | 	save_image_png(sized, "sized");// convert to yolov3 net input size 416x416x3
 42 | ////////////////////load img resize img end
 43 | 
 44 | 	time_t first, second;       
 45 | 	layer l = net->layers[net->n-1];
 46 |     	float *X = sized.data;
 47 | 
 48 | 	first=time(NULL); 
 49 | 	yolov2_hls_ps(net, X);
 50 | 	second=time(NULL); 
 51 | 	printf("%s: Predicted in %f seconds.\n", input_imgfn, difftime(second,first));
 52 | 
 53 |     int nboxes = 0;
 54 |     float nms=.45;
 55 | 	float thresh = .5;
 56 | 	float hier_thresh = .5;
 57 |     detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
 58 |     printf("%d\n", nboxes);
 59 | 	//for(x=0;x<nboxes;x++)
 60 | 	//{
 61 | 	//	printf("[%3d]:h=%f,w=%f,x=%f,y=%f,objectness=%f\n",x,dets[x].bbox.h,dets[x].bbox.w,dets[x].bbox.x,dets[x].bbox.y,dets[x].objectness);
 62 | 	//}
 63 | 
 64 |     if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
 65 |     draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
 66 | 
 67 |     free_detections(dets, nboxes);
 68 | 	
 69 | ///////////////////write predictions img
 70 | 	save_image_png(im, "predictions");// output
 71 | 
 72 | 	free_image(im);
 73 |     free_image(sized);
 74 | 
 75 | 	printf("YOLOv3 TEST End\n");
 76 | 
 77 |     return 0;
 78 | }
 79 | 
 80 | #else
 81 | 
 82 | int quantize_short16(float *in,short *out,int *offset,int layer_num,float *ap16_range,int *maxQ_array)
 83 | {
 84 | 	int i;
 85 | 	int offset_index = 0;
 86 | 	int woffset = 0;
 87 | 	for(i=0;i<layer_num;i++)
 88 | 	{
 89 | 		if(offset[offset_index]==0)
 90 | 			return i;
 91 | 		printf("Layer %2d;weight num=%12d ",i,offset[offset_index]);
 92 | 		int j;
 93 | 		float min,max;
 94 | 		min = MAX_VALUE;
 95 | 		max = MIN_VALUE;
 96 | 		for(j=0;j<offset[offset_index];j++)
 97 | 		{
 98 | 			float tmp_in_float = in[woffset+j];
 99 | 			if(tmp_in_float<min)
100 | 				min = tmp_in_float;
101 | 			if(tmp_in_float>max)
102 | 				max = tmp_in_float;
103 | 		}
104 | 		printf("float min=%.7lf,max=%.7lf ",min,max);//find float min max
105 | 
106 | 		int k;
107 | 		int maxQ = -1;
108 | 		for(k=0;k<16;k++)//find maxQ
109 | 		{
110 | 			if(min>ap16_range[2*k]&&max<ap16_range[2*k+1])
111 | 			{
112 | 				maxQ = k;
113 | 			}
114 | 			else if(k==0)
115 | 			{
116 | 				printf("beyond Q0 min=%.7lf,max=%.7lf ",min,max);
117 | 				break;
118 | 			}
119 | 		}
120 | 		printf("maxQ=%d ",maxQ);
121 | 		maxQ_array[i] = maxQ;
122 | 
123 | 		double max_error,min_error,sum_error;
124 | 		sum_error = 0;
125 | 		max_error = MIN_VALUE;
126 | 		min_error = MAX_VALUE;
127 | 		for(j=0;j<offset[offset_index];j++)
128 | 		{
129 | 			float tmp_in_float = in[woffset+j];
130 | 			short tmp_fixed = (short)(tmp_in_float*pow(2.0,maxQ));
131 | 			float tmp_out_float = (float)tmp_fixed*pow(2.0,-maxQ);
132 | 			double error = (tmp_out_float - tmp_in_float)*(tmp_out_float - tmp_in_float);
133 | 			error = sqrt(error);
134 | 			sum_error += error;
135 | 			if(error<min_error)
136 | 				min_error = error;
137 | 			if(error>max_error)
138 | 				max_error = error;
139 | 
140 | 			out[woffset+j] = tmp_fixed;
141 | 		}
142 | 		printf("sum2_error = %.7lf,min_error=%.7lf,max_error=%.7lf",sum_error,min_error,max_error);
143 | 		printf("\n");
144 | 
145 | 		woffset += offset[offset_index];
146 | 		offset_index++;
147 | 	}
148 | 
149 | 	return 0;
150 | }
151 | 
152 | int main(int argc,char *argv[])
153 | {
154 | 	int i;
155 | 	printf("Test fixed-point\n");
156 | /*
157 | 	int weight_offset[32] = {864, 18432, 73728, 8192, 73728,
158 | 		294912, 32768, 294912, 1179648, 131072, 1179648, 131072,
159 | 		1179648, 4718592, 524288, 4718592, 524288, 4718592, 9437184,
160 | 		9437184, 32768, 11796480, 435200, 0, 0, 0, 0, 0, 0, 0, 0, 0};
161 | 
162 | 	int beta_offset[32] = {32, 64, 128, 64, 128, 256, 128, 256, 512, 256, 512, 256, 512, 1024,
163 | 		512, 1024, 512, 1024, 1024, 1024, 64, 1024, 425, 0, 0, 0, 0, 0, 0, 0, 0, 0};
164 | 
165 | 	short *Weight_fixed_buf = (short *)calloc(203767168/4,sizeof(short));
166 | 	float *Weight_buf = (float *)calloc(203767168/4,sizeof(float));
167 | 	float *Beta_buf   = (float *)calloc(43044/4,sizeof(float));
168 | */
169 | int yolo3_weight_offset[128] = {864,18432,2048,18432,
170 | 		73728,8192,73728,
171 | 		8192,73728,
172 | 		294912,32768,294912,
173 | 		32768,294912,
174 | 		32768,294912,
175 | 		32768,294912,
176 | 		32768,294912,
177 | 		32768,294912,
178 | 		32768,294912,
179 | 		32768,294912,
180 | 		1179648,131072,1179648,
181 | 		131072,1179648,
182 | 		131072,1179648,
183 | 		131072,1179648,
184 | 		131072,1179648,
185 | 		131072,1179648,
186 | 		131072,1179648,
187 | 		131072,1179648,
188 | 		4718592,524288,4718592,
189 | 		524288,4718592,
190 | 		524288,4718592,
191 | 		524288,4718592,
192 | 		524288,4718592,524288,4718592,524288,4718592,261120,
193 | 		131072,
194 | 		196608,1179648,131072,1179648,131072,1179648,130560,
195 | 		32768,
196 | 		49152,294912,32768,294912,32768,294912,65280,
197 | 		0,0,0,0,0,0,0,0,0,0,
198 | 		0,0,0,0,0,0,0,0,0,0,
199 | 		0,0,0,0,0,0,0,0,0,0,
200 | 		0,0,0,0,0,0,0,0,0,0,
201 | 		0,0,0,0,0,0,0,0,0,0,
202 | 		0,0,0};
203 | 	
204 | 	int yolo3_beta_offset[128] = {32,64,32,64,
205 | 		128,64,128,
206 | 		64,128,
207 | 		256,128,256,
208 | 		128,256,
209 | 		128,256,
210 | 		128,256,
211 | 		128,256,
212 | 		128,256,
213 | 		128,256,
214 | 		128,256,
215 | 		512,256,512,
216 | 		256,512,
217 | 		256,512,
218 | 		256,512,
219 | 		256,512,
220 | 		256,512,
221 | 		256,512,
222 | 		256,512,
223 | 		1024,512,1024,
224 | 		512,1024,
225 | 		512,1024,
226 | 		512,1024,
227 | 		512,1024,512,1024,512,1024,255,
228 | 		256,
229 | 		256,512,256,512,256,512,255,
230 | 		128,
231 | 		128,256,128,256,128,256,255,
232 | 		0,0,0,0,0,0,0,0,0,0,
233 | 		0,0,0,0,0,0,0,0,0,0,
234 | 		0,0,0,0,0,0,0,0,0,0,
235 | 		0,0,0,0,0,0,0,0,0,0,
236 | 		0,0,0,0,0,0,0,0,0,0,
237 | 		0,0,0};
238 | 	
239 | 	short* yolo3_weight_fixed_buf = (short *)calloc(247583104/4,sizeof(short));
240 | 	float* yolo3_weight_buf = (float *)calloc(247583104/4,sizeof(float));
241 | 	float* yolo3_beta_buf   = (float *)calloc(108276/4,sizeof(float));
242 | 	
243 | 	
244 | 	FILE *fp_w = fopen("weights_reorg.bin", "rb");
245 |     if(!fp_w) printf("fopen weights_reorg.bin error\n");
246 | 	FILE *fp_b = fopen("bias.bin", "rb");
247 |     if(!fp_b) printf("fopen bias.bin error\n");
248 | /*
249 | 	fread(Weight_buf, sizeof(float), 203767168/4, fp_w);
250 | 	fread(Beta_buf, sizeof(float), 43044/4, fp_b);
251 | */
252 |     fread(yolo3_weight_buf, sizeof(float), 247583104/4, fp_w);
253 | 	fread(yolo3_beta_buf, sizeof(float), 108276/4, fp_b);
254 | 	
255 | 	fclose(fp_w);
256 | 	fclose(fp_b);
257 | ////////////////////////////////
258 | 
259 | 	short ap16_min = 0x8000;
260 | 	short ap16_max = 0x7fff;
261 | 	printf("ap16_min = %d \nap16_max = %d\n",ap16_min,ap16_max);
262 | 	float ap16_range[16*2];
263 | 	for(i=0;i<16;i++)
264 | 	{
265 | 		printf("Q%2d:",i);
266 | 		ap16_range[2*i]   = (float)ap16_min*pow((float)2,-i);//min
267 | 		ap16_range[2*i+1] = (float)ap16_max*pow((float)2,-i);//max
268 | 		printf("min=%.7lf,max=%.7lf\n",ap16_range[2*i],ap16_range[2*i+1]);
269 | 	}
270 | ////////////////////////////////
271 | 	int maxQ_array[32];
272 | 	int layer_num;
273 | 	FILE* fout;
274 | 	char layer_num_string[256];
275 | 	char s[256];
276 | 
277 | 	printf("weight quantize begin\n");
278 | 	layer_num = quantize_short16(yolo3_weight_buf,yolo3_weight_fixed_buf,yolo3_weight_offset,128,ap16_range,maxQ_array);
279 | 	for(i=0;i<layer_num;i++)
280 | 	{
281 | 		printf("[%d]=%d\n",i,maxQ_array[i]);
282 | 	}
283 | 	sprintf(s,"weights_reorg_ap16_maxQ_%d.bin", layer_num);
284 | 	printf("%s\n",s);
285 | 
286 | 	fout = fopen(s,"wb");
287 |     if(!fout) printf("fopen %s error\n",s);
288 | 	fwrite(maxQ_array,sizeof(int), layer_num,fout);
289 | 	fclose(fout);
290 | 
291 | 	fout = fopen("weights_reorg_ap16.bin","wb");
292 |     if(!fout) printf("fopen weights_reorg_ap16.bin error\n");
293 | 	//fwrite(Weight_fixed_buf,sizeof(short), 203767168/4,fout);
294 | 	fwrite(yolo3_weight_fixed_buf,sizeof(short), 247583104/4,fout);
295 | 	fclose(fout);
296 | 	printf("weight quantize end\n");
297 | 
298 | 	printf("beta quantize begin\n");
299 | 	layer_num = quantize_short16(yolo3_beta_buf,yolo3_weight_fixed_buf,yolo3_beta_offset,128,ap16_range,maxQ_array);
300 | 	for(i=0;i<layer_num;i++)
301 | 	{
302 | 		printf("[%d]=%d\n",i,maxQ_array[i]);
303 | 	}
304 | 	sprintf(s,"bias_ap16_maxQ_%d.bin", layer_num);
305 | 	printf("%s\n",s);
306 | 
307 | 	fout = fopen(s,"wb");
308 |     if(!fout) printf("fopen %s error\n",s);
309 | 	fwrite(maxQ_array,sizeof(int), layer_num,fout);
310 | 	fclose(fout);
311 | 
312 | 	fout = fopen("bias_ap16.bin","wb");
313 |     if(!fout) printf("fopen bias_ap16.bin error\n");
314 | 	//fwrite(Weight_fixed_buf,sizeof(short), 43044/4+1,fout);
315 | 	fwrite(yolo3_weight_fixed_buf,sizeof(short), 108276/4+1,fout);
316 | 	fclose(fout);
317 | 	printf("beta quantize end\n");
318 | 
319 | 	free(yolo3_weight_fixed_buf);
320 | 	free(yolo3_weight_buf);
321 | 	free(yolo3_beta_buf);
322 | 	
323 | 	printf("0.1=%.10lf\n",((short)(0.1*pow(2.0,15)))*pow(2.0,-15));
324 | 	printf("0.1=%x\n",(short)(0.1*pow(2.0,15)));
325 | 	return 0;
326 | }
327 | 
328 | #endif
329 | 
330 | 


--------------------------------------------------------------------------------
/soft_version/Step02/yolov3.cfg:
--------------------------------------------------------------------------------
  1 | [net]
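  2 | # Testing (inference): uncomment the next two lines and comment out the Training pair below
  3 | # batch=1
  4 | # subdivisions=1
  5 | # Training (currently active)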
  6 | batch=64
  7 | subdivisions=16
  8 | width=416
  9 | height=416
 10 | channels=3
 11 | momentum=0.9
 12 | decay=0.0005
 13 | angle=0
 14 | saturation = 1.5
 15 | exposure = 1.5
 16 | hue=.1
 17 | 
 18 | learning_rate=0.001
 19 | burn_in=1000
 20 | max_batches = 500200
 21 | policy=steps
 22 | steps=400000,450000
 23 | scales=.1,.1
 24 | 
 25 | [convolutional]
 26 | batch_normalize=1
 27 | filters=32
 28 | size=3
 29 | stride=1
 30 | pad=1
 31 | activation=leaky
 32 | 
 33 | # Downsample
 34 | 
 35 | [convolutional]
 36 | batch_normalize=1
 37 | filters=64
 38 | size=3
 39 | stride=2
 40 | pad=1
 41 | activation=leaky
 42 | 
 43 | [convolutional]
 44 | batch_normalize=1
 45 | filters=32
 46 | size=1
 47 | stride=1
 48 | pad=1
 49 | activation=leaky
 50 | 
 51 | [convolutional]
 52 | batch_normalize=1
 53 | filters=64
 54 | size=3
 55 | stride=1
 56 | pad=1
 57 | activation=leaky
 58 | 
 59 | [shortcut]
 60 | from=-3
 61 | activation=linear
 62 | 
 63 | # Downsample
 64 | 
 65 | [convolutional]
 66 | batch_normalize=1
 67 | filters=128
 68 | size=3
 69 | stride=2
 70 | pad=1
 71 | activation=leaky
 72 | 
 73 | [convolutional]
 74 | batch_normalize=1
 75 | filters=64
 76 | size=1
 77 | stride=1
 78 | pad=1
 79 | activation=leaky
 80 | 
 81 | [convolutional]
 82 | batch_normalize=1
 83 | filters=128
 84 | size=3
 85 | stride=1
 86 | pad=1
 87 | activation=leaky
 88 | 
 89 | [shortcut]
 90 | from=-3
 91 | activation=linear
 92 | 
 93 | [convolutional]
 94 | batch_normalize=1
 95 | filters=64
 96 | size=1
 97 | stride=1
 98 | pad=1
 99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 | 
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 | 
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 | 
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 | 
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 | 
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 | 
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 | 
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 | 
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 | 
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 | 
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 | 
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 | 
203 | 
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 | 
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 | 
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 | 
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 | 
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 | 
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 | 
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 | 
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 | 
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 | 
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 | 
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 | 
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 | 
284 | # Downsample
285 | 
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 | 
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 | 
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 | 
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 | 
314 | 
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 | 
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 | 
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 | 
335 | 
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 | 
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 | 
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 | 
356 | 
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 | 
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 | 
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 | 
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 | 
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 | 
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 | 
397 | 
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 | 
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 | 
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 | 
418 | 
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 | 
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 | 
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 | 
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 | 
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 | 
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 | 
459 | # Downsample
460 | 
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 | 
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 | 
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 | 
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 | 
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 | 
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 | 
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 | 
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 | 
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 | 
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 | 
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 | 
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 | 
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 | 
549 | ###################### Detection layers: the convolutions below feed the three [yolo] outputs
550 | 
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 | 
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 | 
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 | 
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 | 
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 | 
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 | 
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 | 
606 | 
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 | 
617 | 
618 | [route]
619 | layers = -4
620 | 
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 | 
629 | [upsample]
630 | stride=2
631 | 
632 | [route]
633 | layers = -1, 61
634 | 
635 | 
636 | 
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 | 
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 | 
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 | 
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 | 
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 | 
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 | 
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 | 
692 | 
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 | 
703 | 
704 | 
705 | [route]
706 | layers = -4
707 | 
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 | 
716 | [upsample]
717 | stride=2
718 | 
719 | [route]
720 | layers = -1, 36
721 | 
722 | 
723 | 
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 | 
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 | 
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 | 
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 | 
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 | 
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 | 
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 | 
779 | 
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 | 
790 | 


--------------------------------------------------------------------------------
/soft_version/Step03/yolov3.cfg:
--------------------------------------------------------------------------------
  1 | [net]
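  2 | # Testing (inference): uncomment the next two lines and comment out the Training pair below
  3 | # batch=1
  4 | # subdivisions=1
  5 | # Training (currently active)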
  6 | batch=64
  7 | subdivisions=16
  8 | width=416
  9 | height=416
 10 | channels=3
 11 | momentum=0.9
 12 | decay=0.0005
 13 | angle=0
 14 | saturation = 1.5
 15 | exposure = 1.5
 16 | hue=.1
 17 | 
 18 | learning_rate=0.001
 19 | burn_in=1000
 20 | max_batches = 500200
 21 | policy=steps
 22 | steps=400000,450000
 23 | scales=.1,.1
 24 | 
 25 | [convolutional]
 26 | batch_normalize=1
 27 | filters=32
 28 | size=3
 29 | stride=1
 30 | pad=1
 31 | activation=leaky
 32 | 
 33 | # Downsample
 34 | 
 35 | [convolutional]
 36 | batch_normalize=1
 37 | filters=64
 38 | size=3
 39 | stride=2
 40 | pad=1
 41 | activation=leaky
 42 | 
 43 | [convolutional]
 44 | batch_normalize=1
 45 | filters=32
 46 | size=1
 47 | stride=1
 48 | pad=1
 49 | activation=leaky
 50 | 
 51 | [convolutional]
 52 | batch_normalize=1
 53 | filters=64
 54 | size=3
 55 | stride=1
 56 | pad=1
 57 | activation=leaky
 58 | 
 59 | [shortcut]
 60 | from=-3
 61 | activation=linear
 62 | 
 63 | # Downsample
 64 | 
 65 | [convolutional]
 66 | batch_normalize=1
 67 | filters=128
 68 | size=3
 69 | stride=2
 70 | pad=1
 71 | activation=leaky
 72 | 
 73 | [convolutional]
 74 | batch_normalize=1
 75 | filters=64
 76 | size=1
 77 | stride=1
 78 | pad=1
 79 | activation=leaky
 80 | 
 81 | [convolutional]
 82 | batch_normalize=1
 83 | filters=128
 84 | size=3
 85 | stride=1
 86 | pad=1
 87 | activation=leaky
 88 | 
 89 | [shortcut]
 90 | from=-3
 91 | activation=linear
 92 | 
 93 | [convolutional]
 94 | batch_normalize=1
 95 | filters=64
 96 | size=1
 97 | stride=1
 98 | pad=1
 99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 | 
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 | 
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 | 
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 | 
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 | 
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 | 
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 | 
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 | 
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 | 
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 | 
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 | 
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 | 
203 | 
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 | 
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 | 
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 | 
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 | 
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 | 
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 | 
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 | 
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 | 
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 | 
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 | 
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 | 
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 | 
284 | # Downsample
285 | 
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 | 
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 | 
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 | 
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 | 
314 | 
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 | 
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 | 
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 | 
335 | 
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 | 
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 | 
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 | 
356 | 
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 | 
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 | 
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 | 
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 | 
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 | 
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 | 
397 | 
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 | 
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 | 
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 | 
418 | 
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 | 
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 | 
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 | 
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 | 
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 | 
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 | 
459 | # Downsample
460 | 
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 | 
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 | 
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 | 
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 | 
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 | 
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 | 
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 | 
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 | 
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 | 
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 | 
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 | 
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 | 
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 | 
549 | ###################### Detection layers: the convolutions below feed the three [yolo] outputs
550 | 
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 | 
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 | 
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 | 
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 | 
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 | 
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 | 
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 | 
606 | 
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 | 
617 | 
618 | [route]
619 | layers = -4
620 | 
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 | 
629 | [upsample]
630 | stride=2
631 | 
632 | [route]
633 | layers = -1, 61
634 | 
635 | 
636 | 
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 | 
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 | 
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 | 
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 | 
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 | 
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 | 
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 | 
692 | 
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 | 
703 | 
704 | 
705 | [route]
706 | layers = -4
707 | 
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 | 
716 | [upsample]
717 | stride=2
718 | 
719 | [route]
720 | layers = -1, 36
721 | 
722 | 
723 | 
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 | 
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 | 
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 | 
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 | 
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 | 
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 | 
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 | 
779 | 
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 | 
790 | 


--------------------------------------------------------------------------------
/yolov3_hls/testBench/yolov3.cfg:
--------------------------------------------------------------------------------
  1 | [net]
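  2 | # Testing (inference): uncomment the next two lines and comment out the Training pair below
  3 | # batch=1
  4 | # subdivisions=1
  5 | # Training (currently active)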
  6 | batch=64
  7 | subdivisions=16
  8 | width=416
  9 | height=416
 10 | channels=3
 11 | momentum=0.9
 12 | decay=0.0005
 13 | angle=0
 14 | saturation = 1.5
 15 | exposure = 1.5
 16 | hue=.1
 17 | 
 18 | learning_rate=0.001
 19 | burn_in=1000
 20 | max_batches = 500200
 21 | policy=steps
 22 | steps=400000,450000
 23 | scales=.1,.1
 24 | 
 25 | [convolutional]
 26 | batch_normalize=1
 27 | filters=32
 28 | size=3
 29 | stride=1
 30 | pad=1
 31 | activation=leaky
 32 | 
 33 | # Downsample
 34 | 
 35 | [convolutional]
 36 | batch_normalize=1
 37 | filters=64
 38 | size=3
 39 | stride=2
 40 | pad=1
 41 | activation=leaky
 42 | 
 43 | [convolutional]
 44 | batch_normalize=1
 45 | filters=32
 46 | size=1
 47 | stride=1
 48 | pad=1
 49 | activation=leaky
 50 | 
 51 | [convolutional]
 52 | batch_normalize=1
 53 | filters=64
 54 | size=3
 55 | stride=1
 56 | pad=1
 57 | activation=leaky
 58 | 
 59 | [shortcut]
 60 | from=-3
 61 | activation=linear
 62 | 
 63 | # Downsample
 64 | 
 65 | [convolutional]
 66 | batch_normalize=1
 67 | filters=128
 68 | size=3
 69 | stride=2
 70 | pad=1
 71 | activation=leaky
 72 | 
 73 | [convolutional]
 74 | batch_normalize=1
 75 | filters=64
 76 | size=1
 77 | stride=1
 78 | pad=1
 79 | activation=leaky
 80 | 
 81 | [convolutional]
 82 | batch_normalize=1
 83 | filters=128
 84 | size=3
 85 | stride=1
 86 | pad=1
 87 | activation=leaky
 88 | 
 89 | [shortcut]
 90 | from=-3
 91 | activation=linear
 92 | 
 93 | [convolutional]
 94 | batch_normalize=1
 95 | filters=64
 96 | size=1
 97 | stride=1
 98 | pad=1
 99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 | 
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 | 
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 | 
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 | 
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 | 
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 | 
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 | 
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 | 
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 | 
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 | 
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 | 
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 | 
203 | 
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 | 
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 | 
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 | 
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 | 
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 | 
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 | 
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 | 
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 | 
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 | 
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 | 
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 | 
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 | 
284 | # Downsample
285 | 
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 | 
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 | 
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 | 
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 | 
314 | 
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 | 
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 | 
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 | 
335 | 
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 | 
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 | 
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 | 
356 | 
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 | 
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 | 
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 | 
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 | 
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 | 
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 | 
397 | 
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 | 
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 | 
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 | 
418 | 
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 | 
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 | 
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 | 
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 | 
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 | 
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 | 
459 | # Downsample
460 | 
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 | 
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 | 
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 | 
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 | 
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 | 
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 | 
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 | 
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 | 
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 | 
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 | 
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 | 
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 | 
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 | 
549 | ######################
550 | 
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 | 
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 | 
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 | 
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 | 
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 | 
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 | 
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 | 
606 | 
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 | 
617 | 
618 | [route]
619 | layers = -4
620 | 
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 | 
629 | [upsample]
630 | stride=2
631 | 
632 | [route]
633 | layers = -1, 61
634 | 
635 | 
636 | 
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 | 
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 | 
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 | 
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 | 
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 | 
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 | 
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 | 
692 | 
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 | 
703 | 
704 | 
705 | [route]
706 | layers = -4
707 | 
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 | 
716 | [upsample]
717 | stride=2
718 | 
719 | [route]
720 | layers = -1, 36
721 | 
722 | 
723 | 
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 | 
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 | 
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 | 
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 | 
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 | 
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 | 
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 | 
779 | 
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 | 
790 | 


--------------------------------------------------------------------------------
/yolov3_hls/sourceFile/yolov3.cpp:
--------------------------------------------------------------------------------
   1 | 
   2 | #include <stdio.h>
   3 | #include <string.h>
   4 | #include <assert.h>
   5 | #include <ap_int.h>
   6 | #include "cnn.h"
   7 | 
   8 | 
/*
 * Original author's note (translated from pinyin):
 * "Nothing was changed, so why is conv suddenly giving wrong results?"
 * */
////////////////////////////////////////////20181229 n4m32  v2 without input and reorg opt ok input opt ok combine input relu comb ok // input opt ok // output opt ok //weight opt ok (5)n4m32i4o2 ok start
// Function-like min/max; each argument may be evaluated twice, so pass side-effect-free expressions only.
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))
// S and K size the on-chip input buffer (see OnChipIB_* below); K is also the kernel extent of weight_buffer[Tm][Tn][K][K].
// NOTE(review): S presumably is the largest supported stride - confirm.
#define S 2
#define K 3

// Original note (translated): "still not sure what Tn and Tm mean".
// From their use in this file: Tn is the first dimension of input_buffer (and the second of weight_buffer),
// Tm is the first dimension of weight_buffer and output_buffer.
#define Tn 4
#define Tm 32

// Original note (translated): Tr and Tc must be set according to the network size.
// They are the row/column extents of output_buffer (scaled by PARA).
#define Tr 26
#define Tc 26

// SIZE is one of the input widths special-cased in input_load (four rows are fetched per step when Input_w == SIZE).
#define SIZE 13
#define PARA 1

// On-chip input buffer extents: (T-1)*S+K, i.e. 53 with the values above.
#define OnChipIB_Width  ((Tc-1)*S+K)
#define OnChipIB_Height ((Tr-1)*S+K)
// Original note (translated): MAX_BETA_LENGTH must be set according to the measured data.
#define MAX_BETA_LENGTH (1024)
// Bit position used by shortcut() when computing the shift applied to its second input (INTERWIDTH - In1Sub).
#define INTERWIDTH 20
/*
Original design notes (translated):
Option 1: load the two shortcut operands separately, two loads of four each
	(currently being tried)
Option 2: rewrite the load function so that it uses only two ports
	(waiting for the result of option 1)
*/

// Unsigned 8-bit helper type used for small counts and shift amounts.
typedef unsigned char UCHAR;
  42 | 
  43 | void mmcpy_inputport(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
  44 | {
  45 | 	bool enable = TN_MIN > 0;
  46 | 	if(!enable)
  47 | 		return;
  48 | 
  49 | 	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));
  50 | 
  51 | }
  52 | 
  53 | void mmcpy_inputport1(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
  54 | {
  55 | 	bool enable = TN_MIN > 1;
  56 | 	if(!enable)
  57 | 		return;
  58 | 
  59 | 	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));
  60 | 
  61 | }
  62 | 
  63 | void mmcpy_inputport2(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
  64 | {
  65 | 	bool enable = TN_MIN > 2;
  66 | 	if(!enable)
  67 | 		return;
  68 | 
  69 | 	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));
  70 | 
  71 | 
  72 | }
  73 | 
  74 | void mmcpy_inputport3(int *input,int input_memcpy_buffer[(OnChipIB_Width+3)/2],ap_uint<3> TN_MIN,int RowOffset,UCHAR RowIntNum)
  75 | {
  76 | 	bool enable = TN_MIN > 3;
  77 | 	if(!enable)
  78 | 		return;
  79 | 
  80 | 	memcpy(input_memcpy_buffer,(int *)(input + RowOffset),RowIntNum*sizeof(int));
  81 | 
  82 | }
  83 | 
  84 | /*
  85 | 是从四个端口加入的
  86 | 主要是看大小
  87 | */
  88 | void mmcpy_inputpixel_m2b_comb(int *input,int *input1,int *input2,int *input3,
  89 | 						  int input_memcpy_buffer[(OnChipIB_Width+3)/2],int input_memcpy_buffer1[(OnChipIB_Width+3)/2],
  90 | 						  int input_memcpy_buffer2[(OnChipIB_Width+3)/2],int input_memcpy_buffer3[(OnChipIB_Width+3)/2],
  91 | 						  ap_uint<1>  RowBeginByte[Tn],ap_uint<3> TN_MIN_3b,ap_uint<6> t2,ap_uint<1> RowSub,int IN_OFFSET,ap_uint<9> RowIncreaseLength,ap_uint<18> IHxIW_18b,ap_uint<6> ColIncreaseLength,ap_uint<6> next_t2[1],bool next_IsRowPixel[1],bool IsRowPixel,bool enable)
  92 | {
  93 | 	static int tmp_inoffset;
  94 | 
  95 | 	next_t2[0] = t2;
  96 | 	next_IsRowPixel[0] = IsRowPixel;
  97 | 
  98 | 	if(!enable)
  99 | 		return;
 100 | 
 101 | 	bool init = (t2==0);
 102 | 	if(init)
 103 | 	{
 104 | 		tmp_inoffset = IN_OFFSET;
 105 | 	}else
 106 | 	{
 107 | 		tmp_inoffset += RowIncreaseLength;
 108 | 	}
 109 | 
 110 | 	int InOffset[Tn];
 111 | #pragma HLS ARRAY_PARTITION variable=InOffset complete dim=1
 112 | 	int RowOffset[Tn];
 113 | #pragma HLS ARRAY_PARTITION variable=RowOffset complete dim=1
 114 | 	ap_uint<1>  LowBit[Tn];
 115 | #pragma HLS ARRAY_PARTITION variable=LowBit complete dim=1
 116 | 	UCHAR BeginByteNum[Tn];
 117 | #pragma HLS ARRAY_PARTITION variable=BeginByteNum complete dim=1
 118 | 	UCHAR RowIntNum[Tn];
 119 | #pragma HLS ARRAY_PARTITION variable=RowIntNum complete dim=1
 120 | 
 121 | 	int t1;
 122 | 	for(t1 = 0;t1 < Tn;t1++)
 123 | 	{
 124 | #pragma HLS UNROLL
 125 | 		InOffset[t1] = tmp_inoffset + t1*IHxIW_18b;
 126 | 		RowOffset[t1] = InOffset[t1] >> 1;
 127 | 		LowBit[t1] = InOffset[t1]&0x1;
 128 | 		RowBeginByte[t1] = LowBit[t1];
 129 | 		BeginByteNum[t1] = ColIncreaseLength + LowBit[t1];
 130 | 
 131 | //		assert((BeginByteNum[t1] > 0)&&(BeginByteNum[t1] < 256));
 132 | 
 133 | 		RowIntNum[t1] = BeginByteNum[t1] >> 1;
 134 | 		if(BeginByteNum[t1]&0x1)
 135 | 			RowIntNum[t1]++;
 136 | 
 137 | //		assert((RowIntNum[t1] > 0)&&(RowIntNum[t1] < 256));
 138 | 	}
 139 | 
 140 | 	mmcpy_inputport(input,input_memcpy_buffer, TN_MIN_3b,RowOffset[0],RowIntNum[0]);
 141 | 	mmcpy_inputport1(input1,input_memcpy_buffer1, TN_MIN_3b,RowOffset[1],RowIntNum[1]);
 142 | 	mmcpy_inputport2(input2,input_memcpy_buffer2, TN_MIN_3b,RowOffset[2],RowIntNum[2]);
 143 | 	mmcpy_inputport3(input3,input_memcpy_buffer3, TN_MIN_3b,RowOffset[3],RowIntNum[3]);
 144 | }
 145 | 
 146 | void copy_input2buf_row(short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],ap_uint<6> row_len,ap_uint<6> col_len,ap_uint<1> RowSub,ap_uint<1> ColSub,
 147 | 					int input_memcpy_buffer[(OnChipIB_Width+3)/2],int input_memcpy_buffer1[(OnChipIB_Width+3)/2],
 148 | 					int input_memcpy_buffer2[(OnChipIB_Width+3)/2],int input_memcpy_buffer3[(OnChipIB_Width+3)/2],
 149 | 					ap_uint<1>  RowBeginByte[Tn],UCHAR TRow,UCHAR TCol,int LayerType,ap_uint<6> next_t2[1],bool next_enable[1],bool enable,ap_uint<3> T2Rate)
 150 | {
 151 | 
 152 | 	if(!enable)
 153 | 		return;
 154 | 
 155 | 	static ap_uint<6> t2_local = 0;
 156 | 	ap_uint<6> t2 = next_t2[0];
 157 | 	bool IsRowPixel = next_enable[0];
 158 | 	int t1,t3;
 159 | 	ap_uint<6> t2r;
 160 | 	ap_uint<3> T2R;
 161 | 
 162 | 	bool initial = (t2==0);
 163 | 	if(initial)
 164 | 	{
 165 | 		t2_local = 0;
 166 | 	}
 167 | 
 168 | 	short pad_value = 0;
 169 | 	/*
 170 | 	if(LayerType==1)
 171 | 		pad_value = 0x8001;
 172 | 	*/
 173 | 	int input_mmcpy_offset[Tn];
 174 | #pragma HLS ARRAY_PARTITION variable=input_mmcpy_offset complete dim=1
 175 | 	bool NextInputFlag[Tn];
 176 | #pragma HLS ARRAY_PARTITION variable=NextInputFlag complete dim=1
 177 | 	ap_uint<1>  cnt[Tn];
 178 | #pragma HLS ARRAY_PARTITION variable=cnt complete dim=1
 179 | 	short input_array[Tn][2];
 180 | #pragma HLS ARRAY_PARTITION variable=input_array complete dim=1
 181 | 
 182 | 	for(t1 = 0;t1 < Tn; t1++)
 183 | 	{
 184 | #pragma HLS UNROLL
 185 | 		input_mmcpy_offset[t1] = 0;
 186 | 	}
 187 | 
 188 | 	if(!IsRowPixel)
 189 | 	{
 190 | 		T2R = T2Rate + 1;
 191 | 	}else
 192 | 	{
 193 | 		T2R = T2Rate;
 194 | 	}
 195 | //	ap_uint<6> T2R_bound = MIN(t2_local + T2R,OnChipIB_Height);
 196 | 	unsigned char tmp_min = t2_local + T2R;
 197 | 	ap_uint<6> T2R_bound = MIN(tmp_min, OnChipIB_Height);
 198 | 
 199 | 	bool IsRowInit_flag = true;
 200 | 
 201 | 	for(t2r = t2_local;t2r < T2R_bound; t2r++)
 202 | 		for(t3 = 0;t3 < TCol; t3++)
 203 | 		{
 204 | #pragma HLS PIPELINE
 205 | 			bool IsRowPixel_t2r = (t2r >= RowSub)&&(t2r < (row_len + RowSub));
 206 | 			bool IsColPixel = (t3 >= ColSub)&&(t3 < (col_len + ColSub));
 207 | 			bool IsRowInit = (t3==ColSub)&&IsRowInit_flag;
 208 | 
 209 | 			if(IsRowPixel_t2r&&IsColPixel)
 210 | 			{
 211 | 				if(IsRowInit)
 212 | 				{
 213 | 					IsRowInit_flag = false;
 214 | 					cnt[0] = RowBeginByte[0];
 215 | 					cnt[1] = RowBeginByte[1];
 216 | 					cnt[2] = RowBeginByte[2];
 217 | 					cnt[3] = RowBeginByte[3];
 218 | 					NextInputFlag[0] = true;
 219 | 					NextInputFlag[1] = true;
 220 | 					NextInputFlag[2] = true;
 221 | 					NextInputFlag[3] = true;
 222 | 				}
 223 | 
 224 | 				if(NextInputFlag[0])
 225 | 				{
 226 | 					input_array[0][0] = input_memcpy_buffer[input_mmcpy_offset[0]];
 227 | 					input_array[0][1] = input_memcpy_buffer[input_mmcpy_offset[0]] >> 16;
 228 | 					input_mmcpy_offset[0]++;
 229 | 					NextInputFlag[0] = false;
 230 | 				}
 231 | 
 232 | 				if(NextInputFlag[1])
 233 | 				{
 234 | 					input_array[1][0] = input_memcpy_buffer1[input_mmcpy_offset[1]];
 235 | 					input_array[1][1] = input_memcpy_buffer1[input_mmcpy_offset[1]] >> 16;
 236 | 					input_mmcpy_offset[1]++;
 237 | 					NextInputFlag[1] = false;
 238 | 				}
 239 | 
 240 | 				if(NextInputFlag[2])
 241 | 				{
 242 | 					input_array[2][0] = input_memcpy_buffer2[input_mmcpy_offset[2]];
 243 | 					input_array[2][1] = input_memcpy_buffer2[input_mmcpy_offset[2]] >> 16;
 244 | 					input_mmcpy_offset[2]++;
 245 | 					NextInputFlag[2] = false;
 246 | 				}
 247 | 
 248 | 				if(NextInputFlag[3])
 249 | 				{
 250 | 					input_array[3][0] = input_memcpy_buffer3[input_mmcpy_offset[3]];
 251 | 					input_array[3][1] = input_memcpy_buffer3[input_mmcpy_offset[3]] >> 16;
 252 | 					input_mmcpy_offset[3]++;
 253 | 					NextInputFlag[3] = false;
 254 | 				}
 255 | 
 256 | 				input_buffer[0][t2r][t3] = input_array[0][cnt[0]];
 257 | 				input_buffer[1][t2r][t3] = input_array[1][cnt[1]];
 258 | 				input_buffer[2][t2r][t3] = input_array[2][cnt[2]];
 259 | 				input_buffer[3][t2r][t3] = input_array[3][cnt[3]];
 260 | 
 261 | 				if(cnt[0]==1)
 262 | 				{
 263 | 					NextInputFlag[0] = true;
 264 | 					cnt[0] = 0;
 265 | 				}else
 266 | 				{
 267 | 					cnt[0] = 1;
 268 | 				}
 269 | 
 270 | 				if(cnt[1]==1)
 271 | 				{
 272 | 					NextInputFlag[1] = true;
 273 | 					cnt[1] = 0;
 274 | 				}else
 275 | 				{
 276 | 					cnt[1] = 1;
 277 | 				}
 278 | 
 279 | 				if(cnt[2]==1)
 280 | 				{
 281 | 					NextInputFlag[2] = true;
 282 | 					cnt[2] = 0;
 283 | 				}else
 284 | 				{
 285 | 					cnt[2] = 1;
 286 | 				}
 287 | 
 288 | 				if(cnt[3]==1)
 289 | 				{
 290 | 					NextInputFlag[3] = true;
 291 | 					cnt[3] = 0;
 292 | 				}else
 293 | 				{
 294 | 					cnt[3] = 1;
 295 | 				}
 296 | 			}else
 297 | 			{
 298 | 				input_buffer[0][t2r][t3] = pad_value;
 299 | 				input_buffer[1][t2r][t3] = pad_value;
 300 | 				input_buffer[2][t2r][t3] = pad_value;
 301 | 				input_buffer[3][t2r][t3] = pad_value;
 302 | 			}
 303 | 
 304 | 		}
 305 | 
 306 | 		t2_local += T2R;
 307 | }
 308 | 
 309 | 
 310 | //到底影响不影响啊？喵喵喵？
 311 | /*
 312 | mmcpy_inputpixel_m2b_comb pingpong了这个函数
 313 | */
 314 | void input_load(int *input,int *input1,int *input2,int *input3,
 315 | 				short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int r,int c,int n,int Kernel_stride,int Padding,UCHAR TRow,UCHAR TCol,int Input_w,int Input_h,int TN_MIN,int IHxIW,int LayerType,ap_uint<6> trow_loops)
 316 | {
 317 | 	static int input_memcpy_buffer0[(OnChipIB_Width+3)/2];
 318 | 	static int input_memcpy_buffer1[(OnChipIB_Width+3)/2];
 319 | 	static int input_memcpy_buffer2[(OnChipIB_Width+3)/2];
 320 | 	static int input_memcpy_buffer3[(OnChipIB_Width+3)/2];
 321 | 	ap_uint<1> RowBeginByte[Tn];
 322 | #pragma HLS ARRAY_PARTITION variable=RowBeginByte complete dim=1//0 ro 1
 323 | 
 324 | 	static int input_memcpy_buffer02[(OnChipIB_Width+3)/2];
 325 | 	static int input_memcpy_buffer12[(OnChipIB_Width+3)/2];
 326 | 	static int input_memcpy_buffer22[(OnChipIB_Width+3)/2];
 327 | 	static int input_memcpy_buffer32[(OnChipIB_Width+3)/2];
 328 | 	ap_uint<1> RowBeginByte2[Tn];//0 ro 1
 329 | #pragma HLS ARRAY_PARTITION variable=RowBeginByte2 complete dim=1//0 ro 1
 330 | 
 331 | 	ap_uint<1> RowSub,ColSub;
 332 | 
 333 | 	ap_uint<6> t2;
 334 | 
 335 | 	ap_uint<9> r_9b = r;
 336 | //	assert(r < 512);
 337 | 	ap_uint<9> c_9b = c;
 338 | //	assert(c < 512);
 339 | //	assert(n < 2048);
 340 | 	ap_uint<11> n_11b = n;
 341 | //	assert(Kernel_stride < 4);
 342 | 	ap_uint<2> Kernel_stride_2b = Kernel_stride;
 343 | //	assert(Padding < 2);
 344 | 	ap_uint<1> Padding_1b = Padding;
 345 | //	assert(Input_w < 512);
 346 | 	ap_uint<9> Input_w_9b = Input_w;
 347 | 	ap_uint<10> Input_h_10b = Input_h;
 348 | //	assert(Input_h < 1024);
 349 | //	assert(TN_MIN < 8);//xx8
 350 | 	ap_uint<3> TN_MIN_3b = TN_MIN;
 351 | 	ap_uint<18> IHxIW_18b = IHxIW;
 352 | //	assert(IHxIW < 512*512);
 353 | 
 354 | 	ap_int<12> Coffset;
 355 | 	ap_int<12> Roffset;
 356 | 	//   Coffset = c_9b*Kernel_stride_2b - Padding_1b;
 357 | 	//   Roffset = r_9b*Kernel_stride_2b - Padding_1b;
 358 | 	if(LayerType == 2){
 359 | 	    Coffset = c_9b;
 360 | 	    Roffset = r_9b;
 361 | 	} else {
 362 | 	   Coffset = c_9b*Kernel_stride_2b - Padding_1b;
 363 | 	   Roffset = r_9b*Kernel_stride_2b - Padding_1b;
 364 | 	}
 365 | 
 366 | 	ap_uint<12> TCol_right,TRow_bottom;
 367 | 	ap_uint<10> TRow_top,TCol_left;
 368 | 	ap_uint<6> row_len,col_len;
 369 | 
 370 | 	if(Coffset > 0)
 371 | 		TCol_left = Coffset;
 372 | 	else
 373 | 		TCol_left = 0;
 374 | 
 375 | 	if((Coffset + TCol-1)<Input_w_9b)
 376 | 		TCol_right = Coffset + TCol;
 377 | 	else
 378 | 		TCol_right = Input_w_9b;
 379 | 
 380 | 	col_len = TCol_right - TCol_left;
 381 | 
 382 | 	if(Roffset > 0)
 383 | 		TRow_top = Roffset;
 384 | 	else
 385 | 		TRow_top = 0;
 386 | 
 387 | 	if((Roffset + TRow-1)<Input_h_10b)
 388 | 		TRow_bottom = Roffset + TRow;
 389 | 	else
 390 | 		TRow_bottom = Input_h_10b;
 391 | 
 392 | 	row_len = TRow_bottom - TRow_top;
 393 | 
 394 | 	int IN_OFFSET = n_11b*IHxIW_18b + TRow_top*Input_w_9b +TCol_left;
 395 | 
 396 | 	ap_uint<9> RowIncreaseLength;
 397 | 	ap_uint<6> ColIncreaseLength;
 398 | 	ap_uint<3> T2Rate;
 399 | 	switch(Input_w_9b)
 400 | 	{
 401 | 		case Tr:
 402 | 			RowIncreaseLength = 2*Tr;
 403 | 			ColIncreaseLength = 2*Tr;
 404 | 			T2Rate = 2;
 405 | 			break;
 406 | 		case SIZE:
 407 | 			RowIncreaseLength = 4*SIZE;
 408 | 			ColIncreaseLength = 4*SIZE;
 409 | 			T2Rate = 4;
 410 | 			break;
 411 | 		default:
 412 | 			RowIncreaseLength = Input_w_9b;
 413 | 			ColIncreaseLength = col_len;
 414 | 			T2Rate = 1;
 415 | 			break;
 416 | 	}
 417 | 
 418 | 	//assert(ColNum < 64*64);
 419 | 	//assert(RowNum < 64);
 420 | 	RowSub = TRow_top - Roffset;
 421 | 	ColSub = TCol_left - Coffset;
 422 | 
 423 | 	bool pingpong = 1;
 424 | 	ap_uint<6> next_t2[1];
 425 | 	bool next_IsRowPixel[1];
 426 | 	ap_uint<6> next_t22[1];
 427 | 	bool next_IsRowPixel2[1];
 428 | 
 429 | //	ap_uint<6> trow_loops = (int)ceil(((float)TRow/T2Rate));
 430 | 	ap_uint<6> TMP_t2;
 431 | 	for(TMP_t2 = 0,t2 = 0;TMP_t2 < trow_loops + 1; t2 += T2Rate,TMP_t2++)
 432 | 	{
 433 | 		bool IsRowPixel = (t2 >= RowSub)&&(t2 < (row_len + RowSub));
 434 | 
 435 | 		if(pingpong == 1)
 436 | 		{
 437 | 			mmcpy_inputpixel_m2b_comb(input,input1,input2,input3,
 438 | 							   input_memcpy_buffer0, input_memcpy_buffer1,
 439 | 							   input_memcpy_buffer2, input_memcpy_buffer3,
 440 | 							   RowBeginByte, TN_MIN_3b, t2, RowSub, IN_OFFSET, RowIncreaseLength, IHxIW_18b, ColIncreaseLength, next_t2,next_IsRowPixel,IsRowPixel,TMP_t2!=trow_loops);
 441 | 
 442 | 			copy_input2buf_row( input_buffer, row_len, col_len, RowSub, ColSub,
 443 | 						 input_memcpy_buffer02, input_memcpy_buffer12,input_memcpy_buffer22, input_memcpy_buffer32,
 444 | 						RowBeginByte2, TRow, TCol,LayerType,next_t22,next_IsRowPixel2,TMP_t2!=0,T2Rate);
 445 | 			pingpong = 0;
 446 | 		}else
 447 | 		{
 448 | 			mmcpy_inputpixel_m2b_comb(input,input1,input2,input3,
 449 | 							   input_memcpy_buffer02, input_memcpy_buffer12,
 450 | 							   input_memcpy_buffer22, input_memcpy_buffer32,
 451 | 							   RowBeginByte2, TN_MIN_3b, t2, RowSub, IN_OFFSET, RowIncreaseLength, IHxIW_18b, ColIncreaseLength, next_t22,next_IsRowPixel2,IsRowPixel,TMP_t2!=trow_loops);
 452 | 
 453 | 			copy_input2buf_row( input_buffer, row_len, col_len, RowSub, ColSub,
 454 | 						 input_memcpy_buffer0, input_memcpy_buffer1,input_memcpy_buffer2, input_memcpy_buffer3,
 455 | 						RowBeginByte, TRow, TCol,LayerType,next_t2,next_IsRowPixel,TMP_t2!=0,T2Rate);
 456 | 			pingpong = 1;
 457 | 		}
 458 | 	}
 459 | 
 460 | //	assert(TRow_top < 1024);
 461 | //	assert(TCol_left < 1024);
 462 | 
 463 | }
 464 | 
 465 | void weight_mmcpy_everyKxK(int *Weight,int weight_memcpy_buffer[Tm*Tn/2],ap_uint<3> t3,ap_uint<3> t4,ap_uint<3> next_t3[1],ap_uint<3> next_t4[1],unsigned int ReadLength,bool init_enable,bool enable)
 466 | {
 467 | 	if(!enable)
 468 | 		return;
 469 | 
 470 | 	static int Woffset;
 471 | 	next_t3[0] = t3;
 472 | 	next_t4[0] = t4;
 473 | 
 474 | 	if(init_enable)
 475 | 	{
 476 | 		Woffset = 0;
 477 | 	}
 478 | 
 479 | 	memcpy(weight_memcpy_buffer,(int *)(Weight + Woffset),ReadLength*sizeof(int));
 480 | 	Woffset += ReadLength;
 481 | }
 482 | 
 483 | void load_weight2buf_everyKxK(int weight_memcpy_buffer[Tm*Tn/2],short weight_buffer[Tm][Tn][K][K],ap_uint<3> t3,ap_uint<3> t4,ap_uint<6> TM_MIN,ap_uint<3> TN_MIN,bool enable)
 484 | {
 485 | 
 486 | 	if(!enable)
 487 | 		return;
 488 | 
 489 | 	ap_uint<6> t1;
 490 | 	ap_uint<3> t2;
 491 | 	ap_uint<8> weight_memcpy_offset = 0;
 492 | 	ap_uint<2> cnt = 0;
 493 | 	short input_array[2];
 494 | #pragma HLS ARRAY_PARTITION variable=input_array complete dim=1
 495 | 	short input_value;
 496 | 
 497 | 	for(t1 = 0;t1 < Tm; t1++)
 498 | 		for(t2 = 0;t2 < Tn; t2++)
 499 | 		{
 500 | #pragma HLS PIPELINE
 501 | 			bool Enable = (t1 < TM_MIN)&&(t2 < TN_MIN);
 502 | 			if(Enable)
 503 | 			{
 504 | 				if(cnt==0)
 505 | 				{
 506 | 					input_array[0] = weight_memcpy_buffer[weight_memcpy_offset];
 507 | 					input_array[1] = weight_memcpy_buffer[weight_memcpy_offset] >> 16;
 508 | 					weight_memcpy_offset++;
 509 | 				}
 510 | 				input_value = input_array[cnt];
 511 | 
 512 | 				cnt++;
 513 | 				if(cnt==2)
 514 | 					cnt = 0;
 515 | 			}
 516 | 			else
 517 | 				input_value = 0;
 518 | 
 519 | 			weight_buffer[t1][t2][t3][t4] =  input_value;
 520 | 		}
 521 | }
 522 | 
 523 | void weight_load_reorg(int *Weight,short weight_buffer[Tm][Tn][K][K],bool weight_load_enable,int m,int n,int IFM_numxKxK,int KxK,int Kernel_size,int TM_MIN,int TN_MIN)
 524 | {
 525 | 	/*int t1,t2,t3,t4;*/
 526 | 	static int weight_memcpy_buffer[Tm*Tn/2];
 527 | 	static int weight_memcpy_buffer1[Tm*Tn/2];
 528 | 
 529 | 	if(!weight_load_enable)
 530 | 		return;
 531 | 
 532 | //	assert(m < 1024);
 533 | //	assert(n < 2048);//gg2048
 534 | //	assert(IFM_numxKxK < 1024*16);
 535 | //	assert(Kernel_size < 4);
 536 | //	assert(TM_MIN < 64);
 537 | //	assert(TN_MIN < 8);//xx8
 538 | 
 539 | 	ap_uint<2> Kernel_size_2b = Kernel_size;
 540 | 	ap_uint<6> TM_MIN_6b = TM_MIN;
 541 | 	ap_uint<3> TN_MIN_3b = TN_MIN;
 542 | 
 543 | 	ap_uint<10> m_10b = m;
 544 | 	ap_uint<11> n_11b = n;
 545 | 
 546 | 	bool Me0aNe0 = (m_10b==0)&&(n_11b==0);
 547 | 	unsigned int ReadLength = (TM_MIN_6b*TN_MIN_3b)>>1;
 548 | 
 549 | //	if((TM_MIN*TN_MIN)%2)
 550 | //		printf("weight % error\n");
 551 | 
 552 | 	ap_uint<3> t3,t4;
 553 | 	ap_uint<3> next_t3[1];
 554 | 	ap_uint<3> next_t4[1];
 555 | 	ap_uint<3> next_t31[1];
 556 | 	ap_uint<3> next_t41[1];
 557 | 
 558 | 	bool pingpong = true;
 559 | 
 560 | 	for(t3 = 0;t3 < Kernel_size_2b;t3++)
 561 | 		for(t4 = 0;t4 < Kernel_size_2b + 1;t4++)
 562 | 		{
 563 | 			if(pingpong)
 564 | 			{
 565 | 				weight_mmcpy_everyKxK(Weight, weight_memcpy_buffer, t3, t4,next_t3,next_t4, ReadLength,Me0aNe0&&(t3==0)&&(t4==0),t4!=Kernel_size_2b);
 566 | 				load_weight2buf_everyKxK(weight_memcpy_buffer1, weight_buffer, next_t31[0], next_t41[0], TM_MIN, TN_MIN,t4!=0);
 567 | 				pingpong = false;
 568 | 			}else
 569 | 			{
 570 | 				weight_mmcpy_everyKxK(Weight, weight_memcpy_buffer1, t3, t4,next_t31,next_t41, ReadLength,Me0aNe0&&(t3==0)&&(t4==0),t4!=Kernel_size_2b);
 571 | 				load_weight2buf_everyKxK(weight_memcpy_buffer, weight_buffer, next_t3[0], next_t4[0], TM_MIN, TN_MIN,t4!=0);
 572 | 				pingpong = true;
 573 | 			}
 574 | 		}
 575 | }
 576 | 
 577 | 
 578 | void copy_input_weight(int *input,int *input1,int *input2,int *input3,int *Weight,int InFM_num,int Input_w,int Input_h,int Kernel_size,int Kernel_stride,int r,int c,int m,int n,
 579 | 		int TM_MIN,int TN,UCHAR TRow,UCHAR TCol,int Padding,short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],short weight_buffer[Tm][Tn][K][K],int TMP_N_next[1],
 580 | 		bool enable,bool weight_load_enable,bool initialize,const int IHxIW,const int KxK,const int IFM_numxKxK,const int LayerType,ap_uint<6> trow_loops)
 581 | {
 582 | 	if(!enable)
 583 | 		return ;
 584 | 
 585 | 	const int TN_MIN = MIN(TN,InFM_num - n);
 586 | 	TMP_N_next[0] = n;
 587 | 
 588 | 	input_load(input,input1,input2,input3, input_buffer, r, c, n, Kernel_stride, Padding, TRow, TCol, Input_w, Input_h, TN_MIN, IHxIW, LayerType,trow_loops);
 589 | 	weight_load_reorg(Weight,weight_buffer,weight_load_enable,m,n,IFM_numxKxK,KxK,Kernel_size,TM_MIN,TN_MIN);
 590 | 
 591 | }
 592 | 
 593 | //////////////////////////////////////////////////T3 end
 594 | 
 595 | void copy_local_beta(short beta_buffer[MAX_BETA_LENGTH],int local_beta_buffer[MAX_BETA_LENGTH],const int TM_MIN,int m,UCHAR InterSubBeta)
 596 | {
 597 | 	ap_uint<4> InterSubBeta_4b = InterSubBeta;
 598 | 	int offset;
 599 | 	int tm;
 600 | 	for(tm = 0,offset = m;tm < TM_MIN;tm++)
 601 | 	{
 602 | #pragma HLS PIPELINE
 603 | 		local_beta_buffer[tm] = beta_buffer[offset] << InterSubBeta_4b;
 604 | 		offset++;
 605 | 	}
 606 | }
 607 | 
 608 | ///#######################################################################################################################
 609 | //现在缺少位数对齐
 610 | /*
 611 | 输入大小是没有改动的所以不用管，但是输出是要改动的
 612 | */
 613 | //buffer1需要和buffer0对其
 614 | //In1Sub和buff1对应， In0Sub和buffer0对应
 615 | //In1Sub 是要处理的值
 616 | void shortcut(short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width],
 617 |         int output_buffer[Tm][Tr*PARA][Tc*PARA],const int TM_MIN,const int TR_MIN,
 618 |         const int TC_MIN,UCHAR In0Sub,UCHAR In1Sub,bool enable){
 619 | //对于shortcut来说，只有其中的一个需要和输出对其，那么问题来了，究竟是左移还是右移？
 620 |     ap_uint<5> tr,tc,tm;
 621 |     if(!enable){
 622 | 		return;
 623 |     }
 624 | 	
 625 | 	ap_uint<5> TM_MIN_5b = TM_MIN;
 626 | 	ap_uint<5> TR_MIN_5b = TR_MIN;
 627 | 	ap_uint<5> TC_MIN_5b = TC_MIN;
 628 | 	
 629 | 	ap_uint<4> In0Sub_4b = In0Sub;
 630 | 	ap_uint<4> In1Sub_4b = INTERWIDTH - In1Sub;
 631 | 	//由于不确定大小关系，所以不能直接减法求出差值然后计算，只能先同步到相同的位置，那就是 INTERWIDTH的位数，然后加法以后再处理回去
 632 | 
 633 |     for(tm = 0; tm < TM_MIN_5b;tm++){
 634 |         for(tr = 0; tr < TR_MIN_5b;tr++){
 635 |             for(tc = 0; tc < TC_MIN_5b; tc++){
 636 | #pragma HLS PIPELINE
 637 |                 int tempt_add0 = input_buffer0[tm][tr][tc] << In0Sub_4b;
 638 |                 int tempt_add1 = input_buffer1[tm][tr][tc] << In1Sub_4b;
 639 | 				//用32位容纳一个＜20位的数值，不会有精度损失，然后做加法
 640 | 				int tempt_out = (tempt_add0 + tempt_add1) >> In0Sub_4b;
 641 | 				//output_buffer[tm][tr][tc] = input_buffer0[tm][tr][tc] + input_buffer1[tm][tr][tc];
 642 | 				output_buffer[tm][tr][tc] = tempt_out;
 643 | 				//int 可以给short吗？
 644 | 				//所以数值上是int
 645 | 				//那么现在就存在一个问题了，究竟是需不需要右移呢？
 646 | 				//先试一下需要右移的吧
 647 |             }
 648 |         }
 649 |     }
 650 | 
 651 | }
 652 | ///#######################################################################################################################
 653 | void upsample(short input_bufferInput[Tn][OnChipIB_Height][OnChipIB_Width],int output_buffer[Tm*PARA][Tr*PARA][Tc*PARA],
 654 |         const int upsample_size,const int TM_MIN,const int TR_MIN,
 655 |         const int TC_MIN,bool enable){
 656 |     ap_uint<5> tr,tc,tm,i,j;
 657 |     if(!enable){
 658 | 		return;
 659 |     }
 660 | 	
 661 | 	ap_uint<2> upsample_size_2b = upsample_size;
 662 | 
 663 | 	ap_uint<5> TM_MIN_5b = TM_MIN;
 664 | 	ap_uint<5> TR_MIN_5b = TR_MIN;
 665 | 	ap_uint<5> TC_MIN_5b = TC_MIN;
 666 | 
 667 |     for(tm = 0; tm < TM_MIN_5b;tm++){
 668 |         for(tr = 0; tr < TR_MIN_5b;tr++){
 669 |             for(tc = 0; tc < TC_MIN_5b; tc++){
 670 | //#pragma HLS PIPELINE
 671 |                 for(i = 0; i < upsample_size_2b;i++){
 672 | #pragma HLS UNROLL
 673 |                     for(j = 0;j < upsample_size_2b;j++){
 674 | #pragma HLS UNROLL
 675 |                             output_buffer[tm][tr*upsample_size_2b+i][tc*upsample_size_2b+j] = input_bufferInput[tm][tr][tc];
 676 |                         }
 677 |                 }
 678 |             }
 679 |         }
 680 |     }
 681 | }
 682 | 
 683 | ///#######################################################################################################################
/*
 * compute: multiply-accumulate for one convolution tile.
 *
 * input_buffer   : input tile; rows/columns are addressed as
 *                  [tr*Kernel_stride+i][tc*Kernel_stride+j].
 * output_buffer  : 32-bit accumulators, read and updated in place.
 * weight_buffer  : weights indexed [output channel][input channel][i][j].
 * beta_buffer    : bias values; only forwarded to copy_local_beta() on the
 *                  disabled path.
 * TMP_N_next[0]  : read into `n`; when n==0 and the kernel tap is (0,0) the
 *                  accumulator is seeded from local_beta_buffer instead of
 *                  being read back from output_buffer.
 * Kernel_size / Kernel_stride : truncated to 2 bits.
 * TR_MIN / TC_MIN             : truncated to 5 bits; spatial loop bounds.
 * enable         : when false, only copy_local_beta() is called (filling the
 *                  static local_beta_buffer) and the function returns.
 * WeightAddInputSubInter      : right shift applied to every product.
 *
 * IsNL, reluenable and InterSubOutput are accepted but not used in this body.
 * The channel loop always runs over all Tm output channels (TM_MIN only
 * reaches copy_local_beta), and exactly four input channels (indices 0..3)
 * are multiplied per step.
 */
void compute(short input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int output_buffer[Tm][Tr*PARA][Tc*PARA],
		short weight_buffer[Tm][Tn][K][K],short beta_buffer[MAX_BETA_LENGTH],int TMP_N_next[1],
		const int Kernel_size,const int Kernel_stride,int TMP_M,
		const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable,const bool IsNL,const bool reluenable,
		UCHAR InterSubBeta,UCHAR WeightAddInputSubInter,UCHAR InterSubOutput)
{
// InterSubOutput is not used at all in this function.
// Each product is shifted right immediately after the multiply because the
// product of two fixed-point values carries more fractional bits than the
// accumulator format.
	// Bias values prepared by copy_local_beta(); static so they persist across calls.
	static int local_beta_buffer[Tm];
#pragma HLS ARRAY_PARTITION variable=local_beta_buffer complete dim=1

//	static int compute_buffer[Tm][Tr][Tc];
//#pragma HLS ARRAY_PARTITION variable=compute_buffer complete dim=1

	if(!enable)
	{
		// Disabled path: only refresh the local bias copy.
		copy_local_beta(beta_buffer,local_beta_buffer,TM_MIN,TMP_M,InterSubBeta);
		return;
	}

	// Scratch for the per-channel products; fully partitioned in both dimensions.
	int partial_mul[Tm][Tn];
#pragma HLS ARRAY_PARTITION variable=partial_mul complete dim=1
#pragma HLS ARRAY_PARTITION variable=partial_mul complete dim=2

	ap_uint<2> i,j;
	UCHAR tm,tn;
	ap_uint<5> tr,tc;
	ap_uint<2> Kernel_size_2b = Kernel_size;
	ap_uint<2> Kernel_stride_2b = Kernel_stride;

	ap_uint<5> TR_MIN_5b = TR_MIN;
	ap_uint<5> TC_MIN_5b = TC_MIN;

//	ap_uint<4> InterSubBeta_4b = InterSubBeta;
	ap_uint<4> WeightAddInputSubInter_4b = WeightAddInputSubInter;

//	assert(InterSubBeta < 16);
//	assert(WeightAddInputSubInter < 16);
//	assert(InterSubOutput < 16);

//	assert(Kernel_size < 4);
//	assert(TR_MIN < 32);
//	assert(TC_MIN < 32);

	// Input-channel offset of this pass, truncated to 11 bits.
	ap_uint<11> n = TMP_N_next[0];
//	assert(n < 2048);

	// Kernel taps are the outer loops so each pipelined iteration touches a
	// different output element.
	for(i = 0;i < Kernel_size_2b; i++)
		for(j = 0;j < Kernel_size_2b; j++)
			for(tr = 0;tr < TR_MIN_5b;tr++)
				for(tc = 0;tc < TC_MIN_5b;tc++)
				{
#pragma HLS PIPELINE
					for(tm = 0;tm < Tm;tm++)
					{
#pragma HLS DEPENDENCE variable=output_buffer inter false
						int tmp_add_result;
						// First tap of the first input-channel pass: start from the bias.
						if(i==0&&j==0&&n==0)
						{
							tmp_add_result = local_beta_buffer[tm];
						}
						else
							tmp_add_result = output_buffer[tm][tr][tc];

						// Product scale is Q(weight)+Q(input); the shift brings it to the accumulator scale.
						partial_mul[tm][0] = (weight_buffer[tm][0][i][j]*input_buffer[0][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3
						partial_mul[tm][1] = (weight_buffer[tm][1][i][j]*input_buffer[1][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3
						partial_mul[tm][2] = (weight_buffer[tm][2][i][j]*input_buffer[2][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3
						partial_mul[tm][3] = (weight_buffer[tm][3][i][j]*input_buffer[3][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3

						// Two-level adder tree over the four products.
						int tmp_add1 = partial_mul[tm][0] + partial_mul[tm][1];
						int tmp_add2 = partial_mul[tm][2] + partial_mul[tm][3];
						int tmp_add12 = tmp_add1 + tmp_add2;
						output_buffer[tm][tr][tc] = tmp_add_result + tmp_add12;

//						partial_mul[tm][0] = (weight_buffer[tm][0][i][j]*input_buffer[0][tr+i][tc+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3
//						partial_mul[tm][1] = (weight_buffer[tm][1][i][j]*input_buffer[1][tr+i][tc+j]) >> WeightAddInputSubInter_4b;//Q1+Q2-Q3
//
//						compute_buffer[tm][tr][tc] = tmp_add_result + partial_mul[tm][0] + partial_mul[tm][1];
					}
				}
}
 766 | 
 767 | //////////////version-0.2 start
 768 | void mmcpy_outputport(int *Output,int output_tmp[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop,int OutputOffset,int OutputLength)
 769 | {
 770 | 	bool enable = tm < mLoop;
 771 | 	if(!enable)
 772 | 		return;
 773 | 
 774 | 	memcpy((int *)(Output + OutputOffset),(int *)(output_tmp),OutputLength*sizeof(int));
 775 | }
 776 | 
 777 | void mmcpy_outputport1(int *Output,int output_tmp[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop,int OutputOffset,int OutputLength)
 778 | {
 779 | 	bool enable = tm < mLoop;
 780 | 	if(!enable)
 781 | 		return;
 782 | 
 783 | 	memcpy((int *)(Output + OutputOffset),(int *)(output_tmp),OutputLength*sizeof(int));
 784 | }
 785 | 
 786 | 
 787 | /*
 788 | 并行调用拷贝进端口的函数
 789 | */
 790 | void mmcpy_outputpixel(int *Output,int *Output1,int output_tmp[Tr*PARA*Tc*PARA/4],int output_tmp1[Tr*PARA*Tc*PARA/4],ap_uint<6> tm,ap_uint<6> mLoop1,ap_uint<6> mLoop2,int outputoffsetarray[2],int OutputLength,int OutputLength1,bool enable)
 791 | {
 792 | 	if(!enable)
 793 | 	{
 794 | 		return;
 795 | 	}
 796 | 	mmcpy_outputport(Output ,output_tmp ,tm,mLoop1,outputoffsetarray[0],OutputLength );
 797 | 	mmcpy_outputport1(Output1,output_tmp1,tm,mLoop2,outputoffsetarray[1],OutputLength1);
 798 | }
 799 | 
/*
 * outputpixel2buf: post-process accumulator values from output_buffer and
 * pack them into the two staging buffers that are later burst-written.
 *
 * Two channels are handled in lock-step: channel tm goes to output_tmp and
 * channel tm + Tm/2 goes to output_tmp1.
 *
 * Per element:
 *   1. if IsNL and the value is negative it is scaled by 0xccc/2^15
 *      (about 0.1, i.e. a leaky-ReLU slope);
 *   2. if LayerType==0 it is shifted right by InterSubOutput (4 bits used);
 *      for other layer types it is passed through unchanged;
 *   3. the result is narrowed to short, and every two consecutive shorts are
 *      packed into one 32-bit word (first in the low half, second in the
 *      high half).
 *
 * Traversal:
 *   TC_MINe26 == true  : one channel (mLoop) and one row (rLoop) per call.
 *   TC_MINe26 == false : two consecutive channels starting at the static
 *                        counter `tm`, SIZE rows each; `tm` keeps advancing
 *                        across calls and is reset to 0 when init is true.
 *
 * Outputs besides the staging buffers:
 *   tm_next[0]           = mLoop
 *   outputoffsetarray[0] = (OutputOffset1_sum  + OutputOffset2_sum) >> 1
 *   outputoffsetarray[1] = (OutputOffset1_sum1 + OutputOffset2_sum) >> 1
 *   (halved presumably because two 16-bit pixels share one int word - TODO confirm)
 *
 * TR_MIN is accepted but not used. When enable is false the function returns
 * immediately and writes nothing.
 */
void outputpixel2buf(int output_buffer[Tm][Tr*PARA][Tc*PARA],int output_tmp[Tr*PARA*Tc*PARA/4],int output_tmp1[Tr*PARA*Tc*PARA/4],bool IsNL,int InterSubOutput,int LayerType,bool TC_MINe26,int TR_MIN,int TC_MIN,int mLoop,int rLoop, bool init,
					 int outputoffsetarray[2],int OutputOffset1_sum,int OutputOffset1_sum1,int OutputOffset2_sum,ap_uint<6> tm_next[1],bool enable)
{
	if(!enable)
	{
		return;
	}

	// Hand the channel index of this burst to the copy stage.
	tm_next[0] =  mLoop;

	ap_uint<4> InterSubOutput_4b = InterSubOutput;
	int tmp_output;
	int tmp_output_1;
	short tmp_output2;
	short tmp_output2_1;
	int tmp_output3;
	int tmp_output3_1;
	// Counts shorts collected so far for the current 32-bit word (0 or 1).
	ap_uint<2> cnt = 0;
	short ouput_array[2];
#pragma HLS ARRAY_PARTITION variable=ouput_array complete dim=1
	short ouput_array1[2];
#pragma HLS ARRAY_PARTITION variable=ouput_array1 complete dim=1
	ap_uint<5> tr;
	// Channel cursor; static so it carries over between calls on the non-26 path.
	static ap_uint<6> tm;

	ap_uint<5> TC_MIN_5b = TC_MIN;
	ap_uint<5> tc;
	ap_uint<2> TM_LOOP,tm_count;
	ap_uint<4> TR_LOOP,tr_count;
	
	
	//const int para = (LayerType==2) ? 1:1;
	if(init)
	{
		tm = 0;
	}

	if(TC_MINe26)
	{
		// Row-at-a-time mode: exactly one (channel,row) pair per call.
		tm = mLoop;
		tr = rLoop;
		TM_LOOP = 1;
		TR_LOOP = 1;
	}else
	{
		// Whole-plane mode: two channels of SIZE rows per call.
		tr = 0;
		TM_LOOP = 2;
		TR_LOOP = SIZE;
	}

	// Write cursors into the two staging buffers (8-bit counters).
	ap_uint<8> outputoffset = 0;
	ap_uint<8> outputoffset1 = 0;
	
	for(tm_count = 0;tm_count < TM_LOOP;tm_count++,tm++,tr = 0)
		for(tr_count = 0;tr_count < TR_LOOP;tr_count++,tr++)
			for(tc = 0;tc < TC_MIN_5b;tc++)
			{
#pragma HLS PIPELINE
				int tmp = output_buffer[tm][tr][tc];
				int tmp1 = output_buffer[tm + Tm/2][tr][tc];
				// Leaky activation on negative values: multiply by 0xccc, then >> 15.
				if(IsNL&&tmp<0)
				{
					tmp_output = ((long long)tmp*0xccc)>>15;
				}else
				{
					tmp_output = tmp;
				}

				if(IsNL&&tmp1<0)
				{
					tmp_output_1 = ((long long)tmp1*0xccc)>>15;
				}else
				{
					tmp_output_1 = tmp1;
				}

				// Only LayerType 0 rescales from the accumulator format to the output format.
				if(LayerType==0)
				{
					tmp_output2 = tmp_output >> InterSubOutput_4b;
					tmp_output2_1 = tmp_output_1 >> InterSubOutput_4b;
				}
				else
				{
					tmp_output2 = tmp_output;
					tmp_output2_1 = tmp_output_1;
				}
				ouput_array[cnt] = tmp_output2;
				ouput_array1[cnt] = tmp_output2_1;
				cnt++;
				// cnt is not reset at row boundaries, so a pair may span two rows.
				if(cnt==2)
				{
					tmp_output3 = (ouput_array[0]       &0x0000FFFF) |
							((ouput_array[1] << 16 )&0xFFFF0000);
					tmp_output3_1 = (ouput_array1[0]       &0x0000FFFF) |
							((ouput_array1[1] << 16 )&0xFFFF0000);

					output_tmp[outputoffset] = tmp_output3;
					outputoffset++;

					output_tmp1[outputoffset1] = tmp_output3_1;
					outputoffset1++;
					cnt = 0;
				}
			}

	outputoffsetarray[0] = (OutputOffset1_sum  + OutputOffset2_sum)>>1;
	outputoffsetarray[1] = (OutputOffset1_sum1 + OutputOffset2_sum)>>1;

}
 912 | 
 913 | 
/*
 * write_back_output_reorg: write one finished output tile back to external
 * memory through two ports (Output and Output1).
 *
 * output_buffer      : accumulator tile to write.
 * r, c, m            : row / column / channel origin of the tile in the output feature map.
 * Output_w, OHxOW    : output width and plane size used for address computation.
 * TM_MIN/TR_MIN/TC_MIN : tile extents (TR_MIN and TC_MIN are asserted < 32).
 * write_flag         : when false the function returns immediately.
 * IsNL, InterSubOutput, LayerType : forwarded to outputpixel2buf().
 * Output_h and OutputQ are accepted but not used in this body.
 *
 * The channels of the tile are split in two halves: up to Tm/2 channels go
 * out through Output, the remainder through Output1 (starting mLoop1 planes
 * further on).
 *
 * The main loop is software-pipelined over two pairs of staging buffers:
 * in every iteration outputpixel2buf() fills one pair while
 * mmcpy_outputpixel() drains the pair filled in the previous iteration.
 * That is why the inner loop runs rLoop+1 times, with the fill disabled on
 * the last pass and the drain disabled on the first.
 *
 * NOTE: the original author remarked that PARA handling is not finished yet.
 */
void write_back_output_reorg(int output_buffer[Tm][Tr*PARA][Tc*PARA],int *Output,int *Output1,int r,int c,int m,const int Output_w,const int Output_h,
					   const int TM_MIN,const int TR_MIN,const int TC_MIN,const int OHxOW,bool write_flag,const int OutputQ,bool IsNL,int InterSubOutput,int LayerType)
{
	// Ping buffers (port 0 / port 1).
	static int output_tmp00[Tr*PARA*Tc*PARA/4];
	static int output_tmp01[Tr*PARA*Tc*PARA/4];

	// Pong buffers (port 0 / port 1).
	static int output_tmp10[Tr*PARA*Tc*PARA/4];
	static int output_tmp11[Tr*PARA*Tc*PARA/4];
	
	//const int para = (LayerType==2) ? 2:1;

	int tr,tm,tc;
	int OutputLength,OutputLength1;
	int mLoopc,mLoop,rLoop;
	ap_uint<6> mLoop1,mLoop2;

	if(!write_flag)
		return;

//	assert(TM_MIN < 64);
	assert(TR_MIN < 32);
	assert(TC_MIN < 32);

	ap_uint<6> TM_MIN_6b = TM_MIN;
	ap_uint<18> OHxOW_18b = OHxOW;
	ap_uint<9> Output_w_9b = Output_w;
	ap_uint<10> m_10b = m;
	ap_uint<9> r_9b = r;
	ap_uint<9> c_9b = c;

//	assert(m < 1024);
//	assert(r < 512);
//	assert(c < 512);
//	assert(OHxOW < 512*512);
//	assert(Output_w < 512);

	// A tile of 9 channels is treated as 12.
	// NOTE(review): reason not visible here - presumably rounding up to an even split; confirm.
	ap_uint<6> TM_MIN_g;
	if(TM_MIN_6b==9)
		TM_MIN_g = 12;
	else
		TM_MIN_g = TM_MIN_6b;

//	const int offset = m_10b*OHxOW_18b + r_9b*Output_w_9b + c_9b;
	// Element offset of the tile origin; LayerType 2 doubles the row and column terms.
	int tempoff;
	if(LayerType == 2){
	    tempoff = m*OHxOW_18b + r*Output_w_9b*2 + c_9b*2;
	} else {
        tempoff = m*OHxOW_18b + r*Output_w_9b + c_9b;
	}
	//const int offset = m_10b*OHxOW_18b + r_9b*Output_w_9b + c_9b;
	const int offset = tempoff;


	bool TM_MINaboveTmdiv2 = TM_MIN_g > Tm/2;
	// True when the tile is exactly Tr columns wide (row-at-a-time bursts).
	bool TC_MINe26 = TC_MIN == Tr;

	// Split the channels between the two ports.
	if(TM_MINaboveTmdiv2)
	{
		mLoop = Tm/2;
		mLoop1 = Tm/2;
		mLoop2 = TM_MIN_g - Tm/2;
	}else
	{
		mLoop = TM_MIN_g;
		mLoop1 = TM_MIN_g;
		mLoop2 = 0;
	}
	mLoopc = mLoop;

	// Origin of the second port's data: mLoop1 planes after the first.
	int offset1 = offset + mLoop1*OHxOW_18b;

	int OutputOffset1,OutputOffset2;
	int OutputOffset1_sum,OutputOffset1_sum1;
	int OutputOffset2_sum;

	// when TC_MIN==26,burstlength = 13*2/2=13,else 13*13*2/2=169
	if(TC_MINe26)
	{
		// One row per burst; advance by a plane per channel and a row per burst.
		OutputLength = Tr/2;
		OutputLength1 = Tr/2;
		OutputOffset1 = OHxOW_18b;
		OutputOffset2 = Output_w_9b;
		rLoop = 26;
	}else//TMxTRxTC TMx13x13 continues
	{
		// Two whole SIZE x SIZE planes per burst, so half as many channel steps.
		OutputLength = SIZE*SIZE;
		OutputLength1 = SIZE*SIZE;
		rLoop = 1;
		mLoop = mLoop/2;
		OutputOffset1 = SIZE*SIZE*2;
		OutputOffset2 = 0;
	}

	bool pingpong = true;
	int outputoffsetarray[2];
#pragma HLS ARRAY_PARTITION variable=outputoffsetarray complete dim=1
	int outputoffsetarray1[2];
#pragma HLS ARRAY_PARTITION variable=outputoffsetarray1 complete dim=1
	ap_uint<6> tm_next[1];
	ap_uint<6> tm_next1[1];
	// True only for the very first pass; used as outputpixel2buf's init flag.
	bool wb_start_flag = true;
	for(tm = 0,OutputOffset1_sum = offset,OutputOffset1_sum1 = offset1;tm < mLoop;tm++,OutputOffset1_sum += OutputOffset1,OutputOffset1_sum1 += OutputOffset1)
		for(tr = 0,OutputOffset2_sum = 0;tr < rLoop + 1;tr++,OutputOffset2_sum += OutputOffset2,wb_start_flag = false)
		{
			if(pingpong)
			{
				// Fill the ping pair, drain the pong pair filled on the previous pass.
				outputpixel2buf( output_buffer, output_tmp00, output_tmp01, IsNL, InterSubOutput, LayerType, TC_MINe26, TR_MIN, TC_MIN, tm, tr,wb_start_flag,
					  outputoffsetarray, OutputOffset1_sum, OutputOffset1_sum1, OutputOffset2_sum,tm_next,tr != rLoop);
				mmcpy_outputpixel(Output,Output1, output_tmp10, output_tmp11, tm_next1[0], mLoop1, mLoop2, outputoffsetarray1, OutputLength, OutputLength1,tr != 0);
				pingpong = false;
			}else
			{
				// Fill the pong pair, drain the ping pair.
				outputpixel2buf( output_buffer, output_tmp10, output_tmp11, IsNL, InterSubOutput, LayerType, TC_MINe26, TR_MIN, TC_MIN, tm, tr,wb_start_flag,
					  outputoffsetarray1, OutputOffset1_sum, OutputOffset1_sum1, OutputOffset2_sum,tm_next1,tr != rLoop);
				mmcpy_outputpixel(Output,Output1, output_tmp00, output_tmp01, tm_next[0], mLoop1, mLoop2, outputoffsetarray, OutputLength, OutputLength1,tr != 0);
				pingpong = true;
			}
		}

}
1035 | /*
1036 | void pool_yolo2(short Input[Tn][OnChipIB_Height][OnChipIB_Width],int Output[Tm][Tr][Tc],
1037 | 		  const int Kernel_size,const int Kernel_stride,
1038 | 		  const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable)
1039 | {
1040 | 
1041 | 	if(!enable)
1042 | 		return;
1043 | 
1044 | 	ap_uint<5> TR_MIN_5b = TR_MIN;
1045 | 	ap_uint<5> TC_MIN_5b = TC_MIN;
1046 | 	ap_uint<2> Kernel_stride_2b = Kernel_stride;
1047 | 
1048 | //	assert(TR_MIN < 32);
1049 | //	assert(TC_MIN < 32);
1050 | //	assert(Kernel_stride < 4);
1051 | 
1052 | 	ap_uint<2> i,j;
1053 | 	ap_uint<5> tr,tc;
1054 | //	ap_uint<8> i,j,tr,tc;
1055 | 	int of;
1056 | 	short tmp[Tn];
1057 | #pragma HLS ARRAY_PARTITION variable=tmp complete dim=1
1058 | 	short input_short[Tn];
1059 | #pragma HLS ARRAY_PARTITION variable=input_short complete dim=1
1060 | 
1061 | 	for(tr = 0;tr < TR_MIN_5b;tr++)
1062 | 		for(tc = 0;tc < TC_MIN_5b;tc++)
1063 | 			for(i =0;i < 2; i++)
1064 | 				for(j = 0;j < 2; j++)
1065 | 				{
1066 | #pragma HLS PIPELINE
1067 | 					for( of = 0; of < Tn; of++)
1068 | 					{
1069 | 						if(i==0&&j==0)
1070 | 							tmp[of] = 0x8001;
1071 | 						input_short[of] = Input[of][tr*Kernel_stride_2b+i][tc*Kernel_stride_2b+j];
1072 | 						if(input_short[of] > tmp[of])
1073 | 							tmp[of] = input_short[of];
1074 | 
1075 | 						if(i==1&&j==1)
1076 | 							Output[of][tr][tc] = tmp[of];
1077 | 					}
1078 | 				}
1079 | }
1080 | 
1081 | void reorg_yolo2(short Input[Tn][OnChipIB_Height][OnChipIB_Width],int Output[Tm][Tr][Tc],
1082 | 		  const int Kernel_size,const int Kernel_stride,
1083 | 		  const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable)
1084 | {
1085 | 	int x, y,kx,ky;
1086 | 	unsigned char Yoffset;
1087 | 	unsigned char Xoffset;
1088 | 
1089 | 	if(!enable)
1090 | 		return;
1091 | 
1092 | //	ap_uint<5> TR_MIN_5b = TR_MIN;
1093 | //	ap_uint<5> TC_MIN_5b = TC_MIN;
1094 | 
1095 | 	assert(TR_MIN < 32);
1096 | 	assert(TC_MIN < 32);
1097 | 
1098 |     for( y = 0; y < TR_MIN; y++)
1099 |     	for( x = 0; x < TC_MIN; x++)
1100 | 			for(ky= 0;ky < 2; ky++)
1101 |     			for(kx = 0;kx < 2; kx++)
1102 | 				{
1103 | #pragma HLS PIPELINE
1104 | 						Yoffset = (y << 1) + ky;
1105 | 						Xoffset = (x << 1) + kx;
1106 | 
1107 | 						int in_index  = (ky << 1) + kx;
1108 | 						Output[in_index][y][x] = Input[0][Yoffset][Xoffset];
1109 |     			}
1110 | }
1111 | */
/*
 * intra_pingpong_wrapper: load-and-process stage for one output-channel tile,
 * dispatching on LayerType.
 *
 * LayerType 0 (convolution):
 *   Returns immediately when input_flag is false. Otherwise publishes
 *   TMP_M / TM_MIN through TMP_X_next / TX_MIN_next and iterates n over
 *   0..nLoops (inclusive). Each pass loads inputs and weights into one
 *   buffer set via copy_input_weight() while compute() consumes the other
 *   set; the load is disabled on the last pass and the compute on the first.
 *
 * LayerType 1 (shortcut):
 *   TMP_X_next / TX_MIN_next receive the values remembered from the previous
 *   call (static tmp_x / tmp_tx_min), then the current TMP_M / TM_MIN are
 *   stored. Two operand tiles are loaded - one from Input/Input1, one from
 *   Input2/Input3 - into the buffer pair selected by pingpongx, and
 *   shortcut() runs on the opposite pair. shortcut() receives InterSubOutput
 *   as In0Sub and WeightAddInputSubInter as In1Sub. The caller is expected to
 *   set kernel size and stride to 1 for this layer type.
 *
 * LayerType 2 (upsample):
 *   Same delayed bookkeeping; one tile is loaded and upsample() runs on the
 *   opposite buffer with Kernel_stride as the upsampling factor. No
 *   fixed-point alignment is applied.
 *
 * input_flag gates the loads and process_flag gates shortcut()/upsample().
 *
 * NOTE(review): shortcut()/upsample() are called with the current TM_MIN,
 * while the commented-out pool/reorg calls they replaced used
 * TX_MIN_next[0] (the extent of the tile that was loaded on the previous
 * call). Confirm against the caller that the current TM_MIN is intended.
 */
void intra_pingpong_wrapper(int *Input,int *Input1,int *Input2,int *Input3,int *Weight, int output_buffer[Tm][Tr*PARA][Tc*PARA],short beta_buffer[MAX_BETA_LENGTH],
								 short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width],
								 short input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width],short input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width],
								 int InFM_num,int Input_w,int Input_h,int Kernel_size,int Kernel_stride,
								 int TMP_R,int TMP_C,int TMP_M,int m,int TM_MIN,int TR_MIN,int TC_MIN,int TN,UCHAR TRow,UCHAR TCol,int Padding,
								 int IHxIW,int KxK,int IFM_numxKxK,int nLoops,bool IsNL,int LayerType,int TM,int TMP_X_next[1],int TX_MIN_next[1],bool pingpongx,bool input_flag,bool process_flag,
								 UCHAR InterSubBeta,UCHAR WeightAddInputSubInter,UCHAR InterSubOutput,ap_uint<6> trow_loops)
{

	static short weight_buffer0[Tm][Tn][K][K];
#pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=1
#pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=2

	static short weight_buffer1[Tm][Tn][K][K];
#pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=1
#pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=2

	// Dummy sink for copy_input_weight()'s TMP_N_next output on non-conv layers.
	static int NOP[1];
	// Values of TMP_M / TM_MIN remembered from the previous call (non-conv layers).
	static int tmp_x;
	static int tmp_tx_min;

	if(LayerType==0)
	{
	// Convolution path: unchanged from the earlier, already-working version.
		if(!input_flag)
			return;
		TMP_X_next[0] = TMP_M;//consider by the inner-out loop
		TX_MIN_next[0] = TM_MIN;// like above

		bool pingpong = 0;
		int TMP_N_next0[1];
		int TMP_N_next1[1];
		int n;
		int TMP_N;
		// nLoops+1 passes: load and compute overlap, offset by one pass.
		for(TMP_N = 0,n = 0;n < nLoops+1; n++,TMP_N += TN)
		{
			if(pingpong == 1)
			{
				copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N,
					TM_MIN,TN,TRow,TCol,Padding,input_buffer1,weight_buffer1,TMP_N_next1,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
				compute(input_buffer0,output_buffer,weight_buffer0,beta_buffer,TMP_N_next0,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops,
					   InterSubBeta,WeightAddInputSubInter,InterSubOutput);
				pingpong = 0;
			}else
			{
				copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N,
					TM_MIN,TN,TRow,TCol,Padding,input_buffer0,weight_buffer0,TMP_N_next0,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
				compute(input_buffer1,output_buffer,weight_buffer1,beta_buffer,TMP_N_next1,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops,
					   InterSubBeta,WeightAddInputSubInter,InterSubOutput);
				pingpong = 1;
			}
		}
	}
	else if(LayerType==1)
	{
		// Shortcut layer. Stride and kernel size must be set to 1 by the caller;
		// this configuration was validated on the PC model.
		/*
		Two options were considered for aligning the fixed-point formats of the
		two shortcut operands:
		  1. align the data when the input files are produced, and do nothing here;
		  2. align inside the accelerator, reusing the shift parameters this
		     function already receives.
		Option 2 is the one implemented (see shortcut()).
		*/
		if(pingpongx==0)
		{
			// Publish the previous call's tile info, then remember this call's.
			TMP_X_next[0] = tmp_x;
			TX_MIN_next[0] = tmp_tx_min;
			tmp_x = TMP_M;
			tmp_tx_min = TM_MIN;

			//copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
			//	TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			// First operand: ports Input/Input1 (each passed twice) -> input_buffer0.
			copy_input_weight(Input,Input1,Input,Input1,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			// Second operand: ports Input2/Input3 (each passed twice) -> input_buffer00.
			copy_input_weight(Input2,Input3,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer00,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			//pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
			// The shift parameters are consumed inside the processing function.
			// Process the pair loaded on the previous call.
			shortcut(input_buffer1,input_buffer10,output_buffer,TM_MIN,TR_MIN,TC_MIN,InterSubOutput,WeightAddInputSubInter,process_flag);
		}else
		{
			TMP_X_next[0] = tmp_x;
			TX_MIN_next[0] = tmp_tx_min;
			tmp_x = TMP_M;
			tmp_tx_min = TM_MIN;

			//copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
			//	TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			// Mirror image of the branch above: load into buffers 1/10, process 0/00.
			copy_input_weight(Input,Input1,Input,Input1,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			copy_input_weight(Input2,Input3,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer10,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			//pool_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
			shortcut(input_buffer0,input_buffer00,output_buffer,TM_MIN,TR_MIN,TC_MIN,InterSubOutput,WeightAddInputSubInter,process_flag);
		}

	}
	else if(LayerType==2)
	{
		// Upsample layer: no fixed-point alignment is needed.
		if(pingpongx==0)
		{
			TMP_X_next[0] = tmp_x;
			TX_MIN_next[0] = tmp_tx_min;
			tmp_x = TMP_M;
			tmp_tx_min = TM_MIN;

			copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer0,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			//reorg_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
			// Kernel_stride doubles as the upsampling factor.
			upsample(input_buffer1,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag);
		}else
		{
			TMP_X_next[0] = tmp_x;
			TX_MIN_next[0] = tmp_tx_min;
			tmp_x = TMP_M;
			tmp_tx_min = TM_MIN;

			copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
				TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
			//reorg_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
			upsample(input_buffer0,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag);
		}

	}

}
1241 | 
1242 | void copy_beta(short beta_buffer[MAX_BETA_LENGTH],int *Beta,const int OFM_NUM,const int BetaQ)
1243 | {
1244 | 	static int beta_tmp[MAX_BETA_LENGTH/2];
1245 | 	int NUM = (OFM_NUM+1)>>1;
1246 | 	memcpy(beta_tmp,(int *)Beta,NUM*sizeof(int));
1247 | 	int x;
1248 | 	for(x = 0;x < NUM;x++)
1249 | 	{
1250 | #pragma HLS PIPELINE
1251 | 		beta_buffer[2*x] = beta_tmp[x];
1252 | 		beta_buffer[2*x+1] = beta_tmp[x]>>16;
1253 | 	}
1254 | }
1255 | 
/*
Annotated prototype of YOLO2_FPGA (reference only, not compiled):

void YOLO2_FPGA(
int *Input,int *Input1,int *Input2,int *Input3,    // input addresses; four parallel input ports
int *Output,int *Output1, // output addresses; two parallel output ports
int *Weight,int *Beta, // weight and bias addresses
const int InFM_num,const int OutFM_num, // number of input and output feature maps
const int Kernel_size,const int Kernel_stride, // kernel size and stride
const int Input_w,const int Input_h, // input width and height
const int output_w,const int output_h, // output width and height
const int Padding, // padding value
const bool IsNL,const bool IsBN, // leaky-ReLU enable, batch-normalization enable
const int TM,const int TN,const int TR,const int TC, // accelerator tiling configuration (TM,TN,TR,TC); the rationale for the chosen values is not documented here
const int mLoops,const int nLoops,const int rLoops,const int cLoops, // loop counts; meaning not documented here
const int LayerType, // layer type; each layer type selects a different operation
const int InputQ,const int OutputQ,const int WeightQ,const int BetaQ, // fixed-point quantization bit positions
int trow_loops // purpose unclear; appears to be unused
){

}
*/
1276 | 
/*
The only layers that still lacked fixed-point bit alignment were shortcut and upsample.
upsample does not need alignment,
so only the shortcut alignment remained to be done.
*/
1282 | void YOLO2_FPGA(int *Input,int *Input1,int *Input2,int *Input3,int *Output,int *Output1,int *Weight,int *Beta,const int InFM_num,const int OutFM_num,
1283 | 							  const int Kernel_size,const int Kernel_stride,
1284 | 							  const int Input_w,const int Input_h,const int output_w,const int output_h,const int Padding,const bool IsNL,const bool IsBN,
1285 | 							  const int TM,const int TN,const int TR,const int TC,
1286 | 							  const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType,
1287 | 							  const int InputQ,const int OutputQ,const int WeightQ,const int BetaQ,int trow_loops)
1288 | {
1289 | 
1290 | #pragma HLS INTERFACE m_axi depth=512 port=Input   offset=slave bundle=DATA_BUS1 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64
1291 | #pragma HLS INTERFACE m_axi depth=512 port=Input1  offset=slave bundle=DATA_BUS2 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64
1292 | #pragma HLS INTERFACE m_axi depth=512 port=Input2  offset=slave bundle=DATA_BUS3 num_read_outstanding=1 max_read_burst_length=64
1293 | #pragma HLS INTERFACE m_axi depth=512 port=Input3  offset=slave bundle=DATA_BUS4 num_read_outstanding=1 max_read_burst_length=64
1294 | #pragma HLS INTERFACE m_axi depth=512 port=Output  offset=slave bundle=DATA_BUS1 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64
1295 | #pragma HLS INTERFACE m_axi depth=512 port=Output1 offset=slave bundle=DATA_BUS2 num_read_outstanding=1 num_write_outstanding=1 max_read_burst_length=64 max_write_burst_length=64
1296 | #pragma HLS INTERFACE m_axi depth=512 port=Weight  offset=slave bundle=DATA_BUS5 num_read_outstanding=1 max_read_burst_length=128
1297 | #pragma HLS INTERFACE m_axi depth=512 port=Beta    offset=slave bundle=DATA_BUS5 num_read_outstanding=1 max_read_burst_length=128
1298 | 
1299 | #pragma HLS INTERFACE s_axilite register port=return bundle=CTRL_BUS
1300 | #pragma HLS INTERFACE s_axilite register port=InFM_num bundle=CTRL_BUS
1301 | #pragma HLS INTERFACE s_axilite register port=OutFM_num bundle=CTRL_BUS
1302 | #pragma HLS INTERFACE s_axilite register port=Kernel_size bundle=CTRL_BUS
1303 | #pragma HLS INTERFACE s_axilite register port=Kernel_stride bundle=CTRL_BUS
1304 | #pragma HLS INTERFACE s_axilite register port=Input_w bundle=CTRL_BUS
1305 | #pragma HLS INTERFACE s_axilite register port=Input_h bundle=CTRL_BUS
1306 | #pragma HLS INTERFACE s_axilite register port=output_w bundle=CTRL_BUS
1307 | #pragma HLS INTERFACE s_axilite register port=output_h bundle=CTRL_BUS
1308 | #pragma HLS INTERFACE s_axilite register port=Padding bundle=CTRL_BUS
1309 | #pragma HLS INTERFACE s_axilite register port=IsNL bundle=CTRL_BUS
1310 | #pragma HLS INTERFACE s_axilite register port=IsBN bundle=CTRL_BUS
1311 | #pragma HLS INTERFACE s_axilite register port=TM bundle=CTRL_BUS
1312 | #pragma HLS INTERFACE s_axilite register port=TN bundle=CTRL_BUS
1313 | #pragma HLS INTERFACE s_axilite register port=TR bundle=CTRL_BUS
1314 | #pragma HLS INTERFACE s_axilite register port=TC bundle=CTRL_BUS
1315 | #pragma HLS INTERFACE s_axilite register port=mLoops bundle=CTRL_BUS
1316 | #pragma HLS INTERFACE s_axilite register port=nLoops bundle=CTRL_BUS
1317 | #pragma HLS INTERFACE s_axilite register port=rLoops bundle=CTRL_BUS
1318 | #pragma HLS INTERFACE s_axilite register port=cLoops bundle=CTRL_BUS
1319 | #pragma HLS INTERFACE s_axilite register port=LayerType bundle=CTRL_BUS
1320 | #pragma HLS INTERFACE s_axilite register port=InputQ bundle=CTRL_BUS
1321 | #pragma HLS INTERFACE s_axilite register port=OutputQ bundle=CTRL_BUS
1322 | #pragma HLS INTERFACE s_axilite register port=WeightQ bundle=CTRL_BUS
1323 | #pragma HLS INTERFACE s_axilite register port=BetaQ bundle=CTRL_BUS
1324 | #pragma HLS INTERFACE s_axilite register port=trow_loops bundle=CTRL_BUS
1325 | 
1326 | #pragma HLS INTERFACE s_axilite register port=Input bundle=CTRL_BUS
1327 | #pragma HLS INTERFACE s_axilite register port=Output bundle=CTRL_BUS
1328 | #pragma HLS INTERFACE s_axilite register port=Weight bundle=CTRL_BUS
1329 | #pragma HLS INTERFACE s_axilite register port=Beta bundle=CTRL_BUS
1330 | 
1331 | 	assert(Kernel_size < 4);
1332 | 	assert(Kernel_stride < 4);
1333 | 	assert(TR < 32);
1334 | 	assert(TC < 32);
1335 | 	assert(InFM_num < 2048);
1336 | 	assert(OutFM_num < 2048);
1337 | 	assert(output_h < 512);
1338 | 	assert(output_w < 512);
1339 | 	assert(Input_h < 1024);//gg///??????
1340 | 	assert(Input_w < 512);
1341 | 
1342 | 	assert(WeightQ < 32);
1343 | 	assert(InputQ < 32);
1344 | 	assert(OutputQ < 32);
1345 | 	assert(BetaQ < 32);
1346 | 
1347 | 	ap_uint<9> output_h_9b = output_h;
1348 | 	ap_uint<9> output_w_9b = output_w;
1349 | 	ap_uint<5> TR_5b = TR;
1350 | 	ap_uint<5> TC_5b = TC;
1351 | 	ap_uint<2> Kernel_stride_2b = Kernel_stride;
1352 | 	ap_uint<2> Kernel_size_2b = Kernel_size;
1353 | 	ap_uint<11> InFM_num_11b = InFM_num;
1354 | 	ap_uint<10> Input_h_10b = Input_h;
1355 | 	ap_uint<9> Input_w_9b = Input_w;
1356 | 	ap_uint<6> trow_loops_6b = trow_loops;
1357 | 
1358 | 	UCHAR tmprow,tmpcol;
1359 | 	if(LayerType==2){
1360 | 	    tmprow = TR_5b;
1361 | 	    tmpcol = TC_5b;
1362 | 	} else {
1363 | 	    tmprow = (TR_5b-1)*Kernel_stride_2b+Kernel_size_2b;
1364 | 	    tmpcol = (TC_5b-1)*Kernel_stride_2b+Kernel_size_2b;
1365 | 	}
1366 | 	const UCHAR TRow = tmprow;
1367 | 	const UCHAR TCol = tmpcol;
1368 | 	
1369 | 	
1370 | 	const int OHxOW = output_h_9b*output_w_9b;
1371 | 	//const UCHAR TRow = (TR_5b-1)*Kernel_stride_2b+Kernel_size_2b;
1372 | 	//const UCHAR TCol = (TC_5b-1)*Kernel_stride_2b+Kernel_size_2b;
1373 | 	const int IHxIW   = Input_h_10b*Input_w_9b;
1374 | 	const int KxK = Kernel_size_2b*Kernel_size_2b;
1375 | 	assert(KxK < 10);
1376 | 	ap_uint<4> KxK_4b = KxK;
1377 | 	const int IFM_numxKxK = InFM_num_11b*KxK_4b;
1378 | 	const int mLoops_add1 = mLoops + 1;
1379 | 	const int mLoops_add2 = mLoops + 2;
1380 | 	const int mLoops_bound = LayerType ? mLoops_add2: mLoops_add1;
1381 | 	//temp_inputQ[l.index+1],	temp_inputQ[i],		INTERWIDTH,			INTERWIDTH,			trow_loops
1382 | 	//const int InputQ,			const int OutputQ,	const int WeightQ,	const int BetaQ,	int trow_loops
1383 | 	const UCHAR InterSubBeta = INTERWIDTH - BetaQ;//总的与bias值的偏移量
1384 | 	const UCHAR WeightAddInputSubInter = WeightQ + InputQ - INTERWIDTH;//输入与权重相乘后大于差值的偏移量
1385 | 	const UCHAR InterSubOutput = INTERWIDTH - OutputQ;//超过输出的偏移量，虽然处理与写回看起来都用到了，但是实际上只有写回能用到
1386 | 	/*
1387 | 在shortcut里面   	InterSubBeta = INTERWIDTH-INTERWIDTH = 	0；
1388 | 					WeightAddInputSubInter = 				temp_inputQ[l.index+1]；//这个可以直接利用起来，用INTERWIDTH-它  定义成一个值
1389 | 					InterSubOutput		   = 				INTERWIDTH-temp_inputQ[i];；直接利用
1390 | 			那么这样子就是两者距离20位的差距，都左移，按放大处理，加法以后再处理成输出值？？？//需要在这里输出吗
1391 | 
1392 | 在conv内			InterSubBeta > 0 
1393 | 					WeightAddInputSubInter 较小
1394 | 					InterSubOutput 与上相等
1395 | 	*/
1396 | 	
1397 | 
1398 | 	assert(InterSubBeta < 16);
1399 | 	assert(WeightAddInputSubInter < 16);
1400 | 	assert(InterSubOutput < 16);
1401 | 
1402 | 	//assert(TRow < 256);
1403 | 	//assert(TCol < 256);
1404 | 
1405 | 	static short input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width];
1406 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1
1407 | 
1408 | 	static short input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width];
1409 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1
1410 | 
1411 | //mycode
1412 | 	static short input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width];
1413 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1
1414 | 
1415 | 	static short input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width];
1416 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1
1417 | //end
1418 | 	static int output_buffer0[Tm][Tr*PARA][Tc*PARA];
1419 | #pragma HLS ARRAY_PARTITION variable=output_buffer0 complete dim=1
1420 | 
1421 | 	static int output_buffer1[Tm][Tr*PARA][Tc*PARA];
1422 | #pragma HLS ARRAY_PARTITION variable=output_buffer1 complete dim=1
1423 | 
1424 | 	static short beta_buffer[MAX_BETA_LENGTH];
1425 | 
1426 | 	int r,c,m;
1427 | /////////////////////////////////param
1428 | 	int TMP_R,TMP_C,TMP_M;
1429 | 	int TM_MIN,TR_MIN,TC_MIN;
1430 | ///////////////////////////////////////
1431 | 
1432 | 	int TMP_M_next0[1];
1433 | 	int TMP_M_next1[1];
1434 | 	int TM_MIN_next0[1];
1435 | 	int TM_MIN_next1[1];
1436 | 	bool pingpongm;
1437 | 	
1438 | 	const int para = (LayerType==2) ? 2 : 1;
1439 | 
1440 | 	if(LayerType==0)
1441 | 		copy_beta(beta_buffer,Beta,OutFM_num,BetaQ);
1442 | 
1443 | 	for(TMP_R = 0,r = 0; r < rLoops; r++, TMP_R += TR)
1444 | 	{
1445 | 		TR_MIN = MIN(TR,output_h -TMP_R);
1446 | 		for(TMP_C = 0,c = 0; c < cLoops; c++,TMP_C += TC)
1447 | 		{
1448 | 			TC_MIN = MIN(TC,output_w -TMP_C);
1449 | 			pingpongm = 0;
1450 | 			for(TMP_M = 0, m = 0; m < mLoops_bound; m++,TMP_M += TM)
1451 | 			{
1452 | 				TM_MIN = MIN(TM,OutFM_num-TMP_M);
1453 | 				if(LayerType!=0) TM_MIN = Tn;
1454 | 
1455 | 				bool MneZero = (m!=0);
1456 | 				bool MneOne = (m!=1);
1457 | 				bool MnemLoops = (m!=mLoops);
1458 | 				bool MneMLoopsaddOne = (m!=mLoops_add1);
1459 | 				bool input_flag = LayerType ? MnemLoops&&MneMLoopsaddOne: MnemLoops;
1460 | 				bool process_flag = LayerType ? MneZero&&MneMLoopsaddOne : MnemLoops;
1461 | 				bool write_flag = LayerType ? MneZero&&MneOne : MneZero;
1462 | 
1463 | 				if(pingpongm==0)
1464 | 				{
1465 | 					intra_pingpong_wrapper(Input,Input1,Input2,Input3,Weight,output_buffer1,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10,
1466 | 									InFM_num, Input_w, Input_h, Kernel_size, Kernel_stride,
1467 | 									TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next1,TM_MIN_next1, pingpongm, input_flag,
1468 | 									process_flag,InterSubBeta,WeightAddInputSubInter,InterSubOutput,trow_loops_6b);
1469 | 
1470 | 					write_back_output_reorg(output_buffer0,Output,Output1,TMP_R,TMP_C,TMP_M_next0[0],output_w,output_h,TM_MIN_next0[0],TR_MIN*para,TC_MIN*para,OHxOW,write_flag,OutputQ, IsNL, InterSubOutput, LayerType);
1471 | 					pingpongm = 1;
1472 | 				}else
1473 | 				{
1474 | 					intra_pingpong_wrapper(Input,Input1,Input2,Input3,Weight,output_buffer0,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10,
1475 | 									InFM_num, Input_w, Input_h, Kernel_size, Kernel_stride,
1476 | 									TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next0,TM_MIN_next0, pingpongm, input_flag,
1477 | 									process_flag,InterSubBeta,WeightAddInputSubInter,InterSubOutput,trow_loops_6b);
1478 | 
1479 | 					write_back_output_reorg(output_buffer1,Output,Output1,TMP_R,TMP_C,TMP_M_next1[0],output_w,output_h,TM_MIN_next1[0],TR_MIN*para,TC_MIN*para,OHxOW,write_flag,OutputQ, IsNL, InterSubOutput, LayerType);
1480 | 					pingpongm = 0;
1481 | 				}
1482 | 
1483 | 			}
1484 | 		}
1485 | 	}
1486 | }
1487 | ////////////////////////////////////////////20181229 n4m32  v2 without input and reorg opt end input opt ok relu comb ok // input opt ok //output opt ok //weight opt ok (5)n4m32i4o2 ok end
1488 | 


--------------------------------------------------------------------------------
/soft_version/Step02/yolov3_acc_sim.h:
--------------------------------------------------------------------------------
///////////////////////////////////////////////////////////////////////20181229 anti-reorg start => KxKxTmxTn
// Two-argument maximum / minimum. These are macros, so each argument may be
// evaluated more than once: do not pass expressions with side effects.
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))
// S and K size the on-chip input buffer (see OnChipIB_Width/Height below);
// K is also the kernel extent of the weight buffers ([Tm][Tn][K][K]).
// NOTE(review): presumably the largest supported stride / kernel size -- confirm.
#define S 2
#define K 3

// NOTE(review): these three are not referenced in the part of the file
// reviewed here; their meaning needs to be confirmed against the callers.
#define HALFWID 208
#define ATOMWID 13
#define BLOCK 512

// Tile dimensions of the on-chip buffers:
//   Tn - leading (channel) dimension of the input buffers
//   Tm - leading (channel) dimension of the output buffer
//   Tr - rows of the output buffer (multiplied by PARA)
//   Tc - columns of the output buffer (multiplied by PARA)
#define Tn 4
#define Tm 32
#define Tr 26
//#define Tr 38
#define Tc 26
//#define Tc 38
// Input-buffer extent: (tile - 1) * S + K columns / rows.
#define OnChipIB_Width  ((Tc-1)*S+K)
#define OnChipIB_Height ((Tr-1)*S+K)
// Declared length of the bias (beta) buffers.
#define MAX_BETA_LENGTH (1024)
// Multiplier applied to Tr and Tc in the output-buffer dimensions.
#define PARA 1

// NOTE(review): REORG_GEN is not tested in the part of the file reviewed here.
#define REORG_GEN
// When REORG_TEST is defined, copy_input_weight() loads weights with
// weight_load_reorg() instead of weight_load().
//#define REORG_TEST

// When UPSAMPLE_TEST is defined, debug printf output is enabled in
// input_load() and intra_pingpong_wrapper().
//#define UPSAMPLE_TEST
  26 | 
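/*
On TR versus the on-chip tile height (OnChipIB_Height):
In YOLOv2, TR is always smaller than the on-chip tile height, because every
layer there shrinks the feature map and there is no upsample layer.
YOLOv3 does have upsample layers, which enlarge the feature map, so for
upsample the output tile would have to be 2x2 times the size of the input tile.
The buffer sizes are deliberately left unchanged to stay compatible with the
rest of the code; instead the output buffer is filled completely, and the input
tile size and the loop control are adjusted.

Shortcut layers fit the same scheme. Like many other layers they keep the
feature-map size unchanged, so apart from performing an addition the existing
tile sizes can be reused; only the loading of the inputs needs dedicated
handling.

*/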
  39 | 
  40 | //////////////////////////////////////////////////T3 start
  41 | 
  42 | /*
  43 | float *input,float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width]  总的输入数组，要存放的新数组，
  44 | int r,int c,int n,当前运行到的行，列，页的位置块标记
  45 | int Kernel_stride,int Padding, 卷积核步长，是否padding
  46 | int TRow,int TCol,是否padding计算后的新的行列值
  47 | int Input_w,int Input_h,输入宽高
  48 | int TN_MIN,int IHxIW,输入的页长度，一页的总大小
  49 | int LayerType 类型，决定是否设置最小负值，但是在v3里无用
  50 | 
  51 | LOOP1：将当前的输入从二维化的扁平到一维化
  52 | LOOP2: 从第一个开始对每个位置遍历，对要存放的新数组的位置，如果不处于padding位置就赋值，否则置成padding值
  53 | maxpool为了不影响结果，padding设置为最小的负值
  54 | 
  55 | 
  56 | 也就是说，这个函数加载（Tn * TRow * TCol）的输入值进入数组里
  57 | */
  58 | void input_load(float *input,float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],int r,int c,int n,int Kernel_stride,int Padding,int TRow,int TCol,int Input_w,int Input_h,int TN_MIN,int IHxIW,int LayerType)
  59 | {
  60 | 	int t1,t2,t3,t4;
  61 | 	int xoffset;
  62 | 	int yoffset;
  63 | 
  64 | 	static float input_memcpy_buffer[Tn*OnChipIB_Height*OnChipIB_Width];
  65 | 
  66 |     int tempcoff,temproff,tempcurr;
  67 |     if(LayerType == 2){
  68 |         tempcoff = c;
  69 |         temproff = r;
  70 |         //tempcurr = 
  71 |     } else {
  72 |         tempcoff = c*Kernel_stride - Padding;
  73 |         temproff = r*Kernel_stride - Padding;
  74 |     }
  75 | 
  76 | 	//const int Coffset = c*Kernel_stride - Padding;
  77 | 	//const int Roffset = r*Kernel_stride - Padding;
  78 | 	const int Coffset = tempcoff;
  79 | 	const int Roffset = temproff;
  80 | 	const int CurrentOffset = n*IHxIW + Roffset*Input_w + Coffset;
  81 | 	//虽然说设置了是这么大的，但是可以不加载满
  82 | 	//printf("TN_MIN*TRow:%d x %d = %d\n",TN_MIN,TRow,TN_MIN*TRow);
  83 | 
  84 | 	float pad_value = 0;
  85 | 	
  86 | 	#ifdef UPSAMPLE_TEST
  87 | 	if(LayerType==2){
  88 | 		//pad_value = -1024*1024;
  89 | 		printf("r = %d,c = %d,n = %d,Coffset = %d,Roffset = %d,CurrentOffset = %d,TRow = %d\n",r,c,n,Coffset,Roffset,CurrentOffset,TRow);
  90 | 	}
  91 | 	#endif
  92 | 	
  93 | 	int input_mmcpy_offset = 0;
  94 | 	for(t1 = 0;t1 < TN_MIN; t1++)
  95 | 		for(t2 = 0;t2 < TRow; t2++)
  96 | 		{
  97 | 			memcpy((float *)(input_memcpy_buffer + input_mmcpy_offset),(float *)(input + CurrentOffset + t1*IHxIW + t2*Input_w),TCol*sizeof(float));
  98 | 			input_mmcpy_offset += TCol;
  99 | 		}
 100 | 
 101 | 	input_mmcpy_offset = 0;
 102 | 	for(t1 = 0;t1 < Tn; t1++)
 103 | 		for(t2 = 0;t2 < TRow; t2++)
 104 | 			for(t3 = 0;t3 < TCol; t3++)
 105 | 			{
 106 | 				xoffset = Coffset + t3;
 107 | 				yoffset = Roffset + t2;
 108 | 				bool XEnable    = (xoffset >= 0)&&(xoffset < Input_w);
 109 | 				bool YEnable    = (yoffset >= 0)&&(yoffset < Input_h);
 110 | 				bool PaddingEnable = XEnable&&YEnable;
 111 | 				if(PaddingEnable&&(t1 < TN_MIN))
 112 | 					input_buffer[t1][t2][t3] = input_memcpy_buffer[input_mmcpy_offset];
 113 | 				else
 114 | 					input_buffer[t1][t2][t3] = pad_value;
 115 | 				input_mmcpy_offset++;
 116 | 			}
 117 | }
 118 | 
 119 | void weight_load(float *Weight,float weight_buffer[Tm][Tn][K][K],bool weight_load_enable,int m,int n,int IFM_numxKxK,int KxK,int Kernel_size,int TM_MIN,int TN_MIN)
 120 | {
 121 | 	int t1,t2,t3,t4;
 122 | 	static float weight_memcpy_buffer[Tm*Tn*K*K];
 123 | 	
 124 | 	if(!weight_load_enable)
 125 | 		return;
 126 | 	
 127 | 	const int Woffset = m*IFM_numxKxK + n*KxK;
 128 | 		
 129 | 	int weight_memcpy_offset = 0;
 130 | 	for(t1 = 0;t1 < TM_MIN; t1++)
 131 | 		for(t2 = 0;t2 < TN_MIN; t2++)
 132 | 		{
 133 | 			memcpy((float *)(weight_memcpy_buffer + weight_memcpy_offset),(float *)(Weight + Woffset + t1*IFM_numxKxK + t2*KxK),KxK*sizeof(float));
 134 | 			weight_memcpy_offset += KxK;
 135 | 		}
 136 | 	
 137 | 	weight_memcpy_offset = 0;
 138 | 	for(t1 = 0;t1 < Tm; t1++)
 139 | 		for(t2 = 0;t2 < Tn; t2++)
 140 | 			for(t3 = 0;t3 <Kernel_size; t3++)
 141 | 				for(t4 = 0;t4 <Kernel_size; t4++)
 142 | 				{
 143 | 					bool Enable = (t1 < TM_MIN)&&(t2 < TN_MIN);
 144 | 					if(Enable)
 145 | 					{
 146 | 						weight_buffer[t1][t2][t3][t4] =  weight_memcpy_buffer[weight_memcpy_offset];
 147 | 						weight_memcpy_offset++;
 148 | 					}
 149 | 					else
 150 | 						weight_buffer[t1][t2][t3][t4] = 0;
 151 | 				}
 152 | }
 153 | 
 154 | void weight_load_reorg(float *Weight,float weight_buffer[Tm][Tn][K][K],bool weight_load_enable,int m,int n,int IFM_numxKxK,int KxK,int Kernel_size,int TM_MIN,int TN_MIN)
 155 | {
 156 | 	int t1,t2,t3,t4;
 157 | 	static float weight_memcpy_buffer[Tm*Tn*K*K];
 158 | 	static int Woffset;
 159 | 
 160 | 	if(!weight_load_enable)
 161 | 		return;
 162 | 
 163 | 	if(m==0&&n==0)
 164 | 		Woffset = 0;
 165 | 
 166 | 	memcpy(weight_memcpy_buffer,(float *)(Weight + Woffset),TM_MIN*TN_MIN*KxK*sizeof(float));
 167 | 	Woffset += TM_MIN*TN_MIN*KxK;
 168 | 	
 169 | 	int weight_memcpy_offset = 0;
 170 | 	for(t3 = 0;t3 <Kernel_size; t3++)
 171 | 		for(t4 = 0;t4 <Kernel_size; t4++)
 172 | 			for(t1 = 0;t1 < Tm; t1++)
 173 | 				for(t2 = 0;t2 < Tn; t2++)
 174 | 				{
 175 | 					bool Enable = (t1 < TM_MIN)&&(t2 < TN_MIN);
 176 | 					if(Enable)
 177 | 					{
 178 | 						weight_buffer[t1][t2][t3][t4] =  weight_memcpy_buffer[weight_memcpy_offset];
 179 | 						weight_memcpy_offset++;
 180 | 					}
 181 | 					else
 182 | 						weight_buffer[t1][t2][t3][t4] = 0;
 183 | 				}	
 184 | }
 185 | 
 186 | 
 187 | /*
 188 | Kernel_stride, Padding, TRow, TCol, Input_w, Input_h, TN_MIN, IHxIW, LayerType 在某个层内一直是常数
 189 | bool initialize 无效参数
 190 | 
 191 | float *input,float *Weight, 输入和权重的地址
 192 | int InFM_num, 输入通道的总数
 193 | int Input_w,int Input_h, 输入宽高
 194 | int OutFM_num, 输出通道的总数
 195 | int Kernel_size,int Kernel_stride, 卷积核大小步长
 196 | int r,int c,int m,int n, 当前已经加载了的行列，输出页数，输入页数
 197 | int TM_MIN,
 198 | int TN,
 199 | int TRow,int TCol,int Padding, 是否进行padding计算后的新的行列值，是否padding
 200 | float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],float weight_buffer[Tm][Tn][K][K], 输入buff，权重buff
 201 | ,int TMP_N_next[1],
 202 | bool enable,bool weight_load_enable,bool initialize,使能端，是否加载权重，无效参数
 203 | const int IHxIW,const int KxK,一页的总大小，卷积核大小
 204 | const int IFM_numxKxK,
 205 | const int LayerType 层类型
 206 | 
 207 | 判断是否需要加载weight，分别加载input和weight，对非conv层来说，＝ input_load
 208 | */
 209 | void copy_input_weight(float *input,float *Weight,int InFM_num,int Input_w,int Input_h,int OutFM_num,int Kernel_size,int Kernel_stride,int r,int c,int m,int n,
 210 | 		int TM_MIN,int TN,int TRow,int TCol,int Padding,float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],float weight_buffer[Tm][Tn][K][K],int TMP_N_next[1],
 211 | 		bool enable,bool weight_load_enable,bool initialize,const int IHxIW,const int KxK,const int IFM_numxKxK,const int LayerType)
 212 | {
 213 | 	if(!enable)
 214 | 		return ;
 215 | 
 216 | 	const int TN_MIN = MIN(TN,InFM_num - n);
 217 | 	TMP_N_next[0] = n;
 218 | 	//if(LayerType == 2) printf("TN_MIN = %d,TRow = %d,TCol = %d\n",TN_MIN,TRow,TCol);
 219 | 
 220 | 	input_load(input, input_buffer, r, c, n, Kernel_stride, Padding, TRow, TCol, Input_w, Input_h, TN_MIN, IHxIW, LayerType);
 221 | #ifdef REORG_TEST
 222 | 	weight_load_reorg(Weight,weight_buffer,weight_load_enable,m,n,IFM_numxKxK,KxK,Kernel_size,TM_MIN,TN_MIN);
 223 | #else
 224 | 	weight_load(Weight,weight_buffer,weight_load_enable,m,n,IFM_numxKxK,KxK,Kernel_size,TM_MIN,TN_MIN);
 225 | #endif
 226 | 
 227 | }
 228 | //////////////////////////////////////////////////T3 end
 229 | 	void copy_local_beta(float beta_buffer[MAX_BETA_LENGTH],float local_beta_buffer[MAX_BETA_LENGTH],const int TM_MIN,int m)
 230 | {
 231 | 	//memcpy(local_beta_buffer,(float *)(beta_buffer+m),TM_MIN*sizeof(float));
 232 | 	int offset;
 233 | 	int tm;
 234 | 	for(tm = 0,offset = m;tm < TM_MIN;tm++)
 235 | 	{
 236 | 		local_beta_buffer[tm] = beta_buffer[offset];
 237 | 		offset++;
 238 | 	}
 239 | }
 240 | 
 241 | void nonlinear_leaky(float Input[Tm][Tr*PARA][Tc*PARA],const int TM_MIN,const int TR_MIN,const int TC_MIN,const bool IsNL)
 242 | {
 243 | 	int tr,tc,tm;
 244 | 
 245 | 	if(!IsNL)
 246 | 		return ;
 247 | 	
 248 | 	for(tm = 0;tm < TM_MIN;tm++)
 249 | #pragma HLS LOOP_TRIPCOUNT min=1 max=1
 250 | 		for(tr = 0;tr < TR_MIN;tr++)
 251 | #pragma HLS LOOP_TRIPCOUNT min=1 max=14
 252 | 			for(tc = 0;tc < TC_MIN;tc++)
 253 | 			{
 254 | #pragma HLS LOOP_TRIPCOUNT min=14 max=14
 255 | #pragma HLS PIPELINE
 256 | 				float tmp = Input[tm][tr][tc];
 257 | 				if(tmp < 0)
 258 | 					Input[tm][tr][tc] = tmp*0.1;
 259 | 			}
 260 | 
 261 | }
 262 | 
 263 | ///#######################################################################################################################
 264 | void shortcut(float input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],float input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width],
 265 |         float output_buffer[Tm][Tr*PARA][Tc*PARA],const int TM_MIN,const int TR_MIN,
 266 |         const int TC_MIN,bool enable){
 267 |     int tr,tc,tm;
 268 | 	//printf("TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",TM_MIN,TR_MIN,TC_MIN);
 269 | 	//if(enable) printf("yes\n");
 270 | 
 271 |     if(!enable){
 272 |         //printf("No!\n");
 273 | 		return;
 274 |     }
 275 |     //printf("Tn=%d,OnChipIB_Height=%d,OnChipIB_Width=%d,TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",Tn,OnChipIB_Height,OnChipIB_Width,TM_MIN,TR_MIN,TC_MIN);
 276 |     //assert(TR_MIN < 32);
 277 | 	//assert(TC_MIN < 32);
 278 | 	
 279 |     for(tm = 0; tm < TM_MIN;tm++){
 280 |         for(tr = 0; tr < TR_MIN;tr++){
 281 |             for(tc = 0; tc < TC_MIN; tc++){
 282 | //#pragma HLS PIPELINE
 283 |                 output_buffer[tm][tr][tc] = input_buffer0[tm][tr][tc] + input_buffer1[tm][tr][tc];
 284 |             }
 285 |         }
 286 |     }
 287 |     //printf("tm=%d,tr=%d,tc=%d\n",tm,tr,tc);
 288 |     //printf("output_buffer[tm][tr][tc] = input_buffer0[tm][tr][tc] + input_buffer1[tm][tr][tc] is %f = %f + %f\n",output_buffer[tm][tr][tc],input_buffer0[tm][tr][tc],input_buffer1[tm][tr][tc]);
 289 |     //printf("shortcut OK 3\n");
 290 | }
 291 | ///#######################################################################################################################
 292 | //有问题，97层的太大了，弄出来不对劲，因为Tr,Tc只有26
 293 | void upsample(float input_bufferInput[Tn][OnChipIB_Height][OnChipIB_Width],float output_buffer[Tm][Tr*PARA][Tc*PARA],
 294 |         const int upsample_size,const int TM_MIN,const int TR_MIN,
 295 |         const int TC_MIN,bool enable){
 296 |     int tr,tc,tm,i,j;
 297 | 	//printf("TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",TM_MIN,TR_MIN,TC_MIN);
 298 | 	//if(enable) printf("yes\n");
 299 | 
 300 |     if(!enable){
 301 |         //printf("No!\n");
 302 | 		return;
 303 |     }
 304 | 
 305 |     //assert(TR_MIN < 32);
 306 | 	//assert(TC_MIN < 32);
 307 | 	
 308 |     for(tm = 0; tm < TM_MIN;tm++){
 309 |         for(tr = 0; tr < TR_MIN;tr++){
 310 |             for(tc = 0; tc < TC_MIN; tc++){
 311 | //#pragma HLS PIPELINE
 312 |                 for(i = 0; i < upsample_size;i++){
 313 | #pragma HLS UNROLL
 314 |                     for(j = 0;j < upsample_size;j++){
 315 | #pragma HLS UNROLL
 316 |                             output_buffer[tm][tr*upsample_size+i][tc*upsample_size+j] = input_bufferInput[tm][tr][tc];
 317 | 							//printf("page = %d,x = %d,y = %d,value = %f\n",tm,tr*upsample_size+i,tc*upsample_size+j,input_bufferInput[tm][tr][tc]);
 318 |                         }
 319 |                 }
 320 |             }
 321 |         }
 322 |     }
 323 |     //printf("upsample OK 4\n");
 324 | }
 325 | 
 326 | ///#######################################################################################################################
 327 | 
 328 | void compute(float input_buffer[Tn][OnChipIB_Height][OnChipIB_Width],float output_buffer[Tm][Tr*PARA][Tc*PARA],
 329 | 		float weight_buffer[Tm][Tn][K][K],float beta_buffer[MAX_BETA_LENGTH],int TMP_N_next[1],
 330 | 		const int Kernel_size,const int Kernel_stride,int TMP_M,
 331 | 		const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable,const bool IsNL,const bool reluenable)
 332 | {
 333 | 	static float local_beta_buffer[Tm];
 334 | #pragma HLS ARRAY_PARTITION variable=local_beta_buffer complete dim=1
 335 | 
 336 | 	if(!enable)
 337 | 	{
 338 | 		copy_local_beta(beta_buffer,local_beta_buffer,TM_MIN,TMP_M);
 339 | 		return;
 340 | 	}
 341 | 	//printf("compute OK 5\n");
 342 | 	//printf("TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",TM_MIN,TR_MIN,TC_MIN);
 343 | 
 344 | 	int i,j,tr,tc,tm,tn;
 345 | 	int n = TMP_N_next[0];
 346 | 	float partial_mul[Tm][Tn];
 347 | 	float partial_add[Tm];
 348 | 
 349 | 	for(i =0;i < Kernel_size; i++)
 350 | #pragma HLS LOOP_TRIPCOUNT min=1 max=5
 351 | 		for(j = 0;j < Kernel_size; j++)
 352 | #pragma HLS LOOP_TRIPCOUNT min=1 max=5
 353 | 			for(tr = 0;tr < TR_MIN;tr++)
 354 | #pragma HLS LOOP_TRIPCOUNT min=14 max=14
 355 | 				for(tc = 0;tc < TC_MIN;tc++)
 356 | 				{
 357 | #pragma HLS LOOP_TRIPCOUNT min=14 max=14
 358 | #pragma HLS PIPELINE
 359 | 					for(tm = 0;tm < Tm;tm++)
 360 | 					{
 361 | 						if(i==0&&j==0&&n==0)
 362 | 							partial_add[tm] = local_beta_buffer[tm];
 363 | 						else
 364 | 							partial_add[tm] = output_buffer[tm][tr][tc];
 365 | 					}
 366 | 
 367 | 					for(tm = 0;tm < Tm;tm++)
 368 | 						for(tn = 0;tn <Tn;tn++)
 369 | 						{
 370 | 							partial_mul[tm][tn] = weight_buffer[tm][tn][i][j]*input_buffer[tn][Kernel_stride*tr+i][Kernel_stride*tc+j];
 371 | 						}
 372 | 
 373 | 					
 374 | 					for(tm = 0;tm < Tm;tm++)
 375 | 					{
 376 | 						float partial_sum = 0;
 377 | 						for(tn = 0;tn <Tn;tn++)
 378 | 						{
 379 | 							 partial_sum += partial_mul[tm][tn];
 380 | 						}
 381 | 						output_buffer[tm][tr][tc] = partial_add[tm] + partial_sum;
 382 | 					}
 383 | 				}
 384 | 
 385 | 	if(reluenable)
 386 | 		nonlinear_leaky(output_buffer,TM_MIN,TR_MIN,TC_MIN,IsNL);
 387 | 
 388 | }
 389 | /*
 390 | float output_buffer[Tm][Tr][Tc],处理好的三维数组
 391 | float *Output,要存放的线性数组
 392 | int r,int c,int m,在这次输入的三维数组之前已经有多少页了，之前有多少行了，之前多少列
 393 | const int Output_w,const int Output_h,输出的宽高
 394 | 
 395 |  const int TM_MIN,这次输入多少页
 396 |  const int TR_MIN,本次输入多少行
 397 |  const int TC_MIN,本次输入多少列
 398 |  const int OHxOW,输出的页面大小
 399 |  bool write_flag是否写入
 400 | */
 401 | void write_back_output_reorg(float output_buffer[Tm][Tr*PARA][Tc*PARA],float *Output,int r,int c,int m,const int Output_w,const int Output_h,
 402 | 					   const int TM_MIN,const int TR_MIN,const int TC_MIN,const int OHxOW,const int LayerType,bool write_flag)
 403 | {
 404 | 	if(!write_flag)
 405 | 		return;
 406 | 
 407 | 	int tempoff;
 408 | 	if(LayerType == 2){
 409 | 	    tempoff = m*OHxOW + r*Output_w*2 + c*2;
 410 | 	} else {
 411 |         tempoff = m*OHxOW + r*Output_w + c;
 412 | 	}
 413 | 	const int offset = tempoff;
 414 | 	int tr,tm,tc;
 415 | 	
 416 | 	const int para = (LayerType==2) ? 2:1;
 417 | 	//if(LayerType==2)	    printf("r = %d,c = %d,m = %d,TM=%d,TR=%d,TC=%d,Out_w=%d,OWOH=%d,offset=%d\n",r,c,m,TM_MIN,TR_MIN,TC_MIN,Output_w,OHxOW,offset);
 418 | 	//if(LayerType==2) 	printf("TM=%d,TR=%d,TC=%d,Out_w=%d,OWOH=%d,offset=%d\n",TM_MIN,TR_MIN,TC_MIN,Output_w,OHxOW,offset);
 419 | 	for(tm = 0;tm < TM_MIN;tm++)
 420 | 		for(tr = 0;tr < TR_MIN*para;tr++)
 421 | 			for(tc = 0;tc < TC_MIN*para;tc++)
 422 | 			{
 423 | 					Output[tm*OHxOW + tr*Output_w + tc + offset] = output_buffer[tm][tr][tc];
 424 | 					if(LayerType==2){
 425 | 						//printf("index = %d,value = %f\n",tm*OHxOW + tr*Output_w + tc + offset,output_buffer[tm][tr][tc]);
 426 | 					}
 427 | 			}
 428 | 	//if(LayerType==2) 	printf("\n\n");
 429 | 
 430 | /*
 431 | 	for(tm = 0;tm < TM_MIN;tm++)
 432 | 		for(tr = 0;tr < TR_MIN*para;tr++)
 433 | 		{
 434 | 			memcpy((float *)(Output + tm*OHxOW + tr*Output_w + offset),output_buffer[tm][tr],TC_MIN*para*sizeof(float));
 435 | 		}
 436 | */
 437 | }
 438 | /*
 439 | void pool_yolo2(float Input[Tn][OnChipIB_Height][OnChipIB_Width],float Output[Tm][Tr][Tc],
 440 | 		  const int Kernel_size,const int Kernel_stride,
 441 | 		  const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable)
 442 | {
 443 | 	if(!enable)
 444 | 		return;
 445 | 
 446 | 	int i,j,tr,tc,of;
 447 | 	float tmp[Tn];
 448 | 
 449 | 	for(tr = 0;tr < TR_MIN;tr++)
 450 | 		for(tc = 0;tc < TC_MIN;tc++)
 451 | 			for(i =0;i < Kernel_size; i++)
 452 | 				for(j = 0;j < Kernel_size; j++)
 453 | 				{
 454 | #pragma HLS PIPELINE
 455 | 					for( of = 0; of < Tn; of++)
 456 | 					{
 457 | 						if(i==0&&j==0)
 458 | 							tmp[of] = -1024*1024;
 459 | 
 460 | 						if(Input[of][tr*Kernel_stride+i][tc*Kernel_stride+j] > tmp[of])
 461 | 							tmp[of] = Input[of][tr*Kernel_stride+i][tc*Kernel_stride+j];
 462 | 
 463 | 						if(i==1&&j==1)
 464 | 							Output[of][tr][tc] = tmp[of];
 465 | 					}
 466 | 				}
 467 | 
 468 | }
 469 | 
 470 | void reorg_yolo2(float Input[Tn][OnChipIB_Height][OnChipIB_Width],float Output[Tm][Tr][Tc],
 471 | 		  const int Kernel_size,const int Kernel_stride,
 472 | 		  const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable)
 473 | {
 474 | 	int x, y,kx,ky;
 475 | 	unsigned char Yoffset;
 476 | 	unsigned char Xoffset;
 477 | 
 478 | 	if(!enable)
 479 | 		return;
 480 | 
 481 |     for( y = 0; y < TR_MIN; y++)
 482 |     	for( x = 0; x < TC_MIN; x++)
 483 | 			for(ky= 0;ky < 2; ky++)
 484 |     			for(kx = 0;kx < 2; kx++)
 485 | 				{
 486 | #pragma HLS PIPELINE
 487 | 						Yoffset = (y << 1) + ky;
 488 | 						Xoffset = (x << 1) + kx;
 489 | 
 490 | 						int in_index  = (ky << 1) + kx;
 491 | 						Output[in_index][y][x] = Input[0][Yoffset][Xoffset];					
 492 |     			}
 493 | }
 494 | */
 495 | void intra_pingpong_wrapper(float *Input0,float *Input1,float *Weight, float output_buffer[Tm][Tr*PARA][Tc*PARA],float beta_buffer[MAX_BETA_LENGTH],
 496 | 								 float input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width],float input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width],
 497 | 								 float input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width],float input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width],
 498 | 								 int InFM_num,int Input_w,int Input_h,int OutFM_num,int Kernel_size,int Kernel_stride,
 499 | 								 int TMP_R,int TMP_C,int TMP_M,int m,int TM_MIN,int TR_MIN,int TC_MIN,int TN,int TRow,int TCol,int Padding,
 500 | 								 int IHxIW,int KxK,int IFM_numxKxK,int nLoops,bool IsNL,int LayerType,int TM,int TMP_X_next[1],int TX_MIN_next[1],bool pingpongx,bool input_flag,bool process_flag)
 501 | {
 502 | 	static float weight_buffer0[Tm][Tn][K][K];
 503 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=1
 504 | #pragma HLS ARRAY_PARTITION variable=weight_buffer0 complete dim=2
 505 | 
 506 | 	static float weight_buffer1[Tm][Tn][K][K];
 507 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=1
 508 | #pragma HLS ARRAY_PARTITION variable=weight_buffer1 complete dim=2
 509 | 
 510 | 	static int NOP[1];
 511 | 	static int tmp_x;
 512 | 	static int tmp_tx_min;
 513 | 	//printf("TM_MIN=%d,TR_MIN=%d,TC_MIN=%d\n",TM_MIN,TR_MIN,TC_MIN);
 514 | 	//printf("intra OK 2\n");
 515 | 	if(LayerType==0)
 516 | 	{
 517 | 
 518 | 		if(!input_flag)
 519 | 			return;
 520 | 		TMP_X_next[0] = TMP_M;//consider by the inner-out loop
 521 | 		TX_MIN_next[0] = TM_MIN;// like above
 522 | 		
 523 | 
 524 | 		bool pingpong = 0;
 525 | 		int TMP_N_next0[1];
 526 | 		int TMP_N_next1[1];
 527 | 		int n;
 528 | 		int TMP_N;
 529 | 		for(TMP_N = 0,n = 0;n < nLoops+1; n++,TMP_N += TN)
 530 | 		{
 531 | 			if(pingpong == 1)
 532 | 			{
 533 | 				copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N,
 534 | 					TM_MIN,TN,TRow,TCol,Padding,input_buffer1,weight_buffer1,TMP_N_next1,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType);
 535 | 				compute(input_buffer0,output_buffer,weight_buffer0,beta_buffer,TMP_N_next0,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops);
 536 | 				pingpong = 0;
 537 | 			}else
 538 | 			{
 539 | 				copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_N,
 540 | 					TM_MIN,TN,TRow,TCol,Padding,input_buffer0,weight_buffer0,TMP_N_next0,n!=nLoops,1,(m==0)&&(n==0),IHxIW,KxK,IFM_numxKxK,LayerType);
 541 | 				compute(input_buffer1,output_buffer,weight_buffer1,beta_buffer,TMP_N_next1,Kernel_size,Kernel_stride,TMP_M,TM_MIN,TR_MIN,TC_MIN,n!=0,IsNL,n==nLoops);
 542 | 				pingpong = 1;
 543 | 			}
 544 | 		}
 545 | 	}
 546 | 	else if(LayerType==1)
 547 | 	{
 548 | 		if(pingpongx==0)
 549 | 		{
 550 | 			TMP_X_next[0] = tmp_x;
 551 | 			TX_MIN_next[0] = tmp_tx_min;
 552 | 			tmp_x = TMP_M;
 553 | 			tmp_tx_min = TM_MIN;
 554 | 
 555 | 			//copy_input_weight(Input,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 556 | 			//	TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 557 | 			copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 558 | 				TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 559 | 			copy_input_weight(Input1,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 560 | 				TM_MIN,TM,TRow,TCol,0,input_buffer00,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 561 | 			//pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
 562 | 			/*
 563 | 			    void shortcut(short input_buffer[Tm][Tr][Tc],int output_buffer[Tm][Tr][Tc],
 564 | 		            const int TM_MIN,const int TR_MIN,const int TC_MIN,bool enable)
 565 | 			*/
 566 | 			shortcut(input_buffer1,input_buffer10,output_buffer,TM_MIN,TR_MIN,TC_MIN,process_flag);
 567 | 			//pool_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
 568 | 		}else
 569 | 		{
 570 | 			TMP_X_next[0] = tmp_x;
 571 | 			TX_MIN_next[0] = tmp_tx_min;
 572 | 			tmp_x = TMP_M;
 573 | 			tmp_tx_min = TM_MIN;
 574 | 
 575 | 			//copy_input_weight(Input,Input1,Input2,Input3,Weight,InFM_num,Input_w,Input_h,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 576 | 			//	TM_MIN,TM,TRow,TCol,0,input_buffer1,weight_buffer0,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType,trow_loops);
 577 | 			copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 578 | 				TM_MIN,TM,TRow,TCol,0,input_buffer1,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 579 | 			copy_input_weight(Input1,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 580 | 				TM_MIN,TM,TRow,TCol,0,input_buffer10,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 581 | 			//pool_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
 582 | 			shortcut(input_buffer0,input_buffer00,output_buffer,TM_MIN,TR_MIN,TC_MIN,process_flag);
 583 | 		}
 584 | 
 585 | 	}
 586 | 	else if(LayerType==2)
 587 | 	{
 588 | 		if(pingpongx==0)
 589 | 		{
 590 | 			TMP_X_next[0] = tmp_x;
 591 | 			TX_MIN_next[0] = tmp_tx_min;
 592 | 			tmp_x = TMP_M;
 593 | 			tmp_tx_min = TM_MIN;
 594 | 
 595 | 			copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 596 | 				TM_MIN,TM,TRow,TCol,0,input_buffer0,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 597 | 			//reorg_yolo2(input_buffer1,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
 598 | 			#ifdef UPSAMPLE_TEST
 599 | 			for(int i = 0; i < 4; i++){
 600 | 			    for(int j = 0; j < 13; j ++){
 601 | 			        for(int k = 0;k < 13;k++){
 602 | 			            printf("%f,",input_buffer0[i][j][k]);
 603 | 			        }
 604 | 			        printf("\n");
 605 | 			    }
 606 | 			    printf("\n");
 607 | 			}
 608 | 			printf("\n\n");
 609 | 			#endif
 610 | 			upsample(input_buffer1,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag);
 611 | 		}else
 612 | 		{
 613 | 			TMP_X_next[0] = tmp_x;
 614 | 			TX_MIN_next[0] = tmp_tx_min;
 615 | 			tmp_x = TMP_M;
 616 | 			tmp_tx_min = TM_MIN;
 617 | 
 618 | 			copy_input_weight(Input0,Weight,InFM_num,Input_w,Input_h,OutFM_num,Kernel_size,Kernel_stride,TMP_R,TMP_C,TMP_M,TMP_M,
 619 | 				TM_MIN,TM,TRow,TCol,0,input_buffer1,NULL,NOP,input_flag,0,0,IHxIW,KxK,IFM_numxKxK,LayerType);
 620 | 			//reorg_yolo2(input_buffer0,output_buffer,Kernel_size,Kernel_stride,TX_MIN_next[0],TR_MIN,TC_MIN,process_flag);
 621 | 			#ifdef UPSAMPLE_TEST
 622 | 			for(int i = 0; i < 4; i++){
 623 | 			    for(int j = 0; j < 13; j ++){
 624 | 			        for(int k = 0;k < 13;k++){
 625 | 			            printf("%f,",input_buffer1[i][j][k]);
 626 | 			        }
 627 | 			        printf("\n");
 628 | 			    }
 629 | 			    printf("\n");
 630 | 			}
 631 | 			printf("\n\n");
 632 | 			#endif
 633 | 			upsample(input_buffer0,output_buffer,Kernel_stride,TM_MIN,TR_MIN,TC_MIN,process_flag);
 634 | 
 635 | 		}
 636 | 		printf("TRow = %d,TMP_R = %d,TR_MIN = %d,TC_MIN = %d,Kernel_stride = %d\n",TRow,TMP_R,TR_MIN,TC_MIN,Kernel_stride);
 637 | 		//printf("TM_MIN = %d,TR_MIN = %d,TC_MIN = %d\n",TM_MIN,TR_MIN,TC_MIN);
 638 | 		//	for(int i = 0;i < 26*26;i++){
 639 | 		//        printf("output_buffer[%d][%d][%d] = %f\n",(int)i/(26*26),(int)(i/26),(int)(i%26),output_buffer[0][(int)(i/26)][(int)(i%26)]);
 640 | 		//    }
 641 | 
 642 | 	}
 643 | 
 644 | }
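/*
Parameters of YOLO2_FPGA:
float *Input0,float *Input1,float *Output,float *Weight,float *Beta: addresses of the input, output, weight and bias (beta) arrays
const int InFM_num,const int OutFM_num: total number of input channels and total number of output channels (in the v2 code the whole reorg input was processed as a single channel)
const int Kernel_size,const int Kernel_stride: convolution kernel size and stride
const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN: input width and height, padding, whether the ReLU-type activation is applied, whether batch normalization is used
const int TM,const int TN,const int TR,const int TC: precomputed tiling parameters
const int mLoops,const int nLoops,const int rLoops,const int cLoops: likewise precomputed (loop counts), i.e. the accelerator design
const int LayerType: layer type
*/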
 654 | 
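/*
Why is a loop bound needed, and why does it depend on the layer type?
For shortcut, KxK = 0 does not work. Approach 1: set it at the call site, i.e. pass 1 when calling.
	const int TRow = (TR-1)*Kernel_stride+Kernel_size;	const int TCol = (TC-1)*Kernel_stride+Kernel_size;
	These two also became 0 for shortcut; here kernel_stride is set to 1 as well, so that they stay unchanged.

Next comes the loop-bound setting. I think it behaves much like conv, so the bound is set to 1; what it really means is left aside for now.
Verify the output.

*/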
 665 | void YOLO2_FPGA(float *Input0,float *Input1,float *Output,float *Weight,float *Beta,const int InFM_num,const int OutFM_num,
 666 | 							  const int Kernel_size,const int Kernel_stride,
 667 | 							  const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN,
 668 | 							  const int TM,const int TN,const int TR,const int TC,
 669 | 							  const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType)
 670 | {
 671 | 	//const int output_w = (Input_w - Kernel_size + 2*Padding)/Kernel_stride + 1 ;
 672 | 	//const int output_h = (Input_h - Kernel_size + 2*Padding)/Kernel_stride + 1 ;
 673 | 	int output_w;
 674 | 	int output_h;
 675 | 	int temptrow,temptcol;
 676 | 	if(LayerType==0){
 677 | 	    output_w = (Input_w - Kernel_size + (Padding << 1))/Kernel_stride + 1 ;
 678 | 	    output_h = (Input_h - Kernel_size + (Padding << 1))/Kernel_stride + 1 ;
 679 | 	} else if(LayerType==1)
 680 | 	{
 681 | 		output_w = Input_w;
 682 | 		output_h = Input_h;
 683 | 	} else if(LayerType == 2){
 684 | 	    //you dian wen ti
 685 | 	    output_w = Input_w*Kernel_stride;
 686 | 	    output_h = Input_h*Kernel_stride;
 687 | 	}
 688 | 	//This is ok!
 689 | 	//printf("output_w=%d,output_h=%d\n\n",output_w,output_h);
 690 | 
 691 | 	const int OHxOW = output_h*output_w;
 692 | 	if(LayerType==2){
 693 | 	    //temptrow = (TR+1-Kernel_size)/Kernel_stride;
 694 | 	    temptrow = TR;
 695 | 	    temptcol = TC;
 696 | 	    //temptcol = (TC+1-Kernel_size)/Kernel_stride;
 697 | 	} else {
 698 | 	    temptrow = (TR-1)*Kernel_stride+Kernel_size;
 699 | 	    temptcol = (TC-1)*Kernel_stride+Kernel_size;
 700 | 	}
 701 | 	const int TRow = temptrow;
 702 | 	const int TCol = temptcol;
 703 | 	const int IHxIW   = Input_h*Input_w;
 704 | 	const int KxK = Kernel_size*Kernel_size;
 705 | 	const int IFM_numxKxK = InFM_num*KxK;
 706 | 	const int mLoops_bound = (LayerType) ? (mLoops +2): (mLoops + 1);
 707 | 	//const int mLoops_bound = (mLoops + 1);
 708 | 
 709 | 	
 710 | 	printf("output_w=%d,output_h=%d,TRow=%d,TCol=%d,IHxIW=%d,KxK=%d,IFM_numxKxK=%d,mLoops_bound=%d\n\n",output_w,output_h,TRow,TCol,IHxIW,KxK,IFM_numxKxK,mLoops_bound);
 711 | 
 712 | 	static float input_buffer0[Tn][OnChipIB_Height][OnChipIB_Width];
 713 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1
 714 | 
 715 | 	static float input_buffer1[Tn][OnChipIB_Height][OnChipIB_Width];
 716 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1
 717 | 
 718 |     static float input_buffer00[Tn][OnChipIB_Height][OnChipIB_Width];
 719 | #pragma HLS ARRAY_PARTITION variable=input_buffer0 complete dim=1
 720 | 
 721 | 	static float input_buffer10[Tn][OnChipIB_Height][OnChipIB_Width];
 722 | #pragma HLS ARRAY_PARTITION variable=input_buffer1 complete dim=1
 723 | 
 724 | 	static float output_buffer[Tm][Tr*PARA][Tc*PARA];
 725 | #pragma HLS ARRAY_PARTITION variable=output_buffer complete dim=1
 726 | 
 727 | 	static float output_buffer1[Tm][Tr*PARA][Tc*PARA];
 728 | #pragma HLS ARRAY_PARTITION variable=output_buffer1 complete dim=1
 729 | 
 730 | 	static float beta_buffer[MAX_BETA_LENGTH];
 731 | 
 732 | 	int r,c,m;
 733 | /////////////////////////////////param
 734 | 	int TMP_R,TMP_C,TMP_M;
 735 | 	int TM_MIN,TR_MIN,TC_MIN;
 736 | ///////////////////////////////////////
 737 | 
 738 | 	int TMP_M_next0[1];
 739 | 	int TMP_M_next1[1];
 740 | 	int TM_MIN_next0[1];
 741 | 	int TM_MIN_next1[1];
 742 | 	bool pingpongm;
 743 | 
 744 | 	if(LayerType==0)
 745 | 		memcpy(beta_buffer,Beta,OutFM_num*sizeof(float));
 746 | 
 747 | 	/*
 748 | 	Loops都是设置好的，那么究竟是哪里出了问题呢
 749 | 	*/
 750 | 	printf("rLoops*cLoops*mLoops_bound = %d\n",rLoops*cLoops*mLoops_bound);
 751 | 	for(TMP_R = 0,r = 0; r < rLoops; r++, TMP_R += TR)
 752 | 	{
 753 | 		TR_MIN = MIN(TR,output_h -TMP_R);
 754 | 		for(TMP_C = 0,c = 0; c < cLoops; c++,TMP_C += TC)
 755 | 		{
 756 | 			TC_MIN = MIN(TC,output_w -TMP_C);
 757 | 			pingpongm = 0;
 758 | 			for(TMP_M = 0, m = 0; m < mLoops_bound; m++,TMP_M += TM)
 759 | 			{
 760 | 				TM_MIN = MIN(TM,OutFM_num-TMP_M);
 761 | 				//if(LayerType == 1)
 762 | 				if(LayerType!=0) TM_MIN = Tn;
 763 | 				//if(LayerType==2) printf("TMP_R=%d,output_h=%d,output_h -TMP_R=%d,TR_MIN=%d\t TMP_C=%d,output_w=%d,output_w -TMP_C=%d,TC_MIN=%d\t TMP_M=%d,OutFM_num=%d,OutFM_num-TMP_M=%d,TM_MIN=%d\n",TMP_R,output_h,output_h-TMP_R,TR_MIN,  TMP_C,output_w,output_w-TMP_C,TC_MIN,  TMP_M,OutFM_num,OutFM_num-TMP_M,TM_MIN);
 764 | 				bool MneZero = (m!=0);
 765 | 				bool MneOne = (m!=1);
 766 | 				bool MnemLoops = (m!=mLoops);
 767 | 				bool MneMLoopsaddOne = (m!=(mLoops+1));
 768 | 				bool input_flag = LayerType ? MnemLoops&&MneMLoopsaddOne: MnemLoops;
 769 | 				bool process_flag = LayerType ? MneZero&&MneMLoopsaddOne : MnemLoops;
 770 | 				bool write_flag = LayerType ? MneZero&&MneOne : MneZero;
 771 | 				//printf("FPGA OK 1\n");
 772 | 				if(pingpongm==0)
 773 | 				{
 774 | 					intra_pingpong_wrapper(Input0,Input1,Weight,output_buffer1,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10,
 775 | 									InFM_num, Input_w, Input_h, OutFM_num, Kernel_size, Kernel_stride,
 776 | 									TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next1,TM_MIN_next1, pingpongm, input_flag, process_flag);
 777 | 
 778 | 					write_back_output_reorg(output_buffer,Output,TMP_R,TMP_C,TMP_M_next0[0],output_w,output_h,TM_MIN_next0[0],TR_MIN,TC_MIN,OHxOW,LayerType,write_flag);
 779 | 					pingpongm = 1;
 780 | 				}else
 781 | 				{
 782 | 					intra_pingpong_wrapper(Input0,Input1,Weight,output_buffer,beta_buffer,input_buffer0,input_buffer1,input_buffer00,input_buffer10,
 783 | 									InFM_num, Input_w, Input_h, OutFM_num, Kernel_size, Kernel_stride,
 784 | 									TMP_R, TMP_C, TMP_M, m, TM_MIN, TR_MIN, TC_MIN, TN, TRow, TCol, Padding,IHxIW,KxK,IFM_numxKxK,nLoops,IsNL,LayerType,TM, TMP_M_next0,TM_MIN_next0, pingpongm, input_flag, process_flag);
 785 | 
 786 | 					write_back_output_reorg(output_buffer1,Output,TMP_R,TMP_C,TMP_M_next1[0],output_w,output_h,TM_MIN_next1[0],TR_MIN,TC_MIN,OHxOW,LayerType,write_flag);
 787 | 					pingpongm = 0;
 788 | 				}
 789 | 
 790 | 			}
 791 | 		}
 792 | 	}
 793 | }
 794 | 
 795 | int Weight_reorgnaization_anti(float *Weight,float *Weight_reorg,float* Alpha,int IFM_NUM,int OFM_NUM,int Kernel_size,int TM,int TN,const bool IsBN)
 796 | {
 797 | 	const int KxK = Kernel_size*Kernel_size;
 798 | 	const int IFM_NUMxKxK = IFM_NUM*KxK;
 799 | 
 800 | 	int m,n;
 801 | 	int tm,tn,tk;
 802 | 
 803 | 	float weight_buffer[Tm*Tn*K*K];
 804 | 	float weight_buffer2[Tm*Tn*K*K];
 805 | 
 806 | 	int TM_MIN,TN_MIN;
 807 | 	int offset = 0;
 808 | 
 809 | 	for( m = 0; m < OFM_NUM; m += TM)
 810 | 	{
 811 | 		TM_MIN = MIN(TM,OFM_NUM - m);
 812 | 
 813 | 		for(n = 0;n < IFM_NUM; n += TN)
 814 | 		{
 815 | 			TN_MIN = MIN(TN,IFM_NUM - n);
 816 | 
 817 | 			int Woffset = m*IFM_NUMxKxK + n*KxK;
 818 | 
 819 | 			for(tm = 0;tm < TM_MIN; tm++)
 820 | 			{
 821 | 				memcpy((float *)(weight_buffer + tm*TN_MIN*KxK),
 822 | 					(float *)(Weight + tm*IFM_NUMxKxK + Woffset),TN_MIN*KxK*sizeof(float));
 823 | 			}
 824 | 
 825 | 			int TN_MINxTM_MIN = TN_MIN*TM_MIN;
 826 | 
 827 | 			for(tk = 0;tk < KxK; tk++)
 828 | 				for(tm = 0;tm < TM_MIN; tm++)
 829 | 					for(tn = 0;tn < TN_MIN;tn++)
 830 | 					{
 831 | 						weight_buffer2[tk*TN_MINxTM_MIN + tm*TN_MIN + tn] = weight_buffer[tm*TN_MIN*KxK + tn*KxK + tk];
 832 | 					}
 833 | 
 834 | 
 835 | 
 836 | 			memcpy((float *)(Weight_reorg+offset),weight_buffer2,TM_MIN*TN_MIN*KxK*sizeof(float));
 837 | 			offset += TM_MIN*TN_MIN*KxK;
 838 | 		}							
 839 | 	}
 840 | 
 841 | 	return 0;
 842 | }
 843 | 
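/*
TM * mLoops is the number of channels after the computation (output channels); it equals l.n
TN * nLoops is the number of channels before the computation (input channels); it equals l.c
TR * rLoops is the number of rows computed; it equals l.h
TC * cLoops is the number of columns computed; it equals l.w
*/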
 850 | 
 851 | 
 852 | void yolov2_hls_ps(network *net, float *input)
 853 | {
 854 | 	int x;
 855 | 
 856 | 	network orig = *net;
 857 | 	net->input = input;
 858 |     int weight_offset[128] = {864,18432,2048,18432,
 859 | 		73728,8192,73728,
 860 | 		8192,73728,
 861 | 		294912,32768,294912,
 862 | 		32768,294912,
 863 | 		32768,294912,
 864 | 		32768,294912,
 865 | 		32768,294912,
 866 | 		32768,294912,
 867 | 		32768,294912,
 868 | 		32768,294912,
 869 | 		1179648,131072,1179648,
 870 | 		131072,1179648,
 871 | 		131072,1179648,
 872 | 		131072,1179648,
 873 | 		131072,1179648,
 874 | 		131072,1179648,
 875 | 		131072,1179648,
 876 | 		131072,1179648,
 877 | 		4718592,524288,4718592,
 878 | 		524288,4718592,
 879 | 		524288,4718592,
 880 | 		524288,4718592,
 881 | 		524288,4718592,524288,4718592,524288,4718592,261120,
 882 | 		131072,
 883 | 		196608,1179648,131072,1179648,131072,1179648,130560,
 884 | 		32768,
 885 | 		49152,294912,32768,294912,32768,294912,65280,
 886 | 		0,0,0,0,0,0,0,0,0,0,
 887 | 		0,0,0,0,0,0,0,0,0,0,
 888 | 		0,0,0,0,0,0,0,0,0,0,
 889 | 		0,0,0,0,0,0,0,0,0,0,
 890 | 		0,0,0,0,0,0,0,0,0,0,
 891 | 		0,0,0};
 892 | 	int beta_offset[128] = {32,64,32,64,
 893 | 		128,64,128,
 894 | 		64,128,
 895 | 		256,128,256,
 896 | 		128,256,
 897 | 		128,256,
 898 | 		128,256,
 899 | 		128,256,
 900 | 		128,256,
 901 | 		128,256,
 902 | 		128,256,
 903 | 		512,256,512,
 904 | 		256,512,
 905 | 		256,512,
 906 | 		256,512,
 907 | 		256,512,
 908 | 		256,512,
 909 | 		256,512,
 910 | 		256,512,
 911 | 		1024,512,1024,
 912 | 		512,1024,
 913 | 		512,1024,
 914 | 		512,1024,
 915 | 		512,1024,512,1024,512,1024,255,
 916 | 		256,
 917 | 		256,512,256,512,256,512,255,
 918 | 		128,
 919 | 		128,256,128,256,128,256,255,
 920 | 		0,0,0,0,0,0,0,0,0,0,
 921 | 		0,0,0,0,0,0,0,0,0,0,
 922 | 		0,0,0,0,0,0,0,0,0,0,
 923 | 		0,0,0,0,0,0,0,0,0,0,
 924 | 		0,0,0,0,0,0,0,0,0,0,
 925 | 		0,0,0};
 926 | 	
 927 | 	int offset_index = 0;
 928 | 
 929 | 	//float *Weight_buf = (float *)calloc(203767168/4,sizeof(float));
 930 | 	//float *Beta_buf   = (float *)calloc(43044/4,sizeof(float));
 931 | 	float *Weight_buf = (float *)calloc(247583104/4,sizeof(float));
 932 | 	float *Beta_buf   = (float *)calloc(108276/4,sizeof(float));
 933 | 
 934 | #ifdef REORG_TEST
 935 | 	FILE *fp_w = fopen("weights_reorg.bin", "rb");
 936 |     	if(!fp_w) file_error("weights_reorg.bin");
 937 | #else
 938 | 	FILE *fp_w = fopen("weights.bin", "rb");
 939 |     	if(!fp_w) file_error("weights.bin");
 940 | #endif
 941 | 
 942 | #ifdef REORG_GEN
 943 | 	//float *Weight_reorg_buf = (float *)calloc(203767168/4,sizeof(float));
 944 | 	float *Weight_reorg_buf = (float *)calloc(247583104/4,sizeof(float));
 945 | 	FILE *fp_w_reorg = fopen("weights_reorg.bin", "wb");
 946 |     	if(!fp_w_reorg) file_error("weights_reorg.bin");
 947 | #endif
 948 | 
 949 | 	FILE *fp_b = fopen("bias.bin", "rb");
 950 |     	if(!fp_b) file_error("bias.bin");
 951 | 
 952 | 	//fread(Weight_buf, sizeof(float), 203767168/4, fp_w);
 953 | 	//fread(Beta_buf, sizeof(float), 43044/4, fp_b);
 954 | 	fread(Weight_buf, sizeof(float), 247583104/4, fp_w);
 955 | 	fread(Beta_buf, sizeof(float), 108276/4, fp_b);
 956 | 	
 957 | 	fclose(fp_w);
 958 | 	fclose(fp_b);
 959 | 
 960 | 
 961 | //#define MEM_LEN (416*416*32+208*208*32)
 962 | //#define MEM_LEN (608*608*32)
 963 | /*
 964 | 	float *Memory_buf = (float*)calloc(MEM_LEN+1024*2,sizeof(float));//leave some memories for overflow
 965 | 	float *Memory_top = Memory_buf+1024;
 966 | 	float *Memory_bottom = Memory_top + MEM_LEN;
 967 | 	memcpy(Memory_top,input,416*416*3*sizeof(float));//416x416x3 input_pic
 968 | */
 969 | 
 970 | 	#define MEM_LEN (HALFWID*HALFWID*64)
 971 | 
 972 | 	float* Memory_top = (float*)calloc(MEM_LEN*6+BLOCK*6,sizeof(float));/*为什么加1024？*/
 973 | 	
 974 | 	float* Memory_top1 = Memory_top+BLOCK;
 975 | 	float* Memory_top2 = Memory_top1+MEM_LEN*2+BLOCK;
 976 | 	float* Memory_top3 = Memory_top2+MEM_LEN+BLOCK;
 977 | 	float* Memory_top4 = Memory_top3+MEM_LEN+BLOCK;
 978 | 	float* Memory_top5 = Memory_top4+MEM_LEN+BLOCK;
 979 | 	float* Memory_bot  = Memory_top5+MEM_LEN+BLOCK;
 980 | 	
 981 | 	memcpy(Memory_top2,input,HALFWID*HALFWID*4*3*sizeof(float));
 982 | 	
 983 | 	float* in_ptr[107];
 984 | 	float* out_ptr[107];
 985 | 	
 986 | 	#define ROUTE85_LEN (ATOMWID*ATOMWID*1024)
 987 | 	#define ROUTE97_LEN (ATOMWID*ATOMWID*2048)
 988 | 	
 989 | 	//float* yolo_buf = (float *)calloc(HALFWID*HALFWID*64,sizeof(float));
 990 | 
 991 | 
 992 | /*
 993 | 重写了内存管理部分，只需要五个小的buff就可以做完
 994 | 但是不知道为什么，buf1一直不对，所以用了六个
 995 | 修改后的代码不再需要route层，相对于之前的更快一些
 996 | 
 997 | 似乎懂了为什么有问题，因为36层依赖于33，而33存在buf1，所以需要换个位置
 998 | 给33层换个位置以后就好了。确实只需要五个小buf
 999 | 
1000 | */
1001 | 	int i = 0;
1002 | 
1003 | 	in_ptr[0] = Memory_top2;
1004 | 	out_ptr[0] = Memory_top1;
1005 | 	in_ptr[1] = out_ptr[0];
1006 | 	out_ptr[1] = Memory_top2;
1007 | 	in_ptr[2] = Memory_top2;
1008 | 	out_ptr[2] = Memory_top1;
1009 | 
1010 | 	for(i=3;i<6;i++){
1011 | 		if(i%2==0){
1012 | 			in_ptr[i] = Memory_top3;
1013 | 			out_ptr[i] = Memory_top1;
1014 | 		} else {
1015 | 			
1016 | 			in_ptr[i] = out_ptr[i-1];
1017 | 			out_ptr[i] = Memory_top3;
1018 | 		}
1019 | 	}
1020 | 	
1021 | 	for(i=6;i<10;i++){
1022 | 		if(i%2==0){
1023 | 			in_ptr[i] = out_ptr[i-1];
1024 | 			out_ptr[i] = Memory_top2;
1025 | 		} else {
1026 | 			in_ptr[i] = Memory_top2;
1027 | 			out_ptr[i] = Memory_top1;
1028 | 		}
1029 | 	}
1030 | 	
1031 | 	for(i = 10;i < 14;i++){
1032 | 		if(i%2==0){
1033 | 			in_ptr[i] = Memory_top1;
1034 | 			out_ptr[i] = Memory_top3;
1035 | 		} else {
1036 | 			in_ptr[i] = out_ptr[i-1];
1037 | 			out_ptr[i] = Memory_top1;
1038 | 		}
1039 | 	}
1040 | 	
1041 | 	for(i = 14;i< 17;i++){
1042 | 		if(i%2==0){
1043 | 			in_ptr[i] = Memory_top1;
1044 | 			out_ptr[i] = Memory_top2;
1045 | 		} else {
1046 | 			in_ptr[i] = out_ptr[i-1];
1047 | 			out_ptr[i] = Memory_top1;
1048 | 		}
1049 | 	}
1050 | 	for(i=17;i<20;i++){
1051 | 		if(i%2==0){
1052 | 			in_ptr[i] = Memory_top3;
1053 | 			out_ptr[i] = Memory_top2;
1054 | 		} else {
1055 | 			in_ptr[i] = out_ptr[i-1];
1056 | 			out_ptr[i] = Memory_top3;
1057 | 		}
1058 | 	}
1059 | 	for(i=20;i<23;i++){
1060 | 		in_ptr[i] = out_ptr[i-1];
1061 | 		if(i%2==0)
1062 | 			out_ptr[i] = Memory_top1; 
1063 | 		else 
1064 | 			out_ptr[i] = Memory_top3;
1065 | 	}
1066 | 	for(i=23;i<26;i++){
1067 | 		in_ptr[i] = out_ptr[i-1];
1068 | 		if(i%2==0)
1069 | 			out_ptr[i] = Memory_top1; 
1070 | 		else 
1071 | 			out_ptr[i] = Memory_top2;
1072 | 	}
1073 | 	for(i=26;i<29;i++){
1074 | 		in_ptr[i] = out_ptr[i-1];
1075 | 		if(i%2==0)
1076 | 			out_ptr[i] = Memory_top3; 
1077 | 		else 
1078 | 			out_ptr[i] = Memory_top2;
1079 | 	}
1080 | 	for(i=29;i<32;i++){
1081 | 		in_ptr[i] = out_ptr[i-1];
1082 | 		if(i%2==0)
1083 | 			out_ptr[i] = Memory_top3; 
1084 | 		else 
1085 | 			out_ptr[i] = Memory_top1;
1086 | 	}
1087 | 	for(i=32;i<35;i++){
1088 | 		in_ptr[i] = out_ptr[i-1];
1089 | 		if(i%2==0)
1090 | 			out_ptr[i] = Memory_top2; 
1091 | 		else{
1092 | 			out_ptr[i] = Memory_top4;
1093 | 		}
1094 | 	}
1095 | 	
1096 | 	in_ptr[35] = Memory_top2;
1097 | 	out_ptr[35] = Memory_top3;
1098 | 	
1099 | 	in_ptr[36] = Memory_top3;
1100 | 	out_ptr[36] = Memory_top1+ROUTE97_LEN;
1101 | 	/*0~36 现在都没有问题了*/
1102 | 	in_ptr[37] = out_ptr[36];
1103 | 	out_ptr[37] = Memory_top2;
1104 | 	
1105 | 	in_ptr[38] = Memory_top2;
1106 | 	out_ptr[38] = Memory_top3;
1107 | 	
1108 | 	for(i=39;i<42;i++){
1109 | 		in_ptr[i] = out_ptr[i-1];
1110 | 		if(i%2==0)
1111 | 			out_ptr[i] = Memory_top3; 
1112 | 		else 
1113 | 			out_ptr[i] = Memory_top4;
1114 | 	}
1115 | 	
1116 | 	for(i=42;i<45;i++){
1117 | 		in_ptr[i] = out_ptr[i-1];
1118 | 		if(i%2==0)
1119 | 			out_ptr[i] = Memory_top2; 
1120 | 		else 
1121 | 			out_ptr[i] = Memory_top4;
1122 | 	}
1123 | 	
1124 | 	for(i=45;i<48;i++){
1125 | 		in_ptr[i] = out_ptr[i-1];
1126 | 		if(i%2==0)
1127 | 			out_ptr[i] = Memory_top2; 
1128 | 		else 
1129 | 			out_ptr[i] = Memory_top3;
1130 | 	}
1131 | 	
1132 | 	for(i=48;i<51;i++){
1133 | 		in_ptr[i] = out_ptr[i-1];
1134 | 		if(i%2==0)
1135 | 			out_ptr[i] = Memory_top4; 
1136 | 		else 
1137 | 			out_ptr[i] = Memory_top3;
1138 | 	}
1139 | 	for(i=51;i<54;i++){
1140 | 		in_ptr[i] = out_ptr[i-1];
1141 | 		if(i%2==0)
1142 | 			out_ptr[i] = Memory_top4; 
1143 | 		else 
1144 | 			out_ptr[i] = Memory_top2;
1145 | 	}
1146 | 	for(i=54;i<57;i++){
1147 | 		in_ptr[i] = out_ptr[i-1];
1148 | 		if(i%2==0)
1149 | 			out_ptr[i] = Memory_top3; 
1150 | 		else 
1151 | 			out_ptr[i] = Memory_top2;
1152 | 	}
1153 | 	for(i=57;i<60;i++){
1154 | 		in_ptr[i] = out_ptr[i-1];
1155 | 		if(i%2==0)
1156 | 			out_ptr[i] = Memory_top3; 
1157 | 		else 
1158 | 			out_ptr[i] = Memory_top4;
1159 | 	}
1160 | 	
1161 | 	in_ptr[60] = Memory_top4;
1162 | 	out_ptr[60] = Memory_top5;
1163 | 	in_ptr[61] = Memory_top5;
1164 | 	out_ptr[61] = Memory_top2+ROUTE85_LEN;
1165 | 	in_ptr[62] = Memory_top2+ROUTE85_LEN;
1166 | 	out_ptr[62] = Memory_top3;
1167 | 	in_ptr[63] = Memory_top3;
1168 | 	out_ptr[63] = Memory_top4;
1169 | 	
1170 | 	for(i=64;i<67;i++){
1171 | 		in_ptr[i] = out_ptr[i-1];
1172 | 		if(i%2==0)
1173 | 			out_ptr[i] = Memory_top5; 
1174 | 		else 
1175 | 			out_ptr[i] = Memory_top4;
1176 | 	}
1177 | 	for(i=67;i<70;i++){
1178 | 		in_ptr[i] = out_ptr[i-1];
1179 | 		if(i%2==0)
1180 | 			out_ptr[i] = Memory_top5; 
1181 | 		else 
1182 | 			out_ptr[i] = Memory_top3;
1183 | 	}
1184 | 	
1185 | 	for(i=70;i<73;i++){
1186 | 		in_ptr[i] = out_ptr[i-1];
1187 | 		if(i%2==0)
1188 | 			out_ptr[i] = Memory_top4; 
1189 | 		else 
1190 | 			out_ptr[i] = Memory_top3;
1191 | 	}
1192 | 	for(i=73;i<81;i++){
1193 | 		in_ptr[i] = out_ptr[i-1];
1194 | 		if(i%2==0)
1195 | 			out_ptr[i] = Memory_top4; 
1196 | 		else 
1197 | 			out_ptr[i] = Memory_top5;
1198 | 	}
1199 | 	
1200 | 	for(i=81;i<83;i++){
1201 | 		in_ptr[i] = out_ptr[i-1];
1202 | 		if(i%2==0)
1203 | 			out_ptr[i] = Memory_top4; 
1204 | 		else 
1205 | 			out_ptr[i] = Memory_top3;
1206 | 	}
1207 | 	in_ptr[83] = out_ptr[79];
1208 | 	out_ptr[83] = out_ptr[79];
1209 | 	in_ptr[84] = out_ptr[83];
1210 | 	out_ptr[84] = Memory_top4;
1211 | 	
1212 | 	in_ptr[85] = Memory_top4;
1213 | 	out_ptr[85] = Memory_top2;
1214 | 	in_ptr[86] = Memory_top2;
1215 | 	out_ptr[86] = Memory_top2;
1216 | 	
1217 | 	for(i = 87;i<93;i++){
1218 | 		in_ptr[i] = out_ptr[i-1];
1219 | 		if(i%2==0)
1220 | 			out_ptr[i] = Memory_top2; 
1221 | 		else 
1222 | 			out_ptr[i] = Memory_top3;
1223 | 	}
1224 | 	
1225 | 	in_ptr[93] = Memory_top2;
1226 | 	out_ptr[93] = Memory_top4;
1227 | 	in_ptr[94] = Memory_top4;
1228 | 	out_ptr[94] = Memory_top2;
1229 | 	in_ptr[95] = Memory_top3;
1230 | 	out_ptr[95] = Memory_top3;
1231 | 	in_ptr[96] = Memory_top3;
1232 | 	out_ptr[96] = Memory_top2;
1233 | 	in_ptr[97] = Memory_top2;
1234 | 	out_ptr[97] = Memory_top1;
1235 | 	in_ptr[98] = Memory_top1;
1236 | 	out_ptr[98] = Memory_top1;
1237 | 	
1238 | 	for(i = 99;i<107;i++){
1239 | 		in_ptr[i] = out_ptr[i-1];
1240 | 		if(i%2==0)
1241 | 			out_ptr[i] = Memory_top1; 
1242 | 		else 
1243 | 			out_ptr[i] = Memory_top2;
1244 | 	}
1245 | 
1246 |     network netp = *net;
1247 |     //int i;
1248 | 	int woffset = 0;
1249 | 	int aoffset = 0;
1250 | 	int boffset = 0;
1251 | 	int TR,TC,TM,TN;
1252 | 	int output_w,output_h;
1253 | 	int rLoops,cLoops,mLoops,nLoops;
1254 | 	double sum_gop = 0.0;
1255 | 	
1256 | 	int T2Rate;
1257 | 	int TRow;
1258 | 	int trow_loops;
1259 | 
1260 |     for(i = 0; i < netp.n; ++i)
1261 | 	{
1262 |         netp.index = i;
1263 |         layer l = netp.layers[i];
1264 | 		printf("Layer[%2d]: ",i);
1265 | 		switch(l.type)
1266 | 		{
1267 | 			case CONVOLUTIONAL:{
1268 | 				printf("outputMemory:%8d;BN=%d;Activation=%d;conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d  %5.3f BFLOPs\n",l.outputs,l.batch_normalize,l.activation, l.n, l.size, l.size, l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);
1269 | 				sum_gop += (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.;
1270 | 				output_w = (l.w - l.size + 2*l.pad)/l.stride + 1 ;
1271 | 				output_h = (l.h - l.size + 2*l.pad)/l.stride + 1 ;
1272 | 
1273 | 				TR = MIN(((OnChipIB_Height-l.size)/l.stride+1),Tr);//keep Kernel_stride>=1
1274 | 				TR = MIN(output_h,TR);
1275 | 				TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc);
1276 | 				TC = MIN(output_w,TC);
1277 | 				TM = MIN(l.n,Tm);
1278 | 				TN = MIN(l.c,Tn);
1279 | 
1280 | 				rLoops = (int)ceil(((float)output_h)/TR);
1281 | 				cLoops = (int)ceil(((float)output_w)/TC);
1282 | 				mLoops = (int)ceil(((float)l.n)/TM);
1283 | 			    nLoops = (int)ceil(((float)l.c)/TN);
1284 | 				
1285 | 				switch(l.w)
1286 | 				{
1287 | 					case 26:
1288 | 						T2Rate = 2;
1289 | 						break;
1290 | 					case 13:
1291 | 						T2Rate = 4;
1292 | 						break;
1293 | 					default:
1294 | 						T2Rate = 1;
1295 | 						break;
1296 | 				}
1297 | 				TRow = (TR-1)*l.stride+l.size;
1298 | 				trow_loops = (int)ceil(((float)TRow/T2Rate));
1299 | 				
1300 | 			    //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops);
1301 | 				printf("TRow = %d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h);
1302 | 
1303 | 				YOLO2_FPGA(in_ptr[i],NULL,out_ptr[i],Weight_buf+woffset,Beta_buf+boffset,
1304 | 					l.c,l.n,l.size,
1305 | 					l.stride,l.w,l.h,l.pad,l.activation==LEAKY?1:0,l.batch_normalize?1:0,
1306 | 					TM,TN,TR,TC,
1307 | 					mLoops,nLoops,rLoops,cLoops,0);
1308 | #ifdef REORG_GEN
1309 | 				Weight_reorgnaization_anti(Weight_buf + woffset,Weight_reorg_buf + woffset,NULL,l.c,l.n,l.size,TM,TN,0);
1310 | #endif
1311 | 
1312 | 				woffset += weight_offset[offset_index];
1313 | 				boffset += beta_offset[offset_index];
1314 | 				offset_index++;
1315 | 
1316 | 				break;
1317 | 			}
1318 | 			/*
1319 | 			case MAXPOOL:
1320 | 				printf("outputMemory:%8d;max          %d x %d / %d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",l.outputs, l.size, l.size, l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c);
1321 | 				//output_w = (l.w - l.size)/l.stride + 1 ;
1322 | 				//output_h = (l.h - l.size)/l.stride + 1 ;
1323 | 				output_w = l.out_h;
1324 | 				output_h = l.out_w;
1325 | 
1326 | 				TR = MIN(((OnChipIB_Height-l.size)/l.stride+1),Tr);//keep Kernel_stride>=1
1327 | 				TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc);
1328 | 
1329 | 				TR = MIN(output_h,TR);
1330 | 				TC = MIN(output_w,TC);
1331 | 				TM = MIN(Tm,Tn);
1332 | 				TM = MIN(l.c,TM);
1333 | 
1334 | 				rLoops = (int)ceil(((float)output_h)/TR);
1335 | 				cLoops = (int)ceil(((float)output_w)/TC);
1336 | 				mLoops = (int)ceil(((float)l.c)/TM);
1337 | 
1338 | 				YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c,
1339 | 					l.size,l.stride,l.w,l.h,l.pad,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,1);
1340 | 
1341 | 				break;
1342 | 			case REORG:
1343 | 				printf("outputMemory:%8d;reorg              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",l.outputs,  l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c);			
1344 | 				output_w = 26;
1345 | 				output_h = 32*13;
1346 | 
1347 | 				TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1
1348 | 				TR = MIN(output_h,TR);
1349 | 				TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc);
1350 | 				TC = MIN(output_w,TC);
1351 | 				TM = 4;
1352 | 
1353 | 				rLoops = (int)ceil(((float)output_h)/TR);
1354 | 				cLoops = (int)ceil(((float)output_w)/TC);
1355 | 				mLoops = 1;
1356 | 
1357 | 				YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,1,4,
1358 | 							  l.stride,l.stride,52,32*26,0,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,2);
1359 | 
1360 | 				break;
1361 | 			case ROUTE:
1362 | 				printf("outputMemory:%8d;route ",l.outputs);
1363 | 				int j;
1364 | 				for(j = 0; j < l.n; ++j){
1365 | 					printf(" %d", l.input_layers[j]);
1366 | 				}
1367 | 				printf("\n");
1368 | 				break;
1369 | 			case REGION:
1370 | 				printf("outputMemory:%8d;Detection\n",l.outputs);
1371 | 				forward_region_layer(l, in_ptr[i]);
1372 | 				break;
1373 | 			}
1374 | 		*/
1375 | 		    case ROUTE:{
1376 | 				printf("outputMemory:%8d;route ",l.outputs);
1377 | 				int j;
1378 | 				for(j = 0; j < l.n; ++j){
1379 | 					printf(" %d", l.input_layers[j]);
1380 | 				}
1381 | 				printf("\n");
1382 | 				//forward_route_layer(l,netp);
1383 | 				break;
1384 | 			}
1385 | 			case SHORTCUT:{
1386 | 			    //as same as reorg
1387 | 			    printf("res  %3d                %4d x%4d x%4d   ->  %4d x%4d x%4d\n",l.index, netp.layers[i-1].w,netp.layers[i-1].h,netp.layers[i-1].n, l.w,l.h,l.c);
1388 | /*
1389 | 			    output_w = l.out_h;
1390 | 				output_h = l.out_w;
1391 | 
1392 | 				TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1
1393 | 				TR = MIN(output_h,TR);
1394 | 				TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc);
1395 | 				TC = MIN(output_w,TC);
1396 | 				TM = 4;
1397 | 				TN = TM;
1398 | 
1399 | 				rLoops = (int)ceil(((float)output_h)/TR);
1400 | 				cLoops = (int)ceil(((float)output_w)/TC);
1401 | 				mLoops = 1;
1402 | */
1403 | 				//TM=TN;
1404 | 				//mLoops=nLoops;
1405 | 				//TN=TM;
1406 | 				//nLoops=mLoops;
1407 | 				output_w = l.out_h;
1408 | 				output_h = l.out_w;
1409 | 
1410 | 				//TR = MIN(((OnChipIB_Height-l.stride)/l.stride+1),Tr);//keep Kernel_stride>=1
1411 | 				TR = MIN(output_h,Tr);
1412 | 				//TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc);
1413 | 				TC = MIN(output_w,Tc);
1414 | 				//TM = 4;
1415 | 				//TN = TM;
1416 | 				//TM = MIN(l.n,Tm);
1417 | 				TN = MIN(l.c,Tn);
1418 | 				TM = TN;
1419 | 
1420 | 				rLoops = (int)ceil(((float)output_h)/TR);
1421 | 				cLoops = (int)ceil(((float)output_w)/TC);
1422 | 				//mLoops = (int)ceil(((float)l.n)/TM);
1423 | 			    nLoops = (int)ceil(((float)l.c)/TN);
1424 | 				mLoops = nLoops;
1425 | 				//mLoops = 1;
1426 | 				
1427 | 				switch(l.w)
1428 | 				{
1429 | 					case 26:
1430 | 						T2Rate = 2;
1431 | 						break;
1432 | 					case 13:
1433 | 						T2Rate = 4;
1434 | 						break;
1435 | 					default:
1436 | 						T2Rate = 1;
1437 | 						break;
1438 | 				}
1439 | 				TRow = TR;
1440 | 				trow_loops = (int)ceil(((float)TRow/T2Rate));
1441 | 				
1442 | 			    //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops);
1443 | 				printf("TRow = %d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h);
1444 | /*				
1445 | 				switch(52)
1446 | 				{
1447 | 					case 26:
1448 | 						T2Rate = 2;
1449 | 						break;
1450 | 					case 13:
1451 | 						T2Rate = 4;
1452 | 						break;
1453 | 					default:
1454 | 						T2Rate = 1;
1455 | 						break;
1456 | 				}
1457 | 				TRow = (TR-1)*l.stride+l.stride;
1458 | 				trow_loops = (int)ceil(((float)TRow/T2Rate));
1459 | */
1460 | /*
1461 | 				YOLO2_FPGA(in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c,
1462 | 					l.size,l.stride,l.w,l.h,l.pad,0,0,TM,0,TR,TC,mLoops,0,rLoops,cLoops,1);
1463 | */
1464 | 				YOLO2_FPGA(out_ptr[i-1],out_ptr[l.index],out_ptr[i],NULL,NULL,l.c,l.c,
1465 | 					1,1,l.w,l.h,l.pad,0,0,TM,TN,TR,TC,mLoops,nLoops,rLoops,cLoops,1);
1466 | 					//inputQ[offset_index],inputQ[offset_index],INTERWIDTH,INTERWIDTH,trow_loops);
1467 | 			    break;
1468 | 			}
1469 | 			case UPSAMPLE:{
1470 | 			    //as same as pool
1471 | 			    printf("upsample           %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", l.stride, l.w, l.h, l.c, l.out_w, l.out_h, l.out_c);
1472 | 			    output_w = l.out_w;
1473 | 				output_h = l.out_h;
1474 | 
1475 | 				//TR TC 到底怎么选择？是按照最小的还是只按输出？
1476 | 				//是按照输出来的，因为loop除以的是输出宽度，但是事实只需要一遍
1477 | 				//所以是按照输入计算
1478 | 				//但是好像都可以，看后期实现哪个方便吧
1479 | 				//TR = MIN(output_h,Tr);
1480 | 				TR = 13;
1481 | 				//TR = MIN(TR,l.h);
1482 | 				//TC = MIN(((OnChipIB_Width-l.stride)/l.stride+1),Tc);
1483 | 				//TC = MIN(output_w,Tc);
1484 | 				TC = 13;
1485 | 				//TC = MIN(TC,l.w);
1486 | 				//TM = 4;
1487 | 				//TN = TM;
1488 | 				//TM = MIN(l.n,Tm);
1489 | 				TN = MIN(l.c,Tn);
1490 | 				TM = TN;
1491 | 
1492 | 				rLoops = (int)ceil(((float)l.h)/TR);
1493 | 				cLoops = (int)ceil(((float)l.w)/TC);
1494 | 				//mLoops = (int)ceil(((float)l.n)/TM);
1495 | 			    nLoops = (int)ceil(((float)l.c)/TN);
1496 | 				mLoops = nLoops;
1497 | 		
1498 | 				switch(l.w)
1499 | 				{
1500 | 					case 13:
1501 | 						T2Rate = 1;
1502 | 						break;
1503 | 					default:
1504 | 						T2Rate = 1;
1505 | 						break;
1506 | 				}
1507 | 				TRow = TR;
1508 | 				trow_loops = (int)ceil(((float)TRow/T2Rate));
1509 | 				
1510 | 			    //printf("TR=%d,TC=%d,TM=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d\n",TR,TC,TM,rLoops,cLoops,mLoops,nLoops);
1511 | 				printf("l.w = %d,l.stride = %d,TRow = %d,trow_loops=%d,l.pad=%d,l.c=%d,l.n=%d,TR=%d,TC=%d,TM=%d,TN=%d,rLoops=%d,cLoops=%d,mLoops=%d,nLoops=%d,output_w=%d,output_h=%d\n",l.w,l.stride,TRow,trow_loops,l.pad,l.c,l.n,TR,TC,TM,TN,rLoops,cLoops,mLoops,nLoops,output_w,output_h);
1512 | /*				
1513 | 				switch(52)
1514 | 				{
1515 | 					case 26:
1516 | 						T2Rate = 2;
1517 | 						break;
1518 | 					case 13:
1519 | 						T2Rate = 4;
1520 | 						break;
1521 | 					default:
1522 | 						T2Rate = 1;
1523 | 						break;
1524 | 				}
1525 | 				TRow = (TR-1)*l.stride+l.stride;
1526 | 				trow_loops = (int)ceil(((float)TRow/T2Rate));
1527 | */
1528 | //				YOLO2_FPGA(in_ptr[i],in_ptr[i],in_ptr[i],in_ptr[i],out_ptr[i],out_ptr[i],NULL,NULL,1,4,
1529 | //					l.stride,l.stride,52,32*26,output_w,output_h,
1530 | //					0,0,0,TM,TN,TR,TC,mLoops,1,rLoops,cLoops,2);
1531 | //(float *Input0,float *Input1,float *Output,float *Weight,float *Beta,const int InFM_num,const int OutFM_num,
1532 | //							  const int Kernel_size,const int Kernel_stride,
1533 | //							  const int Input_w,const int Input_h,const int Padding,const bool IsNL,const bool IsBN,
1534 | //							  const int TM,const int TN,const int TR,const int TC,
1535 | //							  const int mLoops,const int nLoops,const int rLoops,const int cLoops,const int LayerType)
1536 | 				YOLO2_FPGA(in_ptr[i],in_ptr[i],out_ptr[i],NULL,NULL,l.c,l.c,
1537 | 					1,l.stride,
1538 | 					l.w,l.h,l.pad,0,0,
1539 | 					TM,TN,TR,TC,
1540 | 					mLoops,nLoops,rLoops,cLoops,2);
1541 | 					//inputQ[offset_index],inputQ[offset_index],INTERWIDTH,INTERWIDTH,trow_loops);
1542 | 			    			    
1543 | 			    break;
1544 | 			}
1545 | 			case YOLO:{
1546 | 			    //YOLO as same as REGION
1547 | 				printf("outputMemory:%8d;yolo ",l.outputs);	
1548 | 			    /*
1549 | 				double OutputPara = pow(2.0,-inputQ[offset_index]);
1550 | 				bool NextPixelFlag = true;
1551 | 				int OutputPixelOffset = 0;
1552 | 				short current_p,next_p,output_p;
1553 | 				int *Output_ptr = (int *)(in_ptr[i]);
1554 | 				for(j=0;j<l.outputs;j++)
1555 | 				{
1556 | 					if(NextPixelFlag)
1557 | 					{
1558 | 						int tmp_p = Output_ptr[OutputPixelOffset];
1559 | 						OutputPixelOffset++;
1560 | 						current_p = tmp_p;
1561 | 						next_p = tmp_p >> 16;
1562 | 						output_p = current_p;
1563 | 						NextPixelFlag = false;
1564 | 					}else
1565 | 					{
1566 | 						output_p = next_p;
1567 | 						NextPixelFlag = true;
1568 | 					}
1569 | 					yolo_buf[j] = output_p*OutputPara;
1570 | 				}*/
1571 | 				netp.layers[i].output = forward_yolo_array(l,out_ptr[i-1]);    
1572 | 			    break;
1573 | 			}
1574 | 			
1575 | 		}
1576 | 
1577 | 		netp.input = l.output;
1578 | /*
1579 | 		    for( x = 0; x < 500; x++)
1580 | 		    {
1581 | 			    //sprintf(line, "%f\n", out_ptr[i][x]);
1582 | 				printf("%f,",out_ptr[i][x]);
1583 | 				if((x+1)%10==0){
1584 | 					printf("\n");
1585 | 				}
1586 | 			    //if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n");
1587 | 		    }
1588 | */	
1589 | 		/*
1590 | 		//mycode 
1591 | 		//if(i==84||i==85){
1592 | 		    char line[256];
1593 | 		    FILE *fp3;
1594 | 		    char filename[256];
1595 | 		    sprintf(filename, "fpga_net_layer_%d.txt", i);
1596 | 		    if( (fp3 = fopen(filename, "w")) == NULL)fprintf(stderr,"CANNOT OPEN\n");
1597 | 		    for( x = 0; x < l.outputs; x++)
1598 | 		    {
1599 | 			    sprintf(line, "%f\n", out_ptr[i][x]);
1600 | 			    if(fputs(line,fp3)<0)fprintf(stderr,"write FILE failed\n");
1601 | 		    }
1602 | 		    fclose(fp3);
1603 | 		    printf("layer[%d]:Write END!\n\n",i);
1604 | 		*/
1605 | 		//}
1606 | 
1607 |     }
1608 | 	printf("SUM_GOP=%g\n",sum_gop);
1609 | 	*net = orig;
1610 | 
1611 | #ifdef REORG_GEN
1612 | 	//fwrite(Weight_reorg_buf, sizeof(float), 203767168/4, fp_w_reorg);
1613 | 	fwrite(Weight_reorg_buf, sizeof(float), 247583104/4, fp_w_reorg);
1614 | 	fclose(fp_w_reorg);
1615 | 	free(Weight_reorg_buf);
1616 | #endif
1617 | 	free(Memory_top);
1618 | 	free(Weight_buf);
1619 | 	free(Beta_buf);
1620 | 
1621 | }
1622 | ///////////////////////////////////////////////////////////////////////20181229 anti-reorg ok end n4m32
1623 | 


--------------------------------------------------------------------------------