├── Makefile
├── README.md
├── TGIIF
    ├── TGIIF.ipynb
    ├── images
    │   ├── 0.jpg
    │   ├── 1.jpg
    │   ├── 2.jpg
    │   ├── 3.jpg
    │   ├── 4.jpg
    │   ├── 5.jpg
    │   ├── 6.jpg
    │   ├── 7.jpg
    │   ├── 8.jpg
    │   ├── 9.jpg
    │   └── list.txt
    ├── iou.py
    ├── libraries
    │   └── libssd.so
    └── preprocessing.py
├── block_design
    ├── README.md
    ├── pynq_dpu.bd
    └── pynq_dpu_142m.tcl
├── image
    ├── Readme.md
    ├── bd.png
    ├── dnndk.png
    ├── dpu.png
    ├── multithread.png
    ├── ssd.png
    └── training.png
├── model
    ├── caffe_model
    │   ├── deploy.caffemodel
    │   └── deploy_hardware.prototxt
    ├── dpu_ssd.elf
    ├── dpu_ssd_backup.elf
    └── tgiif_map.txt
├── overlay
    └── TGIIF
    │   ├── pynq_dpu_142m.bit
    │   └── pynq_dpu_142m.tcl
├── prerequisites
    ├── README.md
    ├── devicetree.dtb
    └── dnndk-lib
    │   ├── install.sh
    │   └── pkgs
    │       ├── bin
    │           ├── dexplorer
    │           └── dsight
    │       ├── driver
    │           ├── dpu-3.17.0.ko
    │           └── dpu-4.9.0.ko
    │       ├── include
    │           ├── dnndk.h
    │           ├── dputils.h
    │           ├── matrix.h
    │           ├── n2cube.h
    │           └── transform.h
    │       └── lib
    │           ├── echarts.js
    │           ├── libdputils.so.2.4
    │           ├── libdputils.so.3.1
    │           ├── libdsight.a
    │           ├── libhineon.a
    │           └── libn2cube.so
├── src
    ├── SoftmaxTable.hpp
    ├── main.cpp
    ├── neon_math.hpp
    ├── prior_boxes.cpp
    ├── prior_boxes.hpp
    ├── ssd_detector.cpp
    ├── ssd_detector.hpp
    └── time_helper.hpp
└── train
    ├── README.md
    ├── SSD_tgiif.sh
    ├── model_bnfixed_test.prototxt
    ├── model_bnfixed_train.prototxt
    └── solver.prototxt


/Makefile:
--------------------------------------------------------------------------------
 1 | PROJECT   =    libssd.so
 2 | OBJ       :=   main.o  dpu_ssd.elf ssd_detector.o prior_boxes.o
 3 | 
 4 | CXX       :=   g++
 5 | CC        :=   gcc
 6 | 
 7 | # linking libraries of OpenCV
 8 | # Please modify the line if OpenCV in your borad is installed at a different location
 9 | LDFLAGS   =   -L /opt/opencv/lib/ -lopencv_highgui -lopencv_imgproc -lopencv_core -lopencv_imgcodecs
10 | # LDFLAGS   = $(shell pkg-config --libs opencv)
11 | # linking libraries of DNNDK 
12 | LDFLAGS   +=  -lhineon -ln2cube -lpthread -ldputils
13 | 
14 | CUR_DIR   =   $(shell pwd)
15 | SRC       =   $(CUR_DIR)/src
16 | BUILD     =   $(CUR_DIR)/build
17 | MODEL	  =   $(CUR_DIR)/model
18 | VPATH     =   $(SRC)
19 | SUDO      =   sudo
20 | DST_DIR   =   $(CUR_DIR)/TGIIF/libraries/
21 | #DST_DIR   =   /home/xilinx/jupyter_notebooks/dac_2018/TGIIF/libraries/
22 | 
23 | CFLAGS    :=  -O3 -mcpu=cortex-a9 -mfloat-abi=hard -mfpu=neon -Wall -Wpointer-arith -std=c++11 -ffast-math -I /opt/opencv/include/ -fPIC -shared -rdynamic
24 |  
25 | all: $(BUILD) $(PROJECT)
26 |  
27 | $(PROJECT): $(OBJ)
28 | 	$(CXX) $(CFLAGS) $(addprefix $(BUILD)/, $^) -o $@ $(LDFLAGS)
29 |  
30 | %.o : %.cc
31 | 	$(CXX) -c $(CFLAGS) $< -o $(BUILD)/$@
32 | 
33 | %.o : %.cpp
34 | 	$(CXX) -c $(CFLAGS) $< -o $(BUILD)/$@
35 | 
36 | %.elf : 
37 | 	cp $(MODEL)/$@ $(BUILD)/$@ 
38 | 
39 | clean:
40 | 	$(RM) -r $(BUILD)
41 | 	$(RM) $(PROJECT)
42 | 
43 | $(BUILD) : 
44 | 	-mkdir -p $@
45 | 
46 | copy:
47 | 	$(SUDO) cp $(CUR_DIR)/$(PROJECT) $(DST_DIR)
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DAC 2018 System Design Contest-TGIIF
 2 | The 1st place winner's source codes for DAC 2018 System Design Contest, FPGA Track. Our design is based on DeePhi DPU RTL IP Core and DeePhi DNNDK software stack. For more infos about DeePhi DPU and DNNDK, please refer to [www.deephi.com][1].
 3 | 
 4 | - For prerequisites, refer to `prerequisites` folder. They are necessary for running our demo on PYNQ-Z1.
 5 | 
 6 | - For the tutorial in python, refer to the python notebook in `TGIIF` folder.
 7 | 
 8 | - For the source codes of software, our block design and the NN model we used, refer to `src`, `block_design`, and `model` folders, respectively.
 9 | 
10 | ## Algorithm
11 | ### SSD Modification
12 | We used SSD(Single Shot Multibox Detector) as our based algorithm, and modified it to better fit for the acceleration on DPU. The overview of the modification applied to the SSD network is showed below.
13 | 
14 | ![ssd](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/ssd.png)
15 | 
16 | There are mainly four modifications as below, and you can find more details in the `model/caffe_model` folder.
17 | 
18 | - Better performance-->Resize input image from 640x360 to 448x252 (factor=0.7).
19 | 
20 | - Small objects-->Delete deep layer branches to speed up and get higher IOU.
21 | 
22 | - Speed up the convergence-->Add batch normalization
23 | 
24 | - Evaluation metric-->From mAP to IOU
25 | 
26 | ### Pruning and Quantization
27 | We used DeePhi DNNDK, the first public release of deep learning SDK in China, to support the full-stack development and deployment on DeePhi DPU platform. More infos about DeePhi DNNDK, refer to [www.deephi.com/dnndk.html][2].
28 | 
29 | ![dnndk](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/dnndk.png)
30 | 
31 | In the phase of training, we use the fixed-point instead of the float-point for data representation as showed below. The processes of pruning and quantization are implemented via DeePhi DNNDK.
32 | 
33 | ![training](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/training.png)
34 | 
35 | ## Software
36 | We used DeePhi DNNDK for NN model compilation and runtime deployment. Furthermore, we applied several optimization methods to improve the accuracy and speed on PYNQ-Z1.
37 | ### Funtional Level Optimization
38 | - Max-value Selection: Selecting the max confidence group bounding box instead of compute all the NMS.
39 | 
40 | - Table look-up: Computing softmax with table look-up, reduce the exponent arithmetic time.
41 | 
42 | ### System Level Optimization
43 | Divide the workflow into fine-grained sub-tasks & re-organization using multi-threading (1.8x speed up with 2 threads)
44 | 
45 | ![multithread](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/multithread.png)
46 | 
47 | ## Hardware
48 | ### Overview of DPU
49 | We use DeePhi DPU IP as our hardware system, it is the basis of our design. It is a Deep-Learning Processor Unit that is specially designed for CNN and DNN, named as Aristotle. 
50 | 
51 | ![dpu](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/dpu.png)
52 | 
53 | ### Block Design
54 | Here is our hardware solution on PYNQ-Z1. As for the connectivity there are two 64bit ports used for DDR read/write and one 32bit port for instruction and profiler. ARM CPU use one 32bit port to read/write register for controling. 
55 | 
56 | ![bd](https://github.com/hirayaku/DAC2018-TGIIF/raw/master/image/bd.png)
57 | 
58 | [1]:http://www.deephi.com/  "deephi"
59 | [2]:http://www.deephi.com/dnndk.html  "dnndk"
60 | 


--------------------------------------------------------------------------------
/TGIIF/TGIIF.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Object detection on PYNQ\n",
  8 |     "\n",
  9 |     "This notebook demonstrates the typical working flow of our solution. In our solution, a shared library `libssd.so` is first created from C/C++ sources and NN models, with the help of [DNNDK](http://www.deephi.com/dnndk.html). `libssd.so` exports necessary handlers to initialize, operate and terminate the object detection IP (DPU) running on the fabric. Then, the python notebook accesses the shared library and its exposed handlers to interact with the fabric. Most scheduling work is done within the shared library. For details, refer to the C++ codes.\n",
 10 |     "\n",
 11 |     "Either to start from scratch (compiling from source codes and models to the final binary file) or just have a try of our solution (using our binaries), there are some prerequisites:\n",
 12 |     "- Your PYNQ board with a tuned linux kernel\n",
 13 |     "- OpenCV (C++ version)\n",
 14 |     "- Deep Neural Network Development Kit (DNNDK) from DeePhi\n",
 15 |     "In the `prerequisites` folder we have provided the device tree, the DNNDK package along with instructions on installation. If you have problems configuring DNNDK, please contact [DeePhi](http://www.deephi.com).\n",
 16 |     "\n",
 17 |     "Below we illustrate our solution."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Initialization\n",
 25 |     "\n",
 26 |     "Here we import the necessary packages and set up the environment. The shared library in Python is accessed using [cffi](https://cffi.readthedocs.io/). The simplest mode (ABI, in-line) already satisfies our need. First, exported interfaces are declared again in Python; then the shared library is opened. With `cffi.new` we are free to pass the arguments and call the functions in the shared library now."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "import sys\n",
 36 |     "import math\n",
 37 |     "import os\n",
 38 |     "import time\n",
 39 |     "from datetime import datetime\n",
 40 |     "from pynq import Overlay\n",
 41 |     "from preprocessing import *\n",
 42 |     "from iou import *\n",
 43 |     "from cffi import FFI\n",
 44 |     "\n",
 45 |     "team = 'TGIIF'\n",
 46 |     "agent = Agent(team)\n",
 47 |     "print(\"Team created\")\n",
 48 |     "\n",
 49 |     "ffi = FFI()\n",
 50 |     "ffi.cdef('''\n",
 51 |     "typedef struct {\n",
 52 |     "    int label;\n",
 53 |     "    int xmin;\n",
 54 |     "    int xmax;\n",
 55 |     "    int ymin;\n",
 56 |     "    int ymax;\n",
 57 |     "    float confidence;\n",
 58 |     "} result_t;\n",
 59 |     "''')\n",
 60 |     "\n",
 61 |     "ffi.cdef('''\n",
 62 |     "void dpu_initialize(char *lib_path);\n",
 63 |     "result_t dpu_detect_single(char *path);\n",
 64 |     "void dpu_detect_list(char *, unsigned);\n",
 65 |     "void dpu_clear(void);\n",
 66 |     "void dpu_destroy(void);\n",
 67 |     "result_t *dpu_get_results(void);\n",
 68 |     "''')\n",
 69 |     "\n",
 70 |     "lib_path = os.path.join(os.getcwd(), \"libraries/libssd.so\")\n",
 71 |     "dpu_lib = ffi.dlopen(lib_path)\n",
 72 |     "\n",
 73 |     "c_lib_path = ffi.new(\"char []\", lib_path.encode())\n",
 74 |     "print(\"Lib opened:\", lib_path)\n",
 75 |     "\n",
 76 |     "c_results = ffi.new(\"result_t *\")"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "## Overlay loading\n",
 84 |     "\n",
 85 |     "The bitstream file is loaded to the PL side of PYNQ."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "OVERLAY_PATH = os.path.join(OVERLAY_DIR, \"TGIIF/pynq_dpu_142m.bit\")\n",
 95 |     "overlay = Overlay(OVERLAY_PATH)\n",
 96 |     "print(\"Overlay loaded: {}\".format(OVERLAY_PATH))"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "## Image processing\n",
104 |     "\n",
105 |     "This step we process the images using the declared functions. We need to all `dpu_initialize` to get DPU ready. To obtain the best performance, `dpu_detect_list` is called for processing images. The API accepts the name of a text file which contains the list of image paths as the argument and returns an array of results. To detect images one by one, you could use `dpu_detect_single`."
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {
112 |     "scrolled": true
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "interval_time = 0\n",
117 |     "total_time = 0\n",
118 |     "total_num_img = len(agent.img_list)\n",
119 |     "result = list()\n",
120 |     "agent.reset_batch_count()\n",
121 |     "\n",
122 |     "# Initialize DPU\n",
123 |     "dpu_lib.dpu_initialize(c_lib_path)\n",
124 |     "print(\"DPU initialized\")\n",
125 |     "\n",
126 |     "# Start processing\n",
127 |     "result_records = []\n",
128 |     "for i in range(math.ceil(total_num_img/BATCH_SIZE)):\n",
129 |     "    # get a batch from agent\n",
130 |     "    batch = agent.send(interval_time, agent.img_batch)\n",
131 |     "    \n",
132 |     "    # timer starts\n",
133 |     "    start = time.time()\n",
134 |     "    with open(agent.coord_team + \"/imgs.txt\", 'w') as fimg:\n",
135 |     "        fimg.write(IMG_DIR+\"\\n\")\n",
136 |     "        for img in batch:\n",
137 |     "            fimg.write(img+'\\n')\n",
138 |     "    print(\"Image list created\")\n",
139 |     "    \n",
140 |     "    c_imgs_file = ffi.new(\"char []\", (agent.coord_team+\"/imgs.txt\").encode())\n",
141 |     "    c_img_num = ffi.new(\"unsigned *\")\n",
142 |     "    c_img_num[0] = BATCH_SIZE\n",
143 |     "    dpu_lib.dpu_detect_list(c_imgs_file, c_img_num[0])\n",
144 |     "    c_results = dpu_lib.dpu_get_results()\n",
145 |     "    print(\"Current batch processed\")\n",
146 |     "    \n",
147 |     "    result_records += [[c_results[j].xmin, c_results[j].xmax, c_results[j].ymin, c_results[j].ymax,\n",
148 |     "                        c_results[j].confidence, c_results[j].label] for j in range(c_img_num[0])]\n",
149 |     "        \n",
150 |     "    # timer stop after PS has received image\n",
151 |     "    end = time.time()\n",
152 |     "    t = end - start\n",
153 |     "    print('Processing time: {} seconds.'.format(t))\n",
154 |     "    total_time += t"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {},
160 |    "source": [
161 |     "## Results storing\n",
162 |     "\n",
163 |     "Detection results are stored into xml files."
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "# Write misc info\n",
173 |     "agent.write(total_time, total_num_img, team)\n",
174 |     "\n",
175 |     "# Write detection results into xml files\n",
176 |     "agent.save_results_xml(result_records)\n",
177 |     "print(\"XML results written successfully.\")"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "## Cleaning up\n",
185 |     "\n",
186 |     "`dpu_destroy` is called to release the system resources and make DPU idle. To start DPU again, you can call `dpu_intialize` later on."
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "dpu_lib.dpu_destroy()"
196 |    ]
197 |   }
198 |  ],
199 |  "metadata": {
200 |   "kernelspec": {
201 |    "display_name": "Python 3",
202 |    "language": "python",
203 |    "name": "python3"
204 |   },
205 |   "language_info": {
206 |    "codemirror_mode": {
207 |     "name": "ipython",
208 |     "version": 3
209 |    },
210 |    "file_extension": ".py",
211 |    "mimetype": "text/x-python",
212 |    "name": "python",
213 |    "nbconvert_exporter": "python",
214 |    "pygments_lexer": "ipython3",
215 |    "version": "3.6.3"
216 |   }
217 |  },
218 |  "nbformat": 4,
219 |  "nbformat_minor": 2
220 | }
221 | 


--------------------------------------------------------------------------------
/TGIIF/images/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/0.jpg


--------------------------------------------------------------------------------
/TGIIF/images/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/1.jpg


--------------------------------------------------------------------------------
/TGIIF/images/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/2.jpg


--------------------------------------------------------------------------------
/TGIIF/images/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/3.jpg


--------------------------------------------------------------------------------
/TGIIF/images/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/4.jpg


--------------------------------------------------------------------------------
/TGIIF/images/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/5.jpg


--------------------------------------------------------------------------------
/TGIIF/images/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/6.jpg


--------------------------------------------------------------------------------
/TGIIF/images/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/7.jpg


--------------------------------------------------------------------------------
/TGIIF/images/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/8.jpg


--------------------------------------------------------------------------------
/TGIIF/images/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/images/9.jpg


--------------------------------------------------------------------------------
/TGIIF/images/list.txt:
--------------------------------------------------------------------------------
 1 | 9.jpg
 2 | 8.jpg
 3 | 7.jpg
 4 | 6.jpg
 5 | 5.jpg
 6 | 4.jpg
 7 | 3.jpg
 8 | 2.jpg
 9 | 1.jpg
10 | 0.jpg
11 | 


--------------------------------------------------------------------------------
/TGIIF/iou.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import xml.dom.minidom as dom
 3 | 
 4 | XML_PATH = "/home/xilinx/jupyter_notebooks/dac_2018/xml"
 5 | 
 6 | # bbox is in format of [xmin, ymin, xmax, ymax]
 7 | def ground_truth(index):
 8 |     doc = dom.parse(os.path.join(XML_PATH, "{}.xml".format(index)))
 9 |     coor = []
10 |     coor.append(doc.getElementsByTagName("xmin")[0].childNodes[0].data)
11 |     coor.append(doc.getElementsByTagName("xmax")[0].childNodes[0].data)
12 |     coor.append(doc.getElementsByTagName("ymin")[0].childNodes[0].data)
13 |     coor.append(doc.getElementsByTagName("ymax")[0].childNodes[0].data)
14 |     coor = [int(c) for c in coor]
15 |     return coor
16 | 
17 | def iou(detect, truth):
18 |     area_detect = (detect[1]-detect[0]+1)*(detect[3]-detect[2]+1)
19 |     area_truth = (truth[1]-truth[0]+1)*(truth[3]-truth[2]+1)
20 |     overlap_coor = [max(detect[0], truth[0]), min(detect[1], truth[1]),
21 |                     max(detect[2], truth[2]), min(detect[3], truth[3])]
22 |     if (overlap_coor[0] >= overlap_coor[1] or overlap_coor[2] >= overlap_coor[3]):
23 |         return 0
24 |     else:
25 |         area_overlap = (overlap_coor[1]-overlap_coor[0]+1)*(overlap_coor[3]-overlap_coor[2]+1)
26 |         assert(area_overlap > 0)
27 |         return float(area_overlap) / (area_detect + area_truth - area_overlap) 
28 | 


--------------------------------------------------------------------------------
/TGIIF/libraries/libssd.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/TGIIF/libraries/libssd.so


--------------------------------------------------------------------------------
/TGIIF/preprocessing.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import xml.dom.minidom
  4 | 
  5 | 
  6 | BATCH_SIZE = 5
  7 | CUR_DIR = os.getcwd()
  8 | DAC_CONTEST = os.path.join(CUR_DIR, '../')
  9 | IMG_DIR = os.path.join(CUR_DIR, './images')
 10 | OVERLAY_DIR = os.path.join(DAC_CONTEST, './overlay')
 11 | RESULT = os.path.join(CUR_DIR, './result')
 12 | TIME_DIR = os.path.join(RESULT, './time')
 13 | COORD_DIR = os.path.join(RESULT, './coordinate')
 14 | XML_PATH = os.path.join(RESULT, './xml')
 15 | 
 16 |     
 17 | # Get image name list
 18 | def get_image_names():
 19 |     names_temp = [f for f in os.listdir(IMG_DIR) if f.endswith('.jpg')]
 20 |     names_temp.sort(key= lambda x:int(x[:-4]))
 21 |     return names_temp
 22 | 
 23 | 
 24 | # Process the images in batches, may help when write to XML
 25 | def get_image_batch():
 26 |     image_list = get_image_names()
 27 |     batches = list()
 28 |     for i in range(0, len(image_list), BATCH_SIZE):
 29 |         batches.append(image_list[i:i+BATCH_SIZE])
 30 |     return batches
 31 | 
 32 | 
 33 | # Get image paths in batches
 34 | def get_image_path(image_name):
 35 |     return os.path.join(IMG_DIR, image_name)
 36 | 
 37 | 
 38 | # Return a batch of image dir  when `send` is called
 39 | class Agent:
 40 |     def __init__(self, teamname):
 41 |         self.batch_count = 0
 42 |         self.dac_contest = DAC_CONTEST
 43 |         self.img_dir = IMG_DIR
 44 |         self.overlay_dir = OVERLAY_DIR
 45 |         self.overlay_dir_team = OVERLAY_DIR + '/' + teamname
 46 |         self.result = RESULT
 47 |         self.time_dir = TIME_DIR
 48 |         self.coord_dir = COORD_DIR
 49 |         self.xml_path = XML_PATH
 50 |         self.coord_team = COORD_DIR + '/' + teamname
 51 |         self.xml_team = XML_PATH + '/' + teamname
 52 |         self.contestant = DAC_CONTEST + '/' + teamname
 53 |         folder_list = [self.dac_contest, self.img_dir, self.overlay_dir,
 54 |                        self.overlay_dir_team,
 55 |                        self.result,
 56 |                        self.time_dir, self.coord_dir, self.xml_path,
 57 |                        self.coord_team, self.xml_team, self.contestant]
 58 |         for folder in folder_list:
 59 |             if not os.path.isdir(folder):
 60 |                 os.mkdir(folder)
 61 |         self.img_list = get_image_names()
 62 |         self.img_batch = get_image_batch()
 63 | 
 64 |     def send(self, interval_time, batches):
 65 |         time.sleep(interval_time)
 66 |         tmp = batches[self.batch_count]
 67 |         self.batch_count += 1
 68 |         return tmp
 69 | 
 70 |     def reset_batch_count(self):
 71 |         self.batch_count = 0
 72 | 
 73 |     def write(self, t_batch, total_img, teamname):
 74 |         fps = total_img / t_batch
 75 |         with open(self.time_dir + '/' + teamname + '.txt', 'a+') as f:
 76 |             f.write("\n" + teamname + " Frames per second: " +
 77 |                     str(fps) + '\n')
 78 | 
 79 |     def save_results_xml(self, result_rectangle):
 80 |         if len(result_rectangle) != len(self.img_list):
 81 |             raise ValueError("Result length not equal to number of images.")
 82 |         for i in range(len(self.img_list)):
 83 |             doc = xml.dom.minidom.Document()
 84 |             root = doc.createElement('annotation')
 85 | 
 86 |             doc.appendChild(root)
 87 |             name_e = doc.createElement('filename')
 88 |             name_t = doc.createTextNode(self.img_list[i])
 89 |             name_e.appendChild(name_t)
 90 |             root.appendChild(name_e)
 91 | 
 92 |             size_e = doc.createElement('size')
 93 |             node_width = doc.createElement('width')
 94 |             node_width.appendChild(doc.createTextNode("640"))
 95 |             node_length = doc.createElement('length')
 96 |             node_length.appendChild(doc.createTextNode("360"))
 97 |             size_e.appendChild(node_width)
 98 |             size_e.appendChild(node_length)
 99 |             root.appendChild(size_e)
100 | 
101 |             object_node = doc.createElement('object')
102 |             node_name = doc.createElement('name')
103 |             node_name.appendChild(doc.createTextNode("NotCare"))
104 |             node_bnd_box = doc.createElement('bndbox')
105 |             node_bnd_box_xmin = doc.createElement('xmin')
106 |             node_bnd_box_xmin.appendChild(
107 |                 doc.createTextNode(str(result_rectangle[i][0])))
108 |             node_bnd_box_xmax = doc.createElement('xmax')
109 |             node_bnd_box_xmax.appendChild(
110 |                 doc.createTextNode(str(result_rectangle[i][1])))
111 |             node_bnd_box_ymin = doc.createElement('ymin')
112 |             node_bnd_box_ymin.appendChild(
113 |                 doc.createTextNode(str(result_rectangle[i][2])))
114 |             node_bnd_box_ymax = doc.createElement('ymax')
115 |             node_bnd_box_ymax.appendChild(
116 |                 doc.createTextNode(str(result_rectangle[i][3])))
117 |             node_bnd_box.appendChild(node_bnd_box_xmin)
118 |             node_bnd_box.appendChild(node_bnd_box_xmax)
119 |             node_bnd_box.appendChild(node_bnd_box_ymin)
120 |             node_bnd_box.appendChild(node_bnd_box_ymax)
121 | 
122 |             object_node.appendChild(node_name)
123 |             object_node.appendChild(node_bnd_box)
124 |             root.appendChild(object_node)
125 | 
126 |             file_name = self.img_list[i].replace('jpg', 'xml')
127 |             with open(self.xml_team + "/" + file_name, 'w') as fp:
128 |                 doc.writexml(fp, indent='\t', addindent='\t',
129 |                              newl='\n', encoding="utf-8")
130 | 


--------------------------------------------------------------------------------
/block_design/README.md:
--------------------------------------------------------------------------------
1 | Note: Since the DPU RTL IP is of DeePhi(Now part of Xilinx), you could only test our design with the bitstream right now. The tcl file is use for loading the overlay of the PYNQ-Z1.
2 | 
3 | To access the evaluation of DeePhi DPU IP, please refer to the dnndk@deephi.tech and www.deephi.com for further assistance.
4 | 


--------------------------------------------------------------------------------
/image/Readme.md:
--------------------------------------------------------------------------------
1 | This repo contains the images for README.
2 | 


--------------------------------------------------------------------------------
/image/bd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/bd.png


--------------------------------------------------------------------------------
/image/dnndk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/dnndk.png


--------------------------------------------------------------------------------
/image/dpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/dpu.png


--------------------------------------------------------------------------------
/image/multithread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/multithread.png


--------------------------------------------------------------------------------
/image/ssd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/ssd.png


--------------------------------------------------------------------------------
/image/training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/image/training.png


--------------------------------------------------------------------------------
/model/caffe_model/deploy.caffemodel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/model/caffe_model/deploy.caffemodel


--------------------------------------------------------------------------------
/model/caffe_model/deploy_hardware.prototxt:
--------------------------------------------------------------------------------
   1 | layer {
   2 |   name: "data"
   3 |   type: "Input"
   4 |   top: "data"
   5 |   transform_param {
   6 |     mean_value: 104
   7 |     mean_value: 117
   8 |     mean_value: 123
   9 |     resize_param {
  10 |       prob: 1
  11 |       resize_mode: WARP
  12 |       height: 252
  13 |       width: 448
  14 |       interp_mode: LINEAR
  15 |     }
  16 |   }
  17 |   input_param {
  18 |     shape {
  19 |       dim: 1
  20 |       dim: 3
  21 |       dim: 252
  22 |       dim: 448
  23 |     }
  24 |   }
  25 | }
  26 | layer {
  27 |   name: "conv1_1"
  28 |   type: "Convolution"
  29 |   bottom: "data"
  30 |   top: "conv1_1"
  31 |   phase: TRAIN
  32 |   convolution_param {
  33 |     num_output: 8
  34 |     bias_term: true
  35 |     pad: 1
  36 |     kernel_size: 3
  37 |     weight_filler {
  38 |       type: "msra"
  39 |     }
  40 |     bias_filler {
  41 |       type: "constant"
  42 |       value: 0
  43 |     }
  44 |   }
  45 | }
  46 | layer {
  47 |   name: "relu1_1"
  48 |   type: "ReLU"
  49 |   bottom: "conv1_1"
  50 |   top: "conv1_1"
  51 |   phase: TRAIN
  52 | }
  53 | layer {
  54 |   name: "conv1_2"
  55 |   type: "Convolution"
  56 |   bottom: "conv1_1"
  57 |   top: "conv1_2"
  58 |   phase: TRAIN
  59 |   convolution_param {
  60 |     num_output: 26
  61 |     bias_term: true
  62 |     pad: 1
  63 |     kernel_size: 3
  64 |     weight_filler {
  65 |       type: "msra"
  66 |     }
  67 |     bias_filler {
  68 |       type: "constant"
  69 |       value: 0
  70 |     }
  71 |   }
  72 | }
  73 | layer {
  74 |   name: "relu1_2"
  75 |   type: "ReLU"
  76 |   bottom: "conv1_2"
  77 |   top: "conv1_2"
  78 |   phase: TRAIN
  79 | }
  80 | layer {
  81 |   name: "pool1"
  82 |   type: "Pooling"
  83 |   bottom: "conv1_2"
  84 |   top: "pool1"
  85 |   phase: TRAIN
  86 |   pooling_param {
  87 |     pool: MAX
  88 |     kernel_size: 2
  89 |     stride: 2
  90 |   }
  91 | }
  92 | layer {
  93 |   name: "conv2_1"
  94 |   type: "Convolution"
  95 |   bottom: "pool1"
  96 |   top: "conv2_1"
  97 |   phase: TRAIN
  98 |   convolution_param {
  99 |     num_output: 40
 100 |     bias_term: true
 101 |     pad: 1
 102 |     kernel_size: 3
 103 |     weight_filler {
 104 |       type: "msra"
 105 |     }
 106 |     bias_filler {
 107 |       type: "constant"
 108 |       value: 0
 109 |     }
 110 |   }
 111 | }
 112 | layer {
 113 |   name: "relu2_1"
 114 |   type: "ReLU"
 115 |   bottom: "conv2_1"
 116 |   top: "conv2_1"
 117 |   phase: TRAIN
 118 | }
 119 | layer {
 120 |   name: "conv2_2"
 121 |   type: "Convolution"
 122 |   bottom: "conv2_1"
 123 |   top: "conv2_2"
 124 |   phase: TRAIN
 125 |   convolution_param {
 126 |     num_output: 52
 127 |     bias_term: true
 128 |     pad: 1
 129 |     kernel_size: 3
 130 |     weight_filler {
 131 |       type: "msra"
 132 |     }
 133 |     bias_filler {
 134 |       type: "constant"
 135 |       value: 0
 136 |     }
 137 |   }
 138 | }
 139 | layer {
 140 |   name: "relu2_2"
 141 |   type: "ReLU"
 142 |   bottom: "conv2_2"
 143 |   top: "conv2_2"
 144 |   phase: TRAIN
 145 | }
 146 | layer {
 147 |   name: "pool2"
 148 |   type: "Pooling"
 149 |   bottom: "conv2_2"
 150 |   top: "pool2"
 151 |   phase: TRAIN
 152 |   pooling_param {
 153 |     pool: MAX
 154 |     kernel_size: 2
 155 |     stride: 2
 156 |   }
 157 | }
 158 | layer {
 159 |   name: "conv3_1"
 160 |   type: "Convolution"
 161 |   bottom: "pool2"
 162 |   top: "conv3_1"
 163 |   phase: TRAIN
 164 |   convolution_param {
 165 |     num_output: 78
 166 |     bias_term: true
 167 |     pad: 1
 168 |     kernel_size: 3
 169 |     weight_filler {
 170 |       type: "msra"
 171 |     }
 172 |     bias_filler {
 173 |       type: "constant"
 174 |       value: 0
 175 |     }
 176 |   }
 177 | }
 178 | layer {
 179 |   name: "relu3_1"
 180 |   type: "ReLU"
 181 |   bottom: "conv3_1"
 182 |   top: "conv3_1"
 183 |   phase: TRAIN
 184 | }
 185 | layer {
 186 |   name: "conv3_2"
 187 |   type: "Convolution"
 188 |   bottom: "conv3_1"
 189 |   top: "conv3_2"
 190 |   phase: TRAIN
 191 |   convolution_param {
 192 |     num_output: 78
 193 |     bias_term: true
 194 |     pad: 1
 195 |     kernel_size: 3
 196 |     weight_filler {
 197 |       type: "msra"
 198 |     }
 199 |     bias_filler {
 200 |       type: "constant"
 201 |       value: 0
 202 |     }
 203 |   }
 204 | }
 205 | layer {
 206 |   name: "relu3_2"
 207 |   type: "ReLU"
 208 |   bottom: "conv3_2"
 209 |   top: "conv3_2"
 210 |   phase: TRAIN
 211 | }
 212 | layer {
 213 |   name: "conv3_3"
 214 |   type: "Convolution"
 215 |   bottom: "conv3_2"
 216 |   top: "conv3_3"
 217 |   phase: TRAIN
 218 |   convolution_param {
 219 |     num_output: 104
 220 |     bias_term: true
 221 |     pad: 1
 222 |     kernel_size: 3
 223 |     weight_filler {
 224 |       type: "msra"
 225 |     }
 226 |     bias_filler {
 227 |       type: "constant"
 228 |       value: 0
 229 |     }
 230 |   }
 231 | }
 232 | layer {
 233 |   name: "relu3_3"
 234 |   type: "ReLU"
 235 |   bottom: "conv3_3"
 236 |   top: "conv3_3"
 237 |   phase: TRAIN
 238 | }
 239 | layer {
 240 |   name: "pool3"
 241 |   type: "Pooling"
 242 |   bottom: "conv3_3"
 243 |   top: "pool3"
 244 |   phase: TRAIN
 245 |   pooling_param {
 246 |     pool: MAX
 247 |     kernel_size: 2
 248 |     stride: 2
 249 |   }
 250 | }
 251 | layer {
 252 |   name: "conv4_1"
 253 |   type: "Convolution"
 254 |   bottom: "pool3"
 255 |   top: "conv4_1"
 256 |   phase: TRAIN
 257 |   convolution_param {
 258 |     num_output: 206
 259 |     bias_term: true
 260 |     pad: 1
 261 |     kernel_size: 3
 262 |     weight_filler {
 263 |       type: "msra"
 264 |     }
 265 |     bias_filler {
 266 |       type: "constant"
 267 |       value: 0
 268 |     }
 269 |   }
 270 | }
 271 | layer {
 272 |   name: "relu4_1"
 273 |   type: "ReLU"
 274 |   bottom: "conv4_1"
 275 |   top: "conv4_1"
 276 |   phase: TRAIN
 277 | }
 278 | layer {
 279 |   name: "conv4_2"
 280 |   type: "Convolution"
 281 |   bottom: "conv4_1"
 282 |   top: "conv4_2"
 283 |   phase: TRAIN
 284 |   convolution_param {
 285 |     num_output: 154
 286 |     bias_term: true
 287 |     pad: 1
 288 |     kernel_size: 3
 289 |     weight_filler {
 290 |       type: "msra"
 291 |     }
 292 |     bias_filler {
 293 |       type: "constant"
 294 |       value: 0
 295 |     }
 296 |   }
 297 | }
 298 | layer {
 299 |   name: "relu4_2"
 300 |   type: "ReLU"
 301 |   bottom: "conv4_2"
 302 |   top: "conv4_2"
 303 |   phase: TRAIN
 304 | }
 305 | layer {
 306 |   name: "conv4_3"
 307 |   type: "Convolution"
 308 |   bottom: "conv4_2"
 309 |   top: "conv4_3"
 310 |   phase: TRAIN
 311 |   convolution_param {
 312 |     num_output: 104
 313 |     bias_term: true
 314 |     pad: 1
 315 |     kernel_size: 3
 316 |     weight_filler {
 317 |       type: "msra"
 318 |     }
 319 |     bias_filler {
 320 |       type: "constant"
 321 |       value: 0
 322 |     }
 323 |   }
 324 | }
 325 | layer {
 326 |   name: "relu4_3"
 327 |   type: "ReLU"
 328 |   bottom: "conv4_3"
 329 |   top: "conv4_3"
 330 |   phase: TRAIN
 331 | }
 332 | layer {
 333 |   name: "pool4"
 334 |   type: "Pooling"
 335 |   bottom: "conv4_3"
 336 |   top: "pool4"
 337 |   phase: TRAIN
 338 |   pooling_param {
 339 |     pool: MAX
 340 |     kernel_size: 2
 341 |     stride: 2
 342 |   }
 343 | }
 344 | layer {
 345 |   name: "conv5_1"
 346 |   type: "Convolution"
 347 |   bottom: "pool4"
 348 |   top: "conv5_1"
 349 |   phase: TRAIN
 350 |   convolution_param {
 351 |     num_output: 52
 352 |     bias_term: true
 353 |     pad: 1
 354 |     kernel_size: 3
 355 |     weight_filler {
 356 |       type: "msra"
 357 |     }
 358 |     bias_filler {
 359 |       type: "constant"
 360 |       value: 0
 361 |     }
 362 |   }
 363 | }
 364 | layer {
 365 |   name: "relu5_1"
 366 |   type: "ReLU"
 367 |   bottom: "conv5_1"
 368 |   top: "conv5_1"
 369 |   phase: TRAIN
 370 | }
 371 | layer {
 372 |   name: "conv5_2"
 373 |   type: "Convolution"
 374 |   bottom: "conv5_1"
 375 |   top: "conv5_2"
 376 |   phase: TRAIN
 377 |   convolution_param {
 378 |     num_output: 52
 379 |     bias_term: true
 380 |     pad: 1
 381 |     kernel_size: 3
 382 |     weight_filler {
 383 |       type: "msra"
 384 |     }
 385 |     bias_filler {
 386 |       type: "constant"
 387 |       value: 0
 388 |     }
 389 |   }
 390 | }
 391 | layer {
 392 |   name: "relu5_2"
 393 |   type: "ReLU"
 394 |   bottom: "conv5_2"
 395 |   top: "conv5_2"
 396 |   phase: TRAIN
 397 | }
 398 | layer {
 399 |   name: "conv5_3"
 400 |   type: "Convolution"
 401 |   bottom: "conv5_2"
 402 |   top: "conv5_3"
 403 |   phase: TRAIN
 404 |   convolution_param {
 405 |     num_output: 52
 406 |     bias_term: true
 407 |     pad: 1
 408 |     kernel_size: 3
 409 |     weight_filler {
 410 |       type: "msra"
 411 |     }
 412 |     bias_filler {
 413 |       type: "constant"
 414 |       value: 0
 415 |     }
 416 |   }
 417 | }
 418 | layer {
 419 |   name: "relu5_3"
 420 |   type: "ReLU"
 421 |   bottom: "conv5_3"
 422 |   top: "conv5_3"
 423 |   phase: TRAIN
 424 | }
 425 | layer {
 426 |   name: "pool5"
 427 |   type: "Pooling"
 428 |   bottom: "conv5_3"
 429 |   top: "pool5"
 430 |   phase: TRAIN
 431 |   pooling_param {
 432 |     pool: MAX
 433 |     kernel_size: 3
 434 |     stride: 1
 435 |     pad: 1
 436 |   }
 437 | }
 438 | layer {
 439 |   name: "fc6_1"
 440 |   type: "Convolution"
 441 |   bottom: "pool5"
 442 |   top: "fc6"
 443 |   phase: TRAIN
 444 |   convolution_param {
 445 |     num_output: 104
 446 |     bias_term: true
 447 |     pad: 1
 448 |     kernel_size: 3
 449 |     weight_filler {
 450 |       type: "msra"
 451 |     }
 452 |     bias_filler {
 453 |       type: "constant"
 454 |       value: 0
 455 |     }
 456 |   }
 457 | }
 458 | layer {
 459 |   name: "relu6"
 460 |   type: "ReLU"
 461 |   bottom: "fc6"
 462 |   top: "fc6"
 463 |   phase: TRAIN
 464 | }
 465 | layer {
 466 |   name: "fc7_2"
 467 |   type: "Convolution"
 468 |   bottom: "fc6"
 469 |   top: "fc7"
 470 |   phase: TRAIN
 471 |   convolution_param {
 472 |     num_output: 104
 473 |     bias_term: true
 474 |     kernel_size: 1
 475 |     weight_filler {
 476 |       type: "msra"
 477 |     }
 478 |     bias_filler {
 479 |       type: "constant"
 480 |       value: 0
 481 |     }
 482 |   }
 483 | }
 484 | layer {
 485 |   name: "relu7"
 486 |   type: "ReLU"
 487 |   bottom: "fc7"
 488 |   top: "fc7"
 489 |   phase: TRAIN
 490 | }
 491 | layer {
 492 |   name: "conv6_1"
 493 |   type: "Convolution"
 494 |   bottom: "fc7"
 495 |   top: "conv6_1"
 496 |   phase: TRAIN
 497 |   convolution_param {
 498 |     num_output: 26
 499 |     bias_term: true
 500 |     pad: 0
 501 |     kernel_size: 1
 502 |     stride: 1
 503 |     weight_filler {
 504 |       type: "msra"
 505 |     }
 506 |     bias_filler {
 507 |       type: "constant"
 508 |       value: 0
 509 |     }
 510 |   }
 511 | }
 512 | layer {
 513 |   name: "conv6_1_relu"
 514 |   type: "ReLU"
 515 |   bottom: "conv6_1"
 516 |   top: "conv6_1"
 517 |   phase: TRAIN
 518 | }
 519 | layer {
 520 |   name: "conv6_2"
 521 |   type: "Convolution"
 522 |   bottom: "conv6_1"
 523 |   top: "conv6_2"
 524 |   phase: TRAIN
 525 |   convolution_param {
 526 |     num_output: 52
 527 |     bias_term: true
 528 |     pad: 1
 529 |     kernel_size: 3
 530 |     stride: 2
 531 |     weight_filler {
 532 |       type: "msra"
 533 |     }
 534 |     bias_filler {
 535 |       type: "constant"
 536 |       value: 0
 537 |     }
 538 |   }
 539 | }
 540 | layer {
 541 |   name: "conv6_2_relu"
 542 |   type: "ReLU"
 543 |   bottom: "conv6_2"
 544 |   top: "conv6_2"
 545 |   phase: TRAIN
 546 | }
 547 | layer {
 548 |   name: "conv7_1"
 549 |   type: "Convolution"
 550 |   bottom: "conv6_2"
 551 |   top: "conv7_1"
 552 |   phase: TRAIN
 553 |   convolution_param {
 554 |     num_output: 14
 555 |     bias_term: true
 556 |     pad: 0
 557 |     kernel_size: 1
 558 |     stride: 1
 559 |     weight_filler {
 560 |       type: "msra"
 561 |     }
 562 |     bias_filler {
 563 |       type: "constant"
 564 |       value: 0
 565 |     }
 566 |   }
 567 | }
 568 | layer {
 569 |   name: "conv7_1_relu"
 570 |   type: "ReLU"
 571 |   bottom: "conv7_1"
 572 |   top: "conv7_1"
 573 |   phase: TRAIN
 574 | }
 575 | layer {
 576 |   name: "conv7_2"
 577 |   type: "Convolution"
 578 |   bottom: "conv7_1"
 579 |   top: "conv7_2"
 580 |   phase: TRAIN
 581 |   convolution_param {
 582 |     num_output: 26
 583 |     bias_term: true
 584 |     pad: 1
 585 |     kernel_size: 3
 586 |     stride: 2
 587 |     weight_filler {
 588 |       type: "msra"
 589 |     }
 590 |     bias_filler {
 591 |       type: "constant"
 592 |       value: 0
 593 |     }
 594 |   }
 595 | }
 596 | layer {
 597 |   name: "conv7_2_relu"
 598 |   type: "ReLU"
 599 |   bottom: "conv7_2"
 600 |   top: "conv7_2"
 601 |   phase: TRAIN
 602 | }
 603 | layer {
 604 |   name: "conv4_3_norm_mbox_loc"
 605 |   type: "Convolution"
 606 |   bottom: "conv4_3"
 607 |   top: "conv4_3_norm_mbox_loc"
 608 |   phase: TRAIN
 609 |   convolution_param {
 610 |     num_output: 16
 611 |     bias_term: true
 612 |     pad: 1
 613 |     kernel_size: 3
 614 |     stride: 1
 615 |     weight_filler {
 616 |       type: "msra"
 617 |     }
 618 |     bias_filler {
 619 |       type: "constant"
 620 |       value: 0
 621 |     }
 622 |   }
 623 | }
 624 | layer {
 625 |   name: "conv4_3_norm_mbox_loc_perm"
 626 |   type: "Permute"
 627 |   bottom: "conv4_3_norm_mbox_loc"
 628 |   top: "conv4_3_norm_mbox_loc_perm"
 629 |   phase: TRAIN
 630 |   permute_param {
 631 |     order: 0
 632 |     order: 2
 633 |     order: 3
 634 |     order: 1
 635 |   }
 636 | }
 637 | layer {
 638 |   name: "conv4_3_norm_mbox_loc_flat"
 639 |   type: "Flatten"
 640 |   bottom: "conv4_3_norm_mbox_loc_perm"
 641 |   top: "conv4_3_norm_mbox_loc_flat"
 642 |   phase: TRAIN
 643 |   flatten_param {
 644 |     axis: 1
 645 |   }
 646 | }
 647 | layer {
 648 |   name: "conv4_3_norm_mbox_conf"
 649 |   type: "Convolution"
 650 |   bottom: "conv4_3"
 651 |   top: "conv4_3_norm_mbox_conf"
 652 |   phase: TRAIN
 653 |   convolution_param {
 654 |     num_output: 396
 655 |     bias_term: true
 656 |     pad: 1
 657 |     kernel_size: 3
 658 |     stride: 1
 659 |     weight_filler {
 660 |       type: "msra"
 661 |     }
 662 |     bias_filler {
 663 |       type: "constant"
 664 |       value: 0
 665 |     }
 666 |   }
 667 | }
 668 | layer {
 669 |   name: "conv4_3_norm_mbox_conf_perm"
 670 |   type: "Permute"
 671 |   bottom: "conv4_3_norm_mbox_conf"
 672 |   top: "conv4_3_norm_mbox_conf_perm"
 673 |   phase: TRAIN
 674 |   permute_param {
 675 |     order: 0
 676 |     order: 2
 677 |     order: 3
 678 |     order: 1
 679 |   }
 680 | }
 681 | layer {
 682 |   name: "conv4_3_norm_mbox_conf_flat"
 683 |   type: "Flatten"
 684 |   bottom: "conv4_3_norm_mbox_conf_perm"
 685 |   top: "conv4_3_norm_mbox_conf_flat"
 686 |   phase: TRAIN
 687 |   flatten_param {
 688 |     axis: 1
 689 |   }
 690 | }
 691 | layer {
 692 |   name: "conv4_3_norm_mbox_priorbox"
 693 |   type: "PriorBox"
 694 |   bottom: "conv4_3"
 695 |   bottom: "data"
 696 |   top: "conv4_3_norm_mbox_priorbox"
 697 |   phase: TRAIN
 698 |   prior_box_param {
 699 |     min_size: 30
 700 |     max_size: 66
 701 |     aspect_ratio: 2
 702 |     flip: true
 703 |     clip: false
 704 |     variance: 0.1
 705 |     variance: 0.1
 706 |     variance: 0.2
 707 |     variance: 0.2
 708 |     step: 8
 709 |     offset: 0.5
 710 |   }
 711 | }
 712 | layer {
 713 |   name: "fc7_mbox_loc"
 714 |   type: "Convolution"
 715 |   bottom: "fc7"
 716 |   top: "fc7_mbox_loc"
 717 |   phase: TRAIN
 718 |   convolution_param {
 719 |     num_output: 24
 720 |     bias_term: true
 721 |     pad: 1
 722 |     kernel_size: 3
 723 |     stride: 1
 724 |     weight_filler {
 725 |       type: "msra"
 726 |     }
 727 |     bias_filler {
 728 |       type: "constant"
 729 |       value: 0
 730 |     }
 731 |   }
 732 | }
 733 | layer {
 734 |   name: "fc7_mbox_loc_perm"
 735 |   type: "Permute"
 736 |   bottom: "fc7_mbox_loc"
 737 |   top: "fc7_mbox_loc_perm"
 738 |   phase: TRAIN
 739 |   permute_param {
 740 |     order: 0
 741 |     order: 2
 742 |     order: 3
 743 |     order: 1
 744 |   }
 745 | }
 746 | layer {
 747 |   name: "fc7_mbox_loc_flat"
 748 |   type: "Flatten"
 749 |   bottom: "fc7_mbox_loc_perm"
 750 |   top: "fc7_mbox_loc_flat"
 751 |   phase: TRAIN
 752 |   flatten_param {
 753 |     axis: 1
 754 |   }
 755 | }
 756 | layer {
 757 |   name: "fc7_mbox_conf"
 758 |   type: "Convolution"
 759 |   bottom: "fc7"
 760 |   top: "fc7_mbox_conf"
 761 |   phase: TRAIN
 762 |   convolution_param {
 763 |     num_output: 594
 764 |     bias_term: true
 765 |     pad: 1
 766 |     kernel_size: 3
 767 |     stride: 1
 768 |     weight_filler {
 769 |       type: "msra"
 770 |     }
 771 |     bias_filler {
 772 |       type: "constant"
 773 |       value: 0
 774 |     }
 775 |   }
 776 | }
 777 | layer {
 778 |   name: "fc7_mbox_conf_perm"
 779 |   type: "Permute"
 780 |   bottom: "fc7_mbox_conf"
 781 |   top: "fc7_mbox_conf_perm"
 782 |   phase: TRAIN
 783 |   permute_param {
 784 |     order: 0
 785 |     order: 2
 786 |     order: 3
 787 |     order: 1
 788 |   }
 789 | }
 790 | layer {
 791 |   name: "fc7_mbox_conf_flat"
 792 |   type: "Flatten"
 793 |   bottom: "fc7_mbox_conf_perm"
 794 |   top: "fc7_mbox_conf_flat"
 795 |   phase: TRAIN
 796 |   flatten_param {
 797 |     axis: 1
 798 |   }
 799 | }
 800 | layer {
 801 |   name: "fc7_mbox_priorbox"
 802 |   type: "PriorBox"
 803 |   bottom: "fc7"
 804 |   bottom: "data"
 805 |   top: "fc7_mbox_priorbox"
 806 |   phase: TRAIN
 807 |   prior_box_param {
 808 |     min_size: 66
 809 |     max_size: 127
 810 |     aspect_ratio: 2
 811 |     aspect_ratio: 3
 812 |     flip: true
 813 |     clip: false
 814 |     variance: 0.1
 815 |     variance: 0.1
 816 |     variance: 0.2
 817 |     variance: 0.2
 818 |     step: 16
 819 |     offset: 0.5
 820 |   }
 821 | }
 822 | layer {
 823 |   name: "conv6_2_mbox_loc"
 824 |   type: "Convolution"
 825 |   bottom: "conv6_2"
 826 |   top: "conv6_2_mbox_loc"
 827 |   phase: TRAIN
 828 |   convolution_param {
 829 |     num_output: 24
 830 |     bias_term: true
 831 |     pad: 1
 832 |     kernel_size: 3
 833 |     stride: 1
 834 |     weight_filler {
 835 |       type: "msra"
 836 |     }
 837 |     bias_filler {
 838 |       type: "constant"
 839 |       value: 0
 840 |     }
 841 |   }
 842 | }
 843 | layer {
 844 |   name: "conv6_2_mbox_loc_perm"
 845 |   type: "Permute"
 846 |   bottom: "conv6_2_mbox_loc"
 847 |   top: "conv6_2_mbox_loc_perm"
 848 |   phase: TRAIN
 849 |   permute_param {
 850 |     order: 0
 851 |     order: 2
 852 |     order: 3
 853 |     order: 1
 854 |   }
 855 | }
 856 | layer {
 857 |   name: "conv6_2_mbox_loc_flat"
 858 |   type: "Flatten"
 859 |   bottom: "conv6_2_mbox_loc_perm"
 860 |   top: "conv6_2_mbox_loc_flat"
 861 |   phase: TRAIN
 862 |   flatten_param {
 863 |     axis: 1
 864 |   }
 865 | }
 866 | layer {
 867 |   name: "conv6_2_mbox_conf"
 868 |   type: "Convolution"
 869 |   bottom: "conv6_2"
 870 |   top: "conv6_2_mbox_conf"
 871 |   phase: TRAIN
 872 |   convolution_param {
 873 |     num_output: 594
 874 |     bias_term: true
 875 |     pad: 1
 876 |     kernel_size: 3
 877 |     stride: 1
 878 |     weight_filler {
 879 |       type: "msra"
 880 |     }
 881 |     bias_filler {
 882 |       type: "constant"
 883 |       value: 0
 884 |     }
 885 |   }
 886 | }
 887 | layer {
 888 |   name: "conv6_2_mbox_conf_perm"
 889 |   type: "Permute"
 890 |   bottom: "conv6_2_mbox_conf"
 891 |   top: "conv6_2_mbox_conf_perm"
 892 |   phase: TRAIN
 893 |   permute_param {
 894 |     order: 0
 895 |     order: 2
 896 |     order: 3
 897 |     order: 1
 898 |   }
 899 | }
 900 | layer {
 901 |   name: "conv6_2_mbox_conf_flat"
 902 |   type: "Flatten"
 903 |   bottom: "conv6_2_mbox_conf_perm"
 904 |   top: "conv6_2_mbox_conf_flat"
 905 |   phase: TRAIN
 906 |   flatten_param {
 907 |     axis: 1
 908 |   }
 909 | }
 910 | layer {
 911 |   name: "conv6_2_mbox_priorbox"
 912 |   type: "PriorBox"
 913 |   bottom: "conv6_2"
 914 |   bottom: "data"
 915 |   top: "conv6_2_mbox_priorbox"
 916 |   phase: TRAIN
 917 |   prior_box_param {
 918 |     min_size: 127
 919 |     max_size: 188
 920 |     aspect_ratio: 2
 921 |     aspect_ratio: 3
 922 |     flip: true
 923 |     clip: false
 924 |     variance: 0.1
 925 |     variance: 0.1
 926 |     variance: 0.2
 927 |     variance: 0.2
 928 |     step: 32
 929 |     offset: 0.5
 930 |   }
 931 | }
 932 | layer {
 933 |   name: "conv7_2_mbox_loc"
 934 |   type: "Convolution"
 935 |   bottom: "conv7_2"
 936 |   top: "conv7_2_mbox_loc"
 937 |   phase: TRAIN
 938 |   convolution_param {
 939 |     num_output: 24
 940 |     bias_term: true
 941 |     pad: 1
 942 |     kernel_size: 3
 943 |     stride: 1
 944 |     weight_filler {
 945 |       type: "msra"
 946 |     }
 947 |     bias_filler {
 948 |       type: "constant"
 949 |       value: 0
 950 |     }
 951 |   }
 952 | }
 953 | layer {
 954 |   name: "conv7_2_mbox_loc_perm"
 955 |   type: "Permute"
 956 |   bottom: "conv7_2_mbox_loc"
 957 |   top: "conv7_2_mbox_loc_perm"
 958 |   phase: TRAIN
 959 |   permute_param {
 960 |     order: 0
 961 |     order: 2
 962 |     order: 3
 963 |     order: 1
 964 |   }
 965 | }
 966 | layer {
 967 |   name: "conv7_2_mbox_loc_flat"
 968 |   type: "Flatten"
 969 |   bottom: "conv7_2_mbox_loc_perm"
 970 |   top: "conv7_2_mbox_loc_flat"
 971 |   phase: TRAIN
 972 |   flatten_param {
 973 |     axis: 1
 974 |   }
 975 | }
 976 | layer {
 977 |   name: "conv7_2_mbox_conf"
 978 |   type: "Convolution"
 979 |   bottom: "conv7_2"
 980 |   top: "conv7_2_mbox_conf"
 981 |   phase: TRAIN
 982 |   convolution_param {
 983 |     num_output: 594
 984 |     bias_term: true
 985 |     pad: 1
 986 |     kernel_size: 3
 987 |     stride: 1
 988 |     weight_filler {
 989 |       type: "msra"
 990 |     }
 991 |     bias_filler {
 992 |       type: "constant"
 993 |       value: 0
 994 |     }
 995 |   }
 996 | }
 997 | layer {
 998 |   name: "conv7_2_mbox_conf_perm"
 999 |   type: "Permute"
1000 |   bottom: "conv7_2_mbox_conf"
1001 |   top: "conv7_2_mbox_conf_perm"
1002 |   phase: TRAIN
1003 |   permute_param {
1004 |     order: 0
1005 |     order: 2
1006 |     order: 3
1007 |     order: 1
1008 |   }
1009 | }
1010 | layer {
1011 |   name: "conv7_2_mbox_conf_flat"
1012 |   type: "Flatten"
1013 |   bottom: "conv7_2_mbox_conf_perm"
1014 |   top: "conv7_2_mbox_conf_flat"
1015 |   phase: TRAIN
1016 |   flatten_param {
1017 |     axis: 1
1018 |   }
1019 | }
1020 | layer {
1021 |   name: "conv7_2_mbox_priorbox"
1022 |   type: "PriorBox"
1023 |   bottom: "conv7_2"
1024 |   bottom: "data"
1025 |   top: "conv7_2_mbox_priorbox"
1026 |   phase: TRAIN
1027 |   prior_box_param {
1028 |     min_size: 188
1029 |     max_size: 249
1030 |     aspect_ratio: 2
1031 |     aspect_ratio: 3
1032 |     flip: true
1033 |     clip: false
1034 |     variance: 0.1
1035 |     variance: 0.1
1036 |     variance: 0.2
1037 |     variance: 0.2
1038 |     step: 64
1039 |     offset: 0.5
1040 |   }
1041 | }
1042 | layer {
1043 |   name: "mbox_loc"
1044 |   type: "Concat"
1045 |   bottom: "conv4_3_norm_mbox_loc_flat"
1046 |   bottom: "fc7_mbox_loc_flat"
1047 |   bottom: "conv6_2_mbox_loc_flat"
1048 |   bottom: "conv7_2_mbox_loc_flat"
1049 |   top: "mbox_loc"
1050 |   phase: TRAIN
1051 |   concat_param {
1052 |     axis: 1
1053 |   }
1054 | }
1055 | layer {
1056 |   name: "mbox_conf"
1057 |   type: "Concat"
1058 |   bottom: "conv4_3_norm_mbox_conf_flat"
1059 |   bottom: "fc7_mbox_conf_flat"
1060 |   bottom: "conv6_2_mbox_conf_flat"
1061 |   bottom: "conv7_2_mbox_conf_flat"
1062 |   top: "mbox_conf"
1063 |   phase: TRAIN
1064 |   concat_param {
1065 |     axis: 1
1066 |   }
1067 | }
1068 | layer {
1069 |   name: "mbox_priorbox"
1070 |   type: "Concat"
1071 |   bottom: "conv4_3_norm_mbox_priorbox"
1072 |   bottom: "fc7_mbox_priorbox"
1073 |   bottom: "conv6_2_mbox_priorbox"
1074 |   bottom: "conv7_2_mbox_priorbox"
1075 |   top: "mbox_priorbox"
1076 |   phase: TRAIN
1077 |   concat_param {
1078 |     axis: 2
1079 |   }
1080 | }
1081 | 


--------------------------------------------------------------------------------
/model/dpu_ssd.elf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/model/dpu_ssd.elf


--------------------------------------------------------------------------------
/model/dpu_ssd_backup.elf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/model/dpu_ssd_backup.elf


--------------------------------------------------------------------------------
/model/tgiif_map.txt:
--------------------------------------------------------------------------------
  1 | background
  2 | person13
  3 | car3
  4 | riding1
  5 | whale1
  6 | truck1
  7 | riding8
  8 | car15
  9 | paraglider1
 10 | car2
 11 | boat6
 12 | car17
 13 | person11
 14 | person6
 15 | wakeboard4
 16 | boat8
 17 | horseride1
 18 | riding13
 19 | person26
 20 | riding16
 21 | person27
 22 | car8
 23 | person7
 24 | car19
 25 | car5
 26 | riding5
 27 | person16
 28 | boat7
 29 | person9
 30 | wakeboard3
 31 | car24
 32 | car13
 33 | car1
 34 | drone1
 35 | car4
 36 | riding4
 37 | car21
 38 | person3
 39 | person19
 40 | person23
 41 | person12
 42 | drone2
 43 | person5
 44 | riding15
 45 | car22
 46 | car14
 47 | person20
 48 | person1
 49 | person29
 50 | person18
 51 | person22
 52 | riding10
 53 | drone3
 54 | boat3
 55 | person17
 56 | riding2
 57 | person21
 58 | group3
 59 | group2
 60 | riding6
 61 | building3
 62 | person25
 63 | riding12
 64 | building2
 65 | riding7
 66 | person14
 67 | person15
 68 | car20
 69 | drone4
 70 | boat1
 71 | boat2
 72 | boat4
 73 | person2
 74 | boat5
 75 | car11
 76 | person28
 77 | car9
 78 | truck2
 79 | riding11
 80 | car6
 81 | person10
 82 | car18
 83 | person24
 84 | riding9
 85 | riding14
 86 | car12
 87 | person8
 88 | car16
 89 | group1
 90 | car23
 91 | bird1
 92 | car7
 93 | wakeboard1
 94 | person4
 95 | car10
 96 | riding17
 97 | riding3
 98 | building1
 99 | wakeboard2
100 | 


--------------------------------------------------------------------------------
/overlay/TGIIF/pynq_dpu_142m.bit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/overlay/TGIIF/pynq_dpu_142m.bit


--------------------------------------------------------------------------------
/prerequisites/README.md:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | 
 3 | Please follow the steps below to modify the boot setting and install the dnndk package on PYNQ-Z1. It has been tested on PYNQ-Z1 Image v2.1. 
 4 | 
 5 | - Copy and change the `devicetree.dtb` in the SD\_CARD.
 6 | 
 7 | - Stop the autostart at the booting stage, and then type the cmds below:
 8 | ```sh     
 9 | setenv bootargs 'console=ttyPS0,115200 mem=256M root=/dev/mmcblk0p2 rw earlyprintk rootfstype=ext4 rootwait devtmpfs.mount=1 uio_pdrv_genirq.of_id="generic-uio"'
10 | setenv fdt_high 0x10000000
11 | setenv initrd_high 0x10000000
12 | saveenv
13 | pri
14 | ```
15 | and then type cmd: 
16 | ```sh
17 | reset
18 | ```
19 | system should boot successfully, After the linux get started, use cmd
20 | ```sh
21 | free -h
22 | ```
23 | make sure that system available memory is 256M instead of 1024M.
24 | 
25 | - Install the dnndk package.
26 | copy and unzip the dnndk-lib.zip to the board. 
27 | use cmd: sudo ./install.sh to install the dnndk package. Make sure that you see the successful info without any warning or error.
28 | 
29 | - Run our code in ipynb.
30 | 


--------------------------------------------------------------------------------
/prerequisites/devicetree.dtb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/devicetree.dtb


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | echo "Begin to install DeePhi DNNDK ..."
 3 | 
 4 | arch=$(uname -m)
 5 | if [ "$arch" != "armv7l" ]; then
 6 |     echo "DeePhi DNNDK package could only be installed on ARM targes."
 7 |     echo "Please contact dnndk-support@deephi.com for more help."
 8 |     echo "Terminate installation ..."
 9 |     exit
10 | fi
11 | 
12 | ###########################################################
13 | echo "Install DeePhi DPU Driver ..."
14 | 
15 | spt_ver_ar=(3.17.0,4.9.0)   # support version list
16 | sysver_pref=$(uname -r | awk -F'[.-]' 'BEGIN {}; {print $1 "." $2 "." $3}')
17 | 
18 | mkdir -p /lib/modules/$(uname -r)/extra/
19 | touch /lib/modules/$(uname -r)/modules.order
20 | touch /lib/modules/$(uname -r)/modules.builtin
21 | 
22 | if [[ "${spt_ver_ar[@]}" =~ $sysver_pref ]] ; then
23 |     cp pkgs/driver/dpu-$sysver_pref.ko /lib/modules/$(uname -r)/extra/dpu.ko
24 | else
25 |     echo "Linux kernel version "  $(uname -r) "not support!!"
26 | 	exit
27 | fi
28 | 
29 | depmod -a
30 | rst="$(lsmod | grep dpu 2>&1)"
31 | if [ -n "$rst" ] ; then
32 | 	rmmod dpu
33 | fi 
34 | rst="$(modprobe dpu | grep modprobe 2>&1)"
35 | if [ -n "$rst" ] ; then
36 | 	echo $rst
37 | 	exit
38 | fi 
39 | 
40 | if ! grep -Fxq "dpu" /etc/modules ; then
41 |     sh -c 'echo "dpu" >> /etc/modules' ;
42 | fi
43 | 
44 | ###########################################################
45 | echo "Install DeePhi tools, runtime & libraries ..."
46 | cp pkgs/bin/*  /usr/local/bin/
47 | cp pkgs/lib/*  /usr/local/lib/
48 | 
49 | lfile="/usr/local/lib/libdputils.so"
50 | if [ -f $lfile ] ; then
51 |     rm $lfile
52 | fi
53 | ln -s /usr/local/lib/libdputils.so.3.1 $lfile 
54 | mkdir -p /usr/local/include/dnndk/
55 | cp pkgs/include/*.h  /usr/local/include/dnndk/ 
56 | ldconfig
57 | 
58 | echo "Complete installation successfully."
59 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/bin/dexplorer:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/bin/dexplorer


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/bin/dsight:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/bin/dsight


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/driver/dpu-3.17.0.ko:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/driver/dpu-3.17.0.ko


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/driver/dpu-4.9.0.ko:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/driver/dpu-4.9.0.ko


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/include/dnndk.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * Copyright (c) 2016-2018 DeePhi Tech, Inc.
 3 |  *
 4 |  * All Rights Reserved. No part of this source code may be reproduced
 5 |  * or transmitted in any form or by any means without the prior written
 6 |  * permission of DeePhi Tech, Inc.
 7 |  *
 8 |  * Filename: dnndk.h
 9 |  * Version: 1.10 beta
10 |  *
11 |  * Description:
12 |  * The unique header file containing all DNNDK exported APIs.
13 |  * Please refer to document "deephi_dnndk_user_guide.pdf" for more details of APIs.
14 |  */
15 | 
16 | #ifndef _DNNDK_H_
17 | #define _DNNDK_H_
18 | 
19 | #include <dnndk/n2cube.h>
20 | #include <dnndk/dputils.h>
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/include/dputils.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * Copyright (c) 2016-2018 DeePhi Tech, Inc.
 3 |  *
 4 |  * All Rights Reserved. No part of this source code may be reproduced
 5 |  * or transmitted in any form or by any means without the prior written
 6 |  * permission of DeePhi Tech, Inc.
 7 |  *
 8 |  * Filename: dputils.h
 9 |  * Version: 1.10 beta
10 |  *
11 |  * Description:
12 |  * Header file containing all the exported APIs of DNNDK utility library libdputils
13 |  * Please refer to document "deephi_dnndk_user_guide.pdf" for more details of APIs.
14 |  */
15 | 
16 | #ifndef _DPUTILS_H_
17 | #define _DPUTILS_H_
18 | 
19 | #include <opencv2/opencv.hpp>
20 | 
21 | struct  dpu_task_t;
22 | typedef struct dpu_task_t   DPUTask;
23 | 
24 | 
25 | /* Set image into DPU Task's input Tensor */
26 | int dpuSetInputImage(DPUTask *task, const char *nodeName,
27 |     const cv::Mat &image, float *mean);
28 | 
29 | /* Set image into DPU Task's input Tensor with a specified scale parameter */
30 | int dpuSetInputImageWithScale(DPUTask *task, const char *nodeName,
31 |     const cv::Mat &image, float *mean, float scale);
32 | 
33 | /* Set image into DPU Task's input Tensor (mean values automatically processed by N2Cube) */
34 | int dpuSetInputImage2(DPUTask *task, const char *nodeName, const cv::Mat &image);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/include/matrix.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MATRIX_H__
 2 | #define __MATRIX_H__
 3 | 
 4 | /* function description
 5 |  * matrix:
 6 |  * a00 a01 a02 a03
 7 |  * a10 a11 a12 a13
 8 |  * a20 a21 a22 a23
 9 |  * a30 a31 a32 a33
10 |  * we will get new matrix:
11 |  * tt = nrow*factor
12 |  * b0 = a00+a10+a20+a30
13 |  * b1 = a11+a11+a21+a31
14 |  * b2 = a22+a12+a22+a32
15 |  * b3 = a33+a13+a23+a33
16 |  * b0/tt b1/tt b2/tt b3/tt
17 |  */
18 | void matrix_col_avg1(int nrow, int ncol, const signed char *src, float *dst, int factor);
19 | void matrix_col_avg2(int nrow, int ncol, const signed char *src, float *dst, int factor);
20 | void matrix_col_avg3(int nrow, int ncol, const signed char *src, signed char *dst, int factor);
21 | void matrix_col_avg4(int nrow, int ncol, const signed char *src, signed char *dst, float factor);
22 | 
23 | void matrix_col_avg1_c(int nrow, int ncol, const signed char *src, float *dst, int factor);
24 | void matrix_col_avg2_c(int nrow, int ncol, const signed char *src, float *dst, int factor);
25 | void matrix_col_avg3_c(int nrow, int ncol, const signed char *src, signed char *dst, int factor);
26 | void matrix_col_avg4_c(int nrow, int ncol, const signed char *src, signed char *dst, float factor);
27 | 
28 | void matrix_col_avg1_intr(int nrow, int ncol, const signed char *src, float *dst, int factor);
29 | void matrix_col_avg2_intr(int nrow, int ncol, const signed char *src, float *dst, int factor);
30 | void matrix_col_avg3_intr(int nrow, int ncol, const signed char *src, signed char *dst, int factor);
31 | void matrix_col_avg4_intr(int nrow, int ncol, const signed char *src, signed char *dst, float factor);
32 | 
33 | /*
34 |  * R = A*b
35 |  * A : rows x cols
36 |  * b : cols x 1
37 |  * R : rows x 1
38 |  * store matrix by column order
39 |  * var : variant
40 |  */
41 | void matxvec_rowaccess(int rows, int cols, const float *matA, const float *matB, float *matR);
42 | void matxvec_colaccess(int rows, int cols, const float *matA, const float *matB, float *matR);
43 | void matxvec_add_vec_rowaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
44 | void matxvec_add_vec_colaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
45 | void matxvec_add_vec_relu_rowaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
46 | void matxvec_add_vec_relu_colaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
47 | 
48 | void matxvec_rowaccess_c(int rows, int cols, const float *matA, const float *matB, float *matR);
49 | void matxvec_colaccess_c(int rows, int cols, const float *matA, const float *matB, float *matR);
50 | void matxvec_add_vec_rowaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
51 | void matxvec_add_vec_colaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
52 | void matxvec_add_vec_relu_rowaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
53 | void matxvec_add_vec_relu_colaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
54 | 
55 | void matxvec_rowaccess_intr(int rows, int cols, const float *matA, const float *matB, float *matR);
56 | void matxvec_colaccess_intr(int rows, int cols, const float *matA, const float *matB, float *matR);
57 | void matxvec_add_vec_rowaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
58 | void matxvec_add_vec_colaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
59 | void matxvec_add_vec_relu_rowaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
60 | void matxvec_add_vec_relu_colaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
61 | 
62 | /*
63 |  * R = a*B + c
64 |  * a : 1 x rows
65 |  * B : rows x cols
66 |  * c : 1 x cols
67 |  * R : 1 x cols
68 |  */
69 | void vecxmat_add_vec_rowaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
70 | void vecxmat_add_vec_colaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
71 | void vecxmat_add_vec_relu_rowaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
72 | void vecxmat_add_vec_relu_colaccess(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
73 | 
74 | void vecxmat_add_vec_rowaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
75 | void vecxmat_add_vec_colaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
76 | void vecxmat_add_vec_relu_rowaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
77 | void vecxmat_add_vec_relu_colaccess_c(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
78 | 
79 | void vecxmat_add_vec_rowaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
80 | void vecxmat_add_vec_colaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
81 | void vecxmat_add_vec_relu_rowaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
82 | void vecxmat_add_vec_relu_colaccess_intr(int rows, int cols, const float *matA, const float *matB, const float *matC, float *matR);
83 | 
84 | #endif /*__MATRIX_H__ */
85 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/include/n2cube.h:
--------------------------------------------------------------------------------
  1 |  /* 
  2 |   * Copyright (c) 2016-2018 DeePhi Tech, Inc.
  3 |   *
  4 |   * All Rights Reserved. No part of this source code may be reproduced
  5 |   * or transmitted in any form or by any means without the prior written
  6 |   * permission of DeePhi Tech, Inc.
  7 |   *
  8 |   * Filename: n2cube.h
  9 |   * Version: 1.10 beta
 10 |   *
 11 |   * Description:
 12 |   * Header file containing all the exported APIs of DNNDK Runtime library libn2cube
 13 |   * Please refer to document "deephi_dnndk_user_guide.pdf" for more details of these APIs.
 14 |   */
 15 | #ifndef _N2CUBE_H_
 16 | #define _N2CUBE_H_
 17 | 
 18 | #ifdef __cplusplus
 19 | extern "C" {
 20 | #endif
 21 | 
 22 | #include <stdint.h>
 23 | 
 24 | 
 25 | /* DPU Task runtime mode definitions */
 26 | 
 27 | /* Task in normal mode (defaul mode) */
 28 | #define T_MODE_NORMAL        (0)
 29 | 
 30 | /* Task in profiling mode in order to collect performance stastics for each DPU Node */
 31 | #define T_MODE_PROFILE       (1<<0)  
 32 | 
 33 | /* Task in debug mode in order to dump each Node's Code/Bias/Weights/Input/Output raw data for debugging */
 34 | #define T_MODE_DEBUG         (1<<1)
 35 |                                         
 36 | /* Exported data structures of DPU Kernel/Task/Tensor */
 37 | struct  dpu_kernel_t;
 38 | struct  dpu_task_t;
 39 | struct  task_tensor_t;
 40 | 
 41 | typedef struct dpu_kernel_t  DPUKernel;
 42 | typedef struct dpu_task_t    DPUTask;
 43 | typedef struct task_tensor_t DPUTensor;
 44 | 
 45 | 
 46 | /* Open & initialize the usage of DPU device */
 47 | int dpuOpen();
 48 | 
 49 | /* Close & finalize the usage of DPU device */
 50 | int dpuClose();
 51 | 
 52 | /* Load a DPU Kernel and allocate DPU memory space for 
 53 |    its Code/Weight/Bias segments */
 54 | DPUKernel *dpuLoadKernel(const char *netName);
 55 | DPUKernel *dpuLoadKernelModel(const char *netName,const char* modelPath);
 56 | 
 57 | /* Set mean values for DPU Kernel */
 58 | int dpuSetKernelMeanValue(DPUKernel *kernel, int mean1, int mean2, int mean3);
 59 | 
 60 | /* Destroy a DPU Kernel and release its associated resources */
 61 | int dpuDestroyKernel(DPUKernel *kernel);
 62 | 
 63 | /* Instantiate a DPU Task from one DPU Kernel, allocate its private
 64 |    working memory buffer and prepare for its execution context */
 65 | DPUTask *dpuCreateTask(DPUKernel *kernel, int mode);
 66 | 
 67 | /* Launch the running of DPU Task */
 68 | int dpuRunTask(DPUTask *task);
 69 | 
 70 | /* Remove a DPU Task, release its working memory buffer and destroy
 71 |    associated execution context */
 72 | int dpuDestroyTask(DPUTask *task);
 73 | 
 74 | /* Enable dump facility of DPU Task while running for debugging purpose */
 75 | int dpuEnableTaskDebug(DPUTask *task);
 76 | 
 77 | /* Enable profiling facility of DPU Task while running to get its performance metrics */
 78 | int dpuEnableTaskProfile(DPUTask *task);
 79 | 
 80 | /* Get the execution time of DPU Task */
 81 | long long dpuGetTaskProfile(DPUTask *task);
 82 | 
 83 | /* Get the execution time of DPU Node */
 84 | long long dpuGetNodeProfile(DPUTask *task, const char*nodeName);
 85 | 
 86 | /* Get input Tensor of DPU Task */
 87 | DPUTensor* dpuGetInputTensor(DPUTask *task, const char*nodeName);
 88 | 
 89 | /* Get the start address of DPU Task's input Tensor */
 90 | int8_t* dpuGetInputTensorAddress(DPUTask *task, const char *nodeName);
 91 | 
 92 | /* Get the size (in byte) of one DPU Task's input Tensor */
 93 | int dpuGetInputTensorSize(DPUTask *task, const char *nodeName);
 94 | 
 95 | /* Get the scale value (DPU INT8 quantization) of one DPU Task's input Tensor */
 96 | float dpuGetInputTensorScale(DPUTask *task, const char *nodeName);
 97 | 
 98 | /* Get the height dimension of one DPU Task's input Tensor */
 99 | int dpuGetInputTensorHeight(DPUTask *task, const char *nodeName);
100 | 
101 | /* Get the width dimension of one DPU Task's input Tensor */
102 | int dpuGetInputTensorWidth(DPUTask *task, const char *nodeName);
103 | 
104 | /* Get the channel dimension of one DPU Task's input Tensor */
105 | int dpuGetInputTensorChannel(DPUTask *task, const char *nodeName);
106 | 
107 | /* Get output Tensor of one DPU Task */
108 | DPUTensor* dpuGetOutputTensor(DPUTask *task, const char *nodeName);
109 | 
110 | /* Get the start address of one DPU Task's output Tensor */
111 | int8_t* dpuGetOutputTensorAddress(DPUTask *task, const char *nodeName);
112 | 
113 | /* Get the size (in byte) of one DPU Task's output Tensor */
114 | int dpuGetOutputTensorSize(DPUTask *task, const char *nodeName);
115 | 
116 | /* Get the scale value (DPU INT8 quantization) of one DPU Task's output Tensor */
117 | float dpuGetOutputTensorScale(DPUTask *task, const char *nodeName);
118 | 
119 | /* Get the height dimension of one DPU Task's output Tensor */
120 | int dpuGetOutputTensorHeight(DPUTask *task, const char *nodeName);
121 | 
122 | /*  Get the channel dimension of one DPU Task's output Tensor */
123 | int dpuGetOutputTensorWidth(DPUTask *task, const char *nodeName);
124 | 
125 | /* Get DPU Node's output tensor's channel */
126 | int dpuGetOutputTensorChannel(DPUTask *task, const char *nodeName);
127 | 
128 | /* Get the size of one DPU Tensor */
129 | int dpuGetTensorSize(DPUTensor* tensor);
130 | 
131 | /* Get the start address of one DPU Tensor */
132 | int8_t* dpuGetTensorAddress(DPUTensor* tensor);
133 | 
134 | /* Get the scale value of one DPU Tensor */
135 | float dpuGetTensorScale(DPUTensor* tensor);
136 | 
137 | /* Get the height dimension of one DPU Tensor */
138 | int dpuGetTensorHeight(DPUTensor* tensor);
139 | 
140 | /* Get the width dimension of one DPU Tensor */
141 | int dpuGetTensorWidth(DPUTensor* tensor);
142 | 
143 | /* Get the channel dimension of one DPU Tensor */
144 | int dpuGetTensorChannel(DPUTensor* tensor);
145 | 
146 | /* Set DPU Task's input Tensor with data stored under Caffe
147 |    Blob's order (channel/height/width) in INT8 format */
148 | int dpuSetInputTensorInCHWInt8(DPUTask *task, const char *nodeName, int8_t *data, int size);
149 | 
150 | /* Set DPU Task's input Tensor with data stored under Caffe
151 |    Blob's order (channel/height/width) in FP32 format */
152 | int dpuSetInputTensorInCHWFP32(DPUTask *task, const char *nodeName, float *data, int size);
153 | 
154 | /* Set DPU Task's input Tensor with data stored under DPU
155 |    Tensor's order (height/width/channel) in INT8 format */
156 | int dpuSetInputTensorInHWCInt8(DPUTask *task, const char *nodeName, int8_t *data, int size);
157 | 
158 | /* Set DPU Task's input Tensor with data stored under DPU
159 |    Tensor's order (height/width/channel) in FP32 format */
160 | int dpuSetInputTensorInHWCFP32(DPUTask *task, const char *nodeName, float *data, int size);
161 | 
162 | /* Get DPU Task's output Tensor and store them under Caffe
163 |    Blob's order (channel/height/width) in INT8 format */
164 | int dpuGetOutputTensorInCHWInt8(DPUTask *task, const char *nodeName, int8_t *data, int size);
165 | 
166 | /* Get DPU Task's output Tensor and store them under Caffe
167 |    Blob's order (channel/height/width) in FP32 format */
168 | int dpuGetOutputTensorInCHWFP32(DPUTask *task, const char *nodeName, float *data, int size);
169 | 
170 | /* Get DPU Task's output Tensor and store them under DPU
171 |    Tensor's order (height/width/channel) in INT8 format */
172 | DPUTensor* dpuGetOutputTensorInHWCInt8(DPUTask *task, const char *nodeName);
173 | 
174 | /* Get DPU Task's output Tensor and store them under DPU
175 |    Tensor's order (height/width/channel) in FP32 format */
176 | int dpuGetOutputTensorInHWCFP32(DPUTask *task, const char *nodeName, float *data, int size);
177 | 
178 | /* DEPRECATED, use with caution! */
179 | int dpuRunSoftmax( DPUTask *task, const char *nodeName, float* softmax);
180 | 
181 | #ifdef __cplusplus
182 | }
183 | #endif
184 | 
185 | #endif
186 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/include/transform.h:
--------------------------------------------------------------------------------
 1 | #ifndef __TRANSFORM_H__
 2 | #define __TRANSFORM_H__
 3 | 
 4 | // transform bgr image:
 5 | // b = (b-shiftB)*scaleB
 6 | // g = (g-shiftG)*scaleG
 7 | // r = (r-shiftR)*scaleR
 8 | void transform_bgr(int w, int h, unsigned char *src, signed char *dst,
 9 |         float val_shift_B, float var_scale_B,
10 |         float val_shift_G, float var_scale_G,
11 |         float val_shift_R, float var_scale_R);
12 | void transform_bgr_c(int w, int h, unsigned char *src, signed char *dst,
13 |         float val_shift_B, float var_scale_B,
14 |         float val_shift_G, float var_scale_G,
15 |         float val_shift_R, float var_scale_R);
16 | void transform_bgr_intr(int w, int h, unsigned char *src, signed char *dst,
17 |         float var_shift_B, float var_scale_B,
18 |         float var_shift_G, float var_scale_G,
19 |         float var_shift_R, float var_scale_R);
20 | 
21 | #endif /*__TRANSFORM_H__*/
22 | 


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/lib/libdputils.so.2.4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/lib/libdputils.so.2.4


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/lib/libdputils.so.3.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/lib/libdputils.so.3.1


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/lib/libdsight.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/lib/libdsight.a


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/lib/libhineon.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/lib/libhineon.a


--------------------------------------------------------------------------------
/prerequisites/dnndk-lib/pkgs/lib/libn2cube.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hirayaku/DAC2018-TGIIF/32adcfe8f3fb8a2b96097869fa1c22521928ea6c/prerequisites/dnndk-lib/pkgs/lib/libn2cube.so


--------------------------------------------------------------------------------
/src/SoftmaxTable.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SOFTMAX_TABLE_HPP_
 2 | #define SOFTMAX_TABLE_HPP_
 3 | 
 4 | #include <tuple>
 5 | using namespace std;
 6 | 
 7 | class SoftmaxTable {
 8 | public:
 9 |     vector<double> exp_table_;
10 |     vector<float> score_;
11 |     float fix_scale_;
12 |     int class_num_;
13 |     int softmax_size_;
14 | 
15 |     SoftmaxTable(float fix_scale, int softmax_size, int class_num) {
16 |         fix_scale_ = fix_scale;
17 |         softmax_size_ = softmax_size;
18 |         class_num_ = class_num;
19 |         for (int i = 0; i < 256; i++) {
20 |             float temp = i - 128;
21 |             exp_table_.push_back(exp(temp * fix_scale_));
22 |         }
23 |         score_.reserve(softmax_size_ / class_num_);
24 |         //cols_.reserve(softmax_size_ / class_num_);
25 |     }
26 | 
27 |     //tuple<int, int, float> cal_softmax(int8_t* input) {
28 |     tuple<int, float> cal_softmax(int8_t* input) {
29 |         for (int i = 0; i < softmax_size_; i = i + class_num_) {
30 |             double sum = 0;
31 |             auto max_pt = max_element(input + i + 1, input + i + class_num_);
32 | 
33 |             for(int j = i; j < i + class_num_ / 4 * 4; j += 4) {
34 |                 sum += exp_table_[input[j] + 128] + exp_table_[input[j+1] + 128] +
35 |                        exp_table_[input[j+2] + 128] + exp_table_[input[j+3] + 128];
36 | 
37 |             }
38 |             for(int j = i + class_num_ / 4 * 4; j < class_num_; j++) {
39 |                 sum += exp_table_[input[j] + 128];
40 |             }
41 |             auto max_value = exp_table_[*max_pt + 128];
42 | 
43 |             score_.push_back(max_value / sum);
44 |         }
45 | 
46 |         auto max_score_pt = max_element(score_.begin(), score_.end());
47 |         auto max_ind = distance(score_.begin(), max_score_pt);
48 |         score_.clear();
49 |         return tuple<int, float>(max_ind, *max_score_pt);
50 |     }
51 | 
52 | 
53 | };
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2016-2017 DeePhi Tech, Inc.
  3 |  *
  4 |  * All Rights Reserved. No part of this source code may be reproduced
  5 |  * or transmitted in any form or by any means without the prior written
  6 |  * permission of DeePhi Tech, Inc.
  7 |  *
  8 |  * Filename: main.cc
  9 |  * Version: 1.07 beta
 10 |  * Description:
 11 |  * Sample source code showing how to deploy SSD neural network on
 12 |  * DeePhi DPU@Zynq7020 platform.
 13 |  */
 14 | 
 15 | #ifndef _GNU_SOURCE
 16 | #  define _GNU_SOURCE
 17 | #endif
 18 | 
 19 | #include <algorithm>
 20 | #include <cmath>
 21 | #include <fstream>
 22 | #include <iostream>
 23 | #include <string>
 24 | #include <vector>
 25 | #include <chrono>
 26 | #include <thread>
 27 | #include <mutex>
 28 | #include <condition_variable>
 29 | #include <unistd.h>
 30 | #include <sched.h>
 31 | #include <pthread.h>
 32 | 
 33 | // Header file OpenCV for image processing
 34 | #include <opencv2/opencv.hpp>
 35 | // Header files for DNNDK APIs
 36 | #include <dnndk/dputils.h>
 37 | #include <dnndk/n2cube.h>
 38 | #include <dnndk/transform.h>
 39 | #include "ssd_detector.hpp"
 40 | #include "prior_boxes.hpp"
 41 | #include "neon_math.hpp"
 42 | //#include "exp_lut.hpp"
 43 | #include "SoftmaxTable.hpp"
 44 | 
 45 | using namespace std;
 46 | using namespace cv;
 47 | using namespace deephi;
 48 | 
 49 | // DPU Kernel name for SSD Convolution layers
 50 | #define KERNEL_CONV "ssd"
 51 | // DPU node name for input and output
 52 | #define CONV_INPUT_NODE "conv1_1"
 53 | #define CONV_OUTPUT_NODE_LOC "mbox_loc"
 54 | #define CONV_OUTPUT_NODE_CONF "mbox_conf"
 55 | 
 56 | // detection params
 57 | const float NMS_THRESHOLD = 0.45;
 58 | const float CONF_THRESHOLD = 0.01;
 59 | const int TOP_K = 1;
 60 | const int KEEP_TOP_K = 1;
 61 | const int num_classes = 99;
 62 | 
 63 | extern "C" {
 64 | typedef struct {
 65 |     int label;
 66 |     int xmin;
 67 |     int xmax;
 68 |     int ymin;
 69 |     int ymax;
 70 |     float confidence;
 71 | } result_t;
 72 | }
 73 | 
 74 | typedef struct {
 75 |     int8_t score;
 76 |     int row;
 77 |     int col;
 78 | } max_t;
 79 | 
 80 | /**
 81 |  * @Optimized max-find for **int8_t** arrays using data coalescing
 82 |  * 
 83 |  * @param array - the 2D int8_t array
 84 |  * @param rows - the rows of the array
 85 |  * @param cols - the cols of the array, must be divisable by 4
 86 |  */
 87 | max_t find_max_int8(int8_t* array, int rows, int cols) {
 88 |   int row = -1, col = -1;
 89 |   int8_t max_value = 0;
 90 |   unsigned *packed_array = reinterpret_cast<unsigned *>(array);
 91 |   for(int i = 0; i < rows * cols / 4; i++) {
 92 |     unsigned data = packed_array[i];
 93 |     int8_t data0 = data & 0x000000ff;
 94 |     int8_t data1 = (data & 0x0000ff00) >> 8;
 95 |     int8_t data2 = (data & 0x00ff0000) >> 16; 
 96 |     int8_t data3 = (data & 0xff000000) >> 24;
 97 |     if(data0 > max_value) {
 98 |         max_value = data0;
 99 |         row = i * 4;
100 |     }
101 |     if(data1 > max_value) {
102 |         max_value = data1;
103 |         row = i * 4 + 1;
104 |     }
105 |     if(data2 > max_value) {
106 |         max_value = data2;
107 |         row = i * 4 + 2;
108 |     }
109 |     if(data3 > max_value) {
110 |         max_value = data3;
111 |         row = i * 4 + 3;
112 |     }
113 |   }
114 |   // In case the array is not 4-byte aligned
115 |   for(int i =  rows * cols / 4 * 4; i < rows * cols; i++) {
116 |       int8_t data = array[i];
117 |       if(data > max_value) {
118 |           max_value  = data;
119 |           row = i;
120 |       }
121 |   }
122 |   col = row - (row / cols * cols);
123 |   row = row / cols;
124 |   return {max_value, row, col};
125 | }
126 | 
127 | /**
128 |  * @brief Calculate softmax on CPU
129 |  *
130 |  * @param src - pointer to int8_t DPU data to be calculated
131 |  * @param size - size of input int8_t DPU data
132 |  * @param scale - scale to miltiply to transform DPU data from int8_t to float
133 |  * @param dst - pointer to float result after softmax
134 |  *
135 |  * @return none
136 |  */
137 | void CPUSoftmax(int8_t* src, int size, float scale, float* dst) {
138 |   float sum = 0.0f;
139 |   for (auto i = 0; i < size; ++i) {
140 |     dst[i] = exp(src[i] * scale);
141 |     sum += dst[i];
142 |   }
143 |   for (auto i = 0; i < size; ++i) {
144 |     dst[i] /= sum;
145 |   }
146 | }
147 | 
148 | void CreatePriors(vector<shared_ptr<vector<float>>> *priors) {
149 |   vector<float> variances{0.1, 0.1, 0.2, 0.2};
150 |   vector<PriorBoxes> prior_boxes;
151 | 
152 |   // prior boxes for model: 90% compress rate, 0.7 scale img input
153 |   prior_boxes.emplace_back(PriorBoxes{
154 |       448, 252, 56, 32, variances, {30}, {66}, {2}, 0.5, 8.0, 8.0});
155 | 
156 |   prior_boxes.emplace_back(PriorBoxes{
157 |       448, 252, 28, 16, variances, {66}, {127}, {2, 3}, 0.5, 16.0, 16.0});
158 | 
159 |   prior_boxes.emplace_back(PriorBoxes{
160 |       448, 252, 14, 8, variances, {127}, {188}, {2, 3}, 0.5, 32.0, 32.0});
161 | 
162 |   prior_boxes.emplace_back(PriorBoxes{
163 |       448, 252, 7, 4, variances, {188}, {249}, {2, 3}, 0.5, 64.0, 64.0});
164 | 
165 |   int num_priors = 0;
166 |   for (auto &p : prior_boxes) {
167 |     num_priors += p.priors().size();
168 |   }
169 | 
170 |   priors->clear();
171 |   priors->reserve(num_priors);
172 |   for (auto i = 0U; i < prior_boxes.size(); ++i) {
173 |     priors->insert(priors->end(), prior_boxes[i].priors().begin(),
174 |                    prior_boxes[i].priors().end());
175 |   }
176 | }
177 | 
178 | class DPU_Handler {
179 |     DPUKernel *kernel_conv;
180 |     DPUTask *task_conv;
181 | 
182 |     vector<shared_ptr<vector<float>>> priors;
183 |     int8_t *conf_mem = nullptr;
184 |     float *conf_softmax = nullptr;
185 |     int size;   // output tensor size
186 |     float mean[3] = {104, 117, 123};
187 | 
188 |     SoftmaxTable *STable;
189 | 
190 |     // Sync variables for overlapping cv.imread and RunSSD
191 |     mutex load_mutex, run_mutex;
192 |     condition_variable load_cv, run_cv;
193 |     bool load_to_begin, load_finished;
194 |     Mat *even_img, *odd_img;
195 | 
196 |     // Output bbox results
197 |     vector<result_t> results;
198 | 
199 |     // DDR_0 --> DDR_dpu
200 |     // Not using dpuSetInputImage in dputils.cpp, since it wastes us another 5ms per call
201 |     void set_input(Mat &img) {
202 |         // Set image into CONV Task with mean value
203 |         auto time0 = chrono::system_clock::now();
204 | 
205 |         int8_t *input_addr = dpuGetInputTensorAddress(task_conv, CONV_INPUT_NODE);
206 |         float scale_fix = dpuGetInputTensorScale(task_conv, CONV_INPUT_NODE);
207 |         transform_bgr(img.cols, img.rows, img.data,
208 |                       input_addr,
209 |                       mean[0], scale_fix,
210 |                       mean[1], scale_fix,
211 |                       mean[2], scale_fix
212 |                      );
213 |         //dpuSetInputImage(task_conv, (char*)CONV_INPUT_NODE, img, mean);
214 |         auto time1 = chrono::system_clock::now();
215 |         cout << "setinput: " << chrono::duration_cast<chrono::microseconds>(time1-time0).count() << ".us" << endl;
216 |     }
217 |     // Signal DPU to run task: DDR_dpu --> BRAM --> DDR_dpu'
218 |     inline void run_dpu_task() {
219 |         auto time0 = chrono::system_clock::now();
220 |         dpuRunTask(this->task_conv);
221 |         auto time1 = chrono::system_clock::now();
222 |         cout << "DPU     : " << chrono::duration_cast<chrono::microseconds>(time1-time0).count() << ".us" << endl;
223 |     }
224 | 
225 |     result_t post_process(const Mat& img) {
226 |         // Initializations
227 |         result_t top = {0, 0, 0, 0, 0, 0};
228 |         int8_t* loc =
229 |             (int8_t*)dpuGetOutputTensorAddress(task_conv, CONV_OUTPUT_NODE_LOC);
230 |         int8_t* conf =
231 |             (int8_t*)dpuGetOutputTensorAddress(task_conv, CONV_OUTPUT_NODE_CONF);
232 |         float loc_scale = dpuGetOutputTensorScale(task_conv, CONV_OUTPUT_NODE_LOC);
233 |         float conf_scale =
234 |           dpuGetOutputTensorScale(task_conv, CONV_OUTPUT_NODE_CONF);
235 | 
236 |         auto time2 = chrono::system_clock::now();
237 | 
238 |         int count = size/num_classes;
239 |         memcpy(conf_mem, conf, this->size);
240 |         auto t = STable->cal_softmax(conf_mem);
241 |         int location = get<0>(t);
242 |         int8_t *max_row = conf_mem + location * num_classes;
243 |         auto max_pt = max_element(max_row + 1, max_row + num_classes);
244 |         auto dis = distance(max_row, max_pt);
245 |         int classification = dis;
246 | 
247 |         /*
248 |         auto max_data = find_max_int8(conf_mem, count, num_classes);
249 |         int location = max_data.row;
250 |         int classification = max_data.col;
251 |         */
252 | 
253 |         auto time3 = chrono::system_clock::now();
254 |         CPUSoftmax(conf_mem + location*num_classes, num_classes, conf_scale, conf_softmax + location*num_classes);
255 |         
256 |         auto time4 = chrono::system_clock::now();
257 |         MultiDetObjects results;
258 |         vector<float> th_conf(num_classes, CONF_THRESHOLD);
259 |         SSDdetector* detector_ = new SSDdetector(num_classes,SSDdetector::CodeType::CENTER_SIZE, false,
260 |                   KEEP_TOP_K, th_conf, TOP_K, NMS_THRESHOLD, 1.0, priors, loc_scale);
261 |         detector_->Detect(loc, conf_softmax, &results, location, classification);
262 | 
263 |         float top_conf = 0;
264 |         float full_cols = img.cols * 10 / 7;
265 |         float full_rows = img.rows * 10 / 7;
266 |         for (size_t i = 0; i < results.size(); ++i) {
267 |             int label = get<0>(results[i]);
268 |             float xmin = get<2>(results[i]).x * full_cols;
269 |             float ymin = get<2>(results[i]).y * full_rows;
270 |             float xmax = xmin + (get<2>(results[i]).width) * full_cols;
271 |             float ymax = ymin + (get<2>(results[i]).height) * full_rows;
272 |             xmin = round(xmin*100.0)/100.0;
273 |             ymin = round(ymin*100.0)/100.0;
274 |             xmax = round(xmax*100.0)/100.0;
275 |             ymax = round(ymax*100.0)/100.0;
276 |             float confidence = get<1>(results[i]);
277 | 
278 |             xmin = std::min(std::max(xmin, 0.0f), full_cols);
279 |             xmax = std::min(std::max(xmax, 0.0f), full_cols);
280 |             ymin = std::min(std::max(ymin, 0.0f), full_rows);
281 |             ymax = std::min(std::max(ymax, 0.0f), full_rows);
282 | 
283 |             if (top_conf < confidence) {
284 |                 top_conf = confidence;
285 |                 top.label = label;
286 |                 top.xmin = (int)xmin; top.xmax = (int)xmax;
287 |                 top.ymin = (int)ymin; top.ymax = (int)ymax;
288 |                 top.confidence = confidence;
289 |             }
290 |         }
291 | 
292 |         auto time5 = chrono::system_clock::now();
293 |         //cout << "before  : " << chrono::duration_cast<chrono::microseconds>(time0-timex).count() << ".us" << endl;
294 |         //cout << "dpu time: " << chrono::duration_cast<chrono::microseconds>(time2-time1).count() << ".us" << endl;
295 |         cout << "find    : " << chrono::duration_cast<chrono::microseconds>(time3-time2).count() << ".us" << endl;
296 |         cout << "softmax : " << chrono::duration_cast<chrono::microseconds>(time4-time3).count() << ".us" << endl;
297 |         cout << "detect  : " << chrono::duration_cast<chrono::microseconds>(time5-time4).count() << ".us" << endl;
298 |         cout << "Post    : " << chrono::duration_cast<chrono::microseconds>(time5-time2).count() << ".us" << endl;
299 |         return top;
300 |     }
301 | 
302 |     // thread-1: run dpu task when t2 is ready
303 |     void t1_run_task(unsigned total_count) {
304 |         Mat *current_img;
305 |         for(unsigned current = 0; current < total_count; current++) {
306 |             if(current%2 == 0) {
307 |                 current_img = this->even_img;
308 |             } else
309 |                 current_img = this->odd_img;
310 | 
311 |             {
312 |                 unique_lock<mutex> lk(this->run_mutex);
313 |                 while(!load_finished)
314 |                     this->run_cv.wait(lk);
315 |                 load_finished = false;
316 |             }
317 | 
318 | #ifdef DEBUG
319 |             cout << "Input    : " << to_string(current) << ".jpg" << endl;
320 | #endif
321 | 
322 |             // DDR_0 --> DDR_dpu when DDR_0 is ready
323 |             this->set_input(*current_img);
324 |             // Then make t2 to process DDR_dpu' and load next img
325 |             {
326 |                 lock_guard<mutex> guard(this->load_mutex);
327 |                 load_to_begin = true;
328 |             }
329 |             this->load_cv.notify_one();
330 |             this->run_dpu_task();
331 | 
332 | #ifdef DEBUG
333 |             cout << "- - - - -" << endl;
334 | #endif
335 |         }
336 |         {
337 |             unique_lock<mutex> lk(this->run_mutex);
338 |             while(!load_finished)
339 |                 this->run_cv.wait(lk);
340 |             load_finished = false;
341 |         }
342 |         {
343 |             lock_guard<mutex> guard(this->load_mutex);
344 |             load_to_begin = true;
345 |         }
346 |         this->load_cv.notify_one();
347 |     }
348 |     // thread-2: load img
349 |     void t2_post_and_load(string imgs_dir, vector<string> &imgs_vec, unsigned total_count) {
350 |         Mat *current_img, *last_img;
351 |         for(unsigned current = 0; current < total_count; current++) {
352 |             if(current%2 == 0) {
353 |                 current_img = this->even_img;
354 |                 last_img = this->odd_img;
355 |             } else {
356 |                 current_img = this->odd_img;
357 |                 last_img = this->even_img;
358 |             }
359 | 
360 |             // Start post processing last DDR_dpu'
361 |             if(current > 1) {
362 |                 this->results.push_back(this->post_process(*last_img));
363 |             }
364 |             Mat read_img = imread(imgs_dir + "/" + imgs_vec[current]);
365 |             resize(read_img, *current_img, Size(), 0.7, 0.7);
366 | 
367 |             {
368 |                 lock_guard<mutex> guard(this->run_mutex);
369 |                 load_finished = true;
370 |             }
371 |             this->run_cv.notify_one();
372 |             // Wait until some data is ready
373 |             {
374 |                 unique_lock<mutex> lk(this->load_mutex);
375 |                 while(!load_to_begin)
376 |                     this->load_cv.wait(lk);
377 |                 load_to_begin = false;
378 |             }
379 |         }
380 |         this->results.push_back(this->post_process(*this->odd_img));
381 |         {
382 |             lock_guard<mutex> guard(this->run_mutex);
383 |             load_finished = true;
384 |         }
385 |         this->run_cv.notify_one();
386 |         // Wait until some data is ready
387 |         {
388 |             unique_lock<mutex> lk(this->load_mutex);
389 |             while(!load_to_begin)
390 |                 this->load_cv.wait(lk);
391 |             load_to_begin = false;
392 |         }
393 |         this->results.push_back(this->post_process(*this->even_img));
394 |     }
395 | 
396 | public:
397 |   DPU_Handler(string lib_path) {
398 |     dpuOpen();
399 |     this->kernel_conv = dpuLoadKernelModel(KERNEL_CONV, lib_path.data()); //"/home/xilinx/ssd_99_py/model/dpu_ssd_233.elf"
400 |     this->task_conv = dpuCreateTask(this->kernel_conv, 0);
401 |     CreatePriors(&(this->priors));
402 |     this->size = dpuGetOutputTensorSize(task_conv, CONV_OUTPUT_NODE_CONF);
403 |     this->conf_mem = new int8_t[size];
404 |     this->conf_softmax = new float[size];
405 | 
406 |     //float conf_scale =
407 |     //    dpuGetOutputTensorScale(task_conv, CONV_OUTPUT_NODE_CONF);
408 |     this->STable = new SoftmaxTable(0.125, 1058904, 99);
409 |     //this->STable = new SoftmaxTable(, this->size, num_classes);
410 | 
411 |     this->load_to_begin = false;
412 |     this->load_finished = false;
413 |     this->even_img = new Mat;
414 |     this->odd_img = new Mat;
415 | 
416 |     cout << "DPU InputSize : " << dpuGetInputTensorSize(task_conv, (char*)CONV_INPUT_NODE) << endl;
417 |     cout << "DPU OutputSize: " << this->size << endl;
418 |     cout << "- - - - - - - - - -" << endl;
419 |   }
420 | 
421 |   ~DPU_Handler() {
422 |     delete STable;
423 |     delete odd_img;
424 |     delete even_img;
425 |     delete [] this->conf_softmax;
426 |     delete [] this->conf_mem;
427 |     // Destroy DPU Tasks and Kernels and free resources
428 |     dpuDestroyTask(this->task_conv);
429 |     dpuDestroyKernel(this->kernel_conv);
430 |     // Detach from DPU driver and release resources
431 |     dpuClose();
432 |   }
433 |   void dpu_clear() {
434 |     this->results.clear();
435 |     this->load_to_begin = false;
436 |     this->load_finished = false;
437 |   }
438 |   result_t dpu_detect_single(string img_path) {
439 |     Mat img = imread(img_path.data());
440 |     set_input(img);
441 |     run_dpu_task();
442 |     auto r = post_process(img);
443 |     results.push_back(r);
444 |     return r;
445 |   }
446 |   void dpu_detect_list(string imgs_dir, vector<string> &imgs_vec, unsigned img_count) {
447 |     dpu_clear();
448 |     // Spawn threads
449 |     thread t1(&DPU_Handler::t1_run_task, this, img_count);
450 |     thread t2(&DPU_Handler::t2_post_and_load, this, imgs_dir, ref(imgs_vec), img_count);
451 | 
452 |     // set cpu affinity
453 |     cpu_set_t cpuset;
454 |     int rc;
455 |     CPU_ZERO(&cpuset);
456 |     CPU_SET(0, &cpuset);
457 |     rc = pthread_setaffinity_np(t1.native_handle(),
458 |                                 sizeof(cpuset),
459 |                                 &cpuset);
460 |     if(rc != 0) {
461 |         cerr << "Fail to bind t1_run_task to CPU0!" << endl;
462 |     }
463 | 
464 |     CPU_SET(1, &cpuset);
465 |     rc = pthread_setaffinity_np(t2.native_handle(),
466 |                                 sizeof(cpuset),
467 |                                 &cpuset);
468 |     if(rc != 0) {
469 |         cerr << "Fail to bind t2_post_and_load to CPU1!" << endl;
470 |     }
471 |     // set thread priority
472 |     struct sched_param sp;
473 |     sp.sched_priority = 2;
474 |     pthread_setschedparam(t1.native_handle(), SCHED_FIFO, &sp);
475 |     sp.sched_priority = 2;
476 |     pthread_setschedparam(t2.native_handle(), SCHED_FIFO, &sp);
477 | 
478 |     // Wait for threads to finish
479 |     t2.join();
480 |     t1.join();
481 |   }
482 |   result_t *dpu_get_results() {
483 |       return this->results.data();
484 |   }
485 | };
486 | 
487 | DPU_Handler *dpu_ptr = nullptr;
488 | extern "C" {
489 |   void dpu_initialize(char *c_lib_path) {
490 |     dpu_ptr = new DPU_Handler(string(c_lib_path));
491 |   }
492 |   void dpu_destroy() {
493 |     dpu_ptr->~DPU_Handler();
494 |     dpu_ptr = nullptr;
495 |   }
496 |   void dpu_clear() {
497 |       dpu_ptr->dpu_clear();
498 |   }
499 |   result_t dpu_detect_single(char *c_img_path) {
500 |     auto time_begin = chrono::system_clock::now();
501 |     assert(dpu_ptr != nullptr);
502 |     auto r = dpu_ptr->dpu_detect_single(string(c_img_path));
503 |     auto time_end = chrono::system_clock::now();
504 |     cout << "Overall: " << chrono::duration_cast<chrono::microseconds>(time_end - time_begin).count() << ".us" << endl;
505 |     cout << "- - - - -" << endl;
506 |     return r;
507 |   }
508 |   void dpu_detect_list(char *c_imgs_file, unsigned num) {
509 |       auto time_begin = chrono::system_clock::now();
510 | 
511 |       //auto time0 = chrono::system_clock::now();
512 |       string imgs_dir, temp;
513 |       vector<string> imgs;
514 |       ifstream imgs_file(c_imgs_file);
515 |       // Get imgs directory
516 |       getline(imgs_file, imgs_dir);
517 |       while(getline(imgs_file, temp))
518 |           imgs.push_back(temp);
519 |       //auto time1 = chrono::system_clock::now();
520 | 
521 |       dpu_ptr->dpu_detect_list(imgs_dir, imgs, num);
522 | 
523 |       auto time_end = chrono::system_clock::now();
524 |       //auto read_micro_sec = chrono::duration_cast<chrono::microseconds>(time1 - time0).count();
525 |       auto micro_sec = chrono::duration_cast<chrono::microseconds>(time_end - time_begin).count();
526 |       cout << "- - - - - - - - - - " << endl;
527 |       cout << "Overall : " << micro_sec << ".us" << endl;
528 |       cout << "FPS     : " << to_string(num / (micro_sec / 1000000.0f)) << endl;
529 |   }
530 |   result_t* dpu_get_results() {
531 |       return dpu_ptr->dpu_get_results();
532 |   }
533 | }
534 | 


--------------------------------------------------------------------------------
/src/neon_math.hpp:
--------------------------------------------------------------------------------
  1 | #include <cstdint>
  2 | #include <arm_neon.h>
  3 | 
  4 | using std::exp;
  5 | typedef float32x4_t v4sf;
  6 | typedef uint32x4_t v4su;
  7 | v4sf exp_ps(v4sf);
  8 | 
  9 | void softmax_c(const int8_t* input, float scale, unsigned int cls,
 10 |     float* output) {
 11 |   float sum = 0.f;
 12 |   for (unsigned int i = 0; i < cls; ++i) {
 13 |     output[i] = exp(input[i] * scale);
 14 |     sum += output[i];
 15 |   }
 16 | 
 17 |   for (unsigned int i = 0; i < cls; ++i) output[i] /= sum;
 18 | }
 19 | 
 20 | void softmax_c(const int8_t* input, float scale, unsigned int cls,
 21 |     unsigned int group, float* output) {
 22 |   for (unsigned int i = 0; i < group; ++i) {
 23 |     softmax_c(input, scale, cls, output);
 24 |     input += cls;
 25 |     output += cls;
 26 |   }  
 27 | }
 28 | 
 29 | void softmax4_internal(const int8_t*, float, unsigned int, float*);
 30 | void softmax4_neon(const int8_t* input, float scale,
 31 |     unsigned int group, float* output) {
 32 |   unsigned int aligned = group & (-8);
 33 |   softmax4_internal(input, scale, aligned, output);
 34 |   unsigned int remain = group - aligned;
 35 |   input += (4 * aligned);
 36 |   output += (4 * aligned);
 37 |   softmax_c(input, scale, 4, remain, output);
 38 | }
 39 | 
 40 | /*
 41 |  * 2-class softmax
 42 |  */
 43 | void softmax2_internal(const int8_t*, float, unsigned int, float*);
 44 | void softmax2_neon(const int8_t* input, float scale,
 45 |     unsigned int group, float* output) {
 46 |   unsigned int aligned = group & (-8);
 47 |   softmax2_internal(input, scale, aligned, output);
 48 |   unsigned int remain = group - aligned;
 49 |   input += (2 * aligned);
 50 |   output += (2 * aligned);
 51 |   softmax_c(input, scale, 2, remain, output);
 52 | }
 53 | 
 54 | /*
 55 |  * Assume group is divided by 8
 56 |  */
 57 | void softmax4_internal(const int8_t* input, float scale,
 58 |     unsigned int group, float* output) {
 59 |   unsigned int batch = group / 8;
 60 | 
 61 |   for (unsigned int i = 0; i < batch; ++i) {
 62 |     /* Interleaved load 32 bytes into 4 NEON registers */
 63 |     int8x8x4_t q01 = vld4_s8(input);
 64 |     /* Convert to 16-bit integers */
 65 |     int16x8_t q2 = vmovl_s8(q01.val[0]);
 66 |     int16x8_t q3 = vmovl_s8(q01.val[1]);
 67 |     int16x8_t q4 = vmovl_s8(q01.val[2]);
 68 |     int16x8_t q5 = vmovl_s8(q01.val[3]);
 69 |     
 70 |     /* Process first 4 groups */
 71 |     int16x4_t d10 = vget_low_s16(q2);
 72 |     int16x4_t d11 = vget_low_s16(q3);
 73 |     int16x4_t d12 = vget_low_s16(q4);
 74 |     int16x4_t d13 = vget_low_s16(q5);
 75 | 
 76 |     float32x4_t q8 = vcvtq_f32_s32(vmovl_s16(d10));
 77 |     float32x4_t q9 = vcvtq_f32_s32(vmovl_s16(d11));
 78 |     float32x4_t q10 = vcvtq_f32_s32(vmovl_s16(d12));
 79 |     float32x4_t q11 = vcvtq_f32_s32(vmovl_s16(d13));
 80 | 
 81 |     q8 = exp_ps(vmulq_n_f32(q8, scale));
 82 |     q9 = exp_ps(vmulq_n_f32(q9, scale));
 83 |     q10 = exp_ps(vmulq_n_f32(q10, scale));
 84 |     q11 = exp_ps(vmulq_n_f32(q11, scale));
 85 | 
 86 |     float32x4_t q12 = vaddq_f32(q8, q9);
 87 |     q12 = vaddq_f32(q12, q10);
 88 |     q12 = vaddq_f32(q12, q11);
 89 |     q12 = vrecpeq_f32(q12);
 90 | 
 91 |     q8 = vmulq_f32(q12, q8);
 92 |     q9 = vmulq_f32(q12, q9);
 93 |     q10 = vmulq_f32(q12, q10);
 94 |     q11 = vmulq_f32(q12, q11);
 95 | 
 96 |     float32x4x4_t b0 = {q8, q9, q10, q11};
 97 |     vst4q_f32(output, b0);
 98 |     output += 16;
 99 | 
100 |     /* Process last 4 groups */
101 |     d10 = vget_high_s16(q2);
102 |     d11 = vget_high_s16(q3);
103 |     d12 = vget_high_s16(q4);
104 |     d13 = vget_high_s16(q5);
105 | 
106 |     q8 = vcvtq_f32_s32(vmovl_s16(d10));
107 |     q9 = vcvtq_f32_s32(vmovl_s16(d11));
108 |     q10 = vcvtq_f32_s32(vmovl_s16(d12));
109 |     q11 = vcvtq_f32_s32(vmovl_s16(d13));
110 | 
111 |     q8 = exp_ps(vmulq_n_f32(q8, scale));
112 |     q9 = exp_ps(vmulq_n_f32(q9, scale));
113 |     q10 = exp_ps(vmulq_n_f32(q10, scale));
114 |     q11 = exp_ps(vmulq_n_f32(q11, scale));
115 | 
116 |     q12 = vaddq_f32(q8, q9);
117 |     q12 = vaddq_f32(q12, q10);
118 |     q12 = vaddq_f32(q12, q11);
119 |     q12 = vrecpeq_f32(q12);
120 | 
121 |     q8 = vmulq_f32(q12, q8);
122 |     q9 = vmulq_f32(q12, q9);
123 |     q10 = vmulq_f32(q12, q10);
124 |     q11 = vmulq_f32(q12, q11);
125 | 
126 |     float32x4x4_t b1 = {q8, q9, q10, q11};
127 |     vst4q_f32(output, b1);
128 |     output += 16;
129 | 
130 |     input += 32;
131 |   }
132 | }
133 | 
134 | /*
135 |  * Assume group is divided by 8
136 |  */
137 | void softmax2_internal(const int8_t* input, float scale,
138 |     unsigned int group, float* output) {
139 |   unsigned int batch = group / 8;
140 | 
141 |   for (unsigned int i = 0; i < batch; ++i) {
142 |     /* Interleaved load 16 bytes into 2 NEON registers */
143 |     int8x8x2_t q0 = vld2_s8(input);
144 |     /* Convert to 16-bit integers */
145 |     int16x8_t q1 = vmovl_s8(q0.val[0]);
146 |     int16x8_t q2 = vmovl_s8(q0.val[1]);
147 |     
148 |     int16x4_t d2 = vget_low_s16(q1);
149 |     int16x4_t d3 = vget_high_s16(q1);
150 |     int16x4_t d4 = vget_low_s16(q2);
151 |     int16x4_t d5 = vget_high_s16(q2);
152 | 
153 |     /* Process first 4 groups */
154 |     float32x4_t q3 = vcvtq_f32_s32(vmovl_s16(d2));
155 |     float32x4_t q4 = vcvtq_f32_s32(vmovl_s16(d4));
156 |     q3 = exp_ps(vmulq_n_f32(q3, scale));
157 |     q4 = exp_ps(vmulq_n_f32(q4, scale));
158 | 
159 |     float32x4_t q7 = vaddq_f32(q3, q4);
160 |     q7 = vrecpeq_f32(q7);
161 |     q3 = vmulq_f32(q7, q3);
162 |     q4 = vmulq_f32(q7, q4);
163 | 
164 |     /* Process last 4 groups */
165 |     float32x4_t q5 = vcvtq_f32_s32(vmovl_s16(d3));
166 |     float32x4_t q6 = vcvtq_f32_s32(vmovl_s16(d5));
167 |     q5 = exp_ps(vmulq_n_f32(q5, scale));
168 |     q6 = exp_ps(vmulq_n_f32(q6, scale));
169 | 
170 |     float32x4_t q8 = vaddq_f32(q5, q6);
171 |     q8 = vrecpeq_f32(q8);
172 |     q5 = vmulq_f32(q8, q5);
173 |     q6 = vmulq_f32(q8, q6);
174 | 
175 |     /* Save to memory */
176 |     float32x4x2_t b0 = {q3, q4};
177 |     vst2q_f32(output, b0);
178 |     output += 8;
179 |     float32x4x2_t b1 = {q5, q6};
180 |     vst2q_f32(output, b1);
181 |     output += 8;
182 | 
183 |     input += 16;
184 |   }
185 | }
186 | 
187 | /*
188 |  * Assume group is divided by 8
189 |  */
190 | void softmax99_internal(const int8_t* input, float scale,
191 |     unsigned int group, float* output) {
192 |   unsigned int batch = group / 8;
193 | 
194 |   for (unsigned int i = 0; i < batch; ++i) {
195 |     /* Interleaved load 32 bytes into 4 NEON registers */
196 |     int8x8x4_t q01 = vld4_s8(input);
197 |     /* Convert to 16-bit integers */
198 |     int16x8_t q2 = vmovl_s8(q01.val[0]);
199 |     int16x8_t q3 = vmovl_s8(q01.val[1]);
200 |     int16x8_t q4 = vmovl_s8(q01.val[2]);
201 |     int16x8_t q5 = vmovl_s8(q01.val[3]);
202 |     
203 |     /* Process first 4 groups */
204 |     int16x4_t d10 = vget_low_s16(q2);
205 |     int16x4_t d11 = vget_low_s16(q3);
206 |     int16x4_t d12 = vget_low_s16(q4);
207 |     int16x4_t d13 = vget_low_s16(q5);
208 | 
209 |     float32x4_t q8 = vcvtq_f32_s32(vmovl_s16(d10));
210 |     float32x4_t q9 = vcvtq_f32_s32(vmovl_s16(d11));
211 |     float32x4_t q10 = vcvtq_f32_s32(vmovl_s16(d12));
212 |     float32x4_t q11 = vcvtq_f32_s32(vmovl_s16(d13));
213 | 
214 |     q8 = exp_ps(vmulq_n_f32(q8, scale));
215 |     q9 = exp_ps(vmulq_n_f32(q9, scale));
216 |     q10 = exp_ps(vmulq_n_f32(q10, scale));
217 |     q11 = exp_ps(vmulq_n_f32(q11, scale));
218 | 
219 |     float32x4_t q12 = vaddq_f32(q8, q9);
220 |     q12 = vaddq_f32(q12, q10);
221 |     q12 = vaddq_f32(q12, q11);
222 |     q12 = vrecpeq_f32(q12);
223 | 
224 |     q8 = vmulq_f32(q12, q8);
225 |     q9 = vmulq_f32(q12, q9);
226 |     q10 = vmulq_f32(q12, q10);
227 |     q11 = vmulq_f32(q12, q11);
228 | 
229 |     float32x4x4_t b0 = {q8, q9, q10, q11};
230 |     vst4q_f32(output, b0);
231 |     output += 16;
232 | 
233 |     /* Process last 4 groups */
234 |     d10 = vget_high_s16(q2);
235 |     d11 = vget_high_s16(q3);
236 |     d12 = vget_high_s16(q4);
237 |     d13 = vget_high_s16(q5);
238 | 
239 |     q8 = vcvtq_f32_s32(vmovl_s16(d10));
240 |     q9 = vcvtq_f32_s32(vmovl_s16(d11));
241 |     q10 = vcvtq_f32_s32(vmovl_s16(d12));
242 |     q11 = vcvtq_f32_s32(vmovl_s16(d13));
243 | 
244 |     q8 = exp_ps(vmulq_n_f32(q8, scale));
245 |     q9 = exp_ps(vmulq_n_f32(q9, scale));
246 |     q10 = exp_ps(vmulq_n_f32(q10, scale));
247 |     q11 = exp_ps(vmulq_n_f32(q11, scale));
248 | 
249 |     q12 = vaddq_f32(q8, q9);
250 |     q12 = vaddq_f32(q12, q10);
251 |     q12 = vaddq_f32(q12, q11);
252 |     q12 = vrecpeq_f32(q12);
253 | 
254 |     q8 = vmulq_f32(q12, q8);
255 |     q9 = vmulq_f32(q12, q9);
256 |     q10 = vmulq_f32(q12, q10);
257 |     q11 = vmulq_f32(q12, q11);
258 | 
259 |     float32x4x4_t b1 = {q8, q9, q10, q11};
260 |     vst4q_f32(output, b1);
261 |     output += 16;
262 | 
263 |     input += 32;
264 |   }
265 | }
266 | 
267 | 
268 | /*
269 | void softmax4(const int8_t* input, int group, float scale, float* output) {
270 |   int size = group * 4;
271 |   int count = size / 8;
272 |   // int remain = size % 8;
273 |   auto ptr = output;
274 |   // auto p2 = input;
275 |   for (auto i = 0; i < count; ++i) {
276 |     int8x8_t s8v = vld1_s8(input);
277 |     int16x8_t s16v = vmovl_s8(s8v);
278 | 
279 |     int16x4_t s16v0 = vget_low_s16(s16v);
280 |     int32x4_t s32v0 = vmovl_s16(s16v0);
281 |     float32x4_t f32v0 = vcvtq_f32_s32(s32v0);
282 |     f32v0 = vmulq_n_f32(f32v0, scale);
283 |     v4sf expv0 = exp_ps(f32v0);
284 |     // sum
285 |     float32x2_t sumv0 = vadd_f32(
286 |         vget_high_f32(expv0), vget_low_f32(expv0));
287 |     float32x2_t recv0 = vrecpe_f32(vpadd_f32(sumv0, sumv0));
288 |     expv0 = vmulq_n_f32(expv0, vget_lane_f32(recv0, 0));
289 |     vst1q_f32(ptr, expv0);
290 | 
291 |     int16x4_t s16v1 = vget_high_s16(s16v);
292 |     int32x4_t s32v1 = vmovl_s16(s16v1);
293 |     float32x4_t f32v1 = vcvtq_f32_s32(s32v1);
294 |     f32v1 = vmulq_n_f32(f32v1, scale);
295 |     v4sf exp_high = exp_ps(f32v1);
296 |     // sum
297 |     float32x2_t sumv1 = vadd_f32(
298 |         vget_high_f32(exp_high), vget_low_f32(exp_high));
299 |     float sum1 = vget_lane_f32(vpadd_f32(sumv1, sumv1), 0);
300 |     exp_high = vmulq_n_f32(exp_high, 1.0/sum1);
301 |     vst1q_f32(ptr+4, exp_high);
302 | 
303 |     input += 8;
304 |     ptr += 8;
305 |   }
306 | */
307 | 
308 | /*
309 |   if (remain > 0) {
310 |     for (auto i = 0; i < remain; ++i) {
311 |       // p1[i] = expf_neon(p2[i]*scale);
312 |       ptr[i] = exp(input[i]*scale);
313 |     }
314 |   }
315 | 
316 |   ptr = output;
317 |   for (auto i = 0; i < group; ++i) {
318 |     float s = sum4(ptr);
319 |     for (int j = 0; j < 4; ++j) ptr[j] /= s;
320 |     ptr += 4;
321 |   }
322 | */
323 | //}
324 | 
325 | 
326 | /*
327 | void softmax4_safe(const int8_t* input, int group, float scale, float* output) {
328 |   int align8 = (uint64_t)input % 8;
329 |   int align4 = align8 % 4;
330 |   if (align4 % 4) {
331 |     LOG(FATAL) << "Data must be 4-bytes aligned";
332 |   }
333 |   if (align8) {
334 |     softmax_group(input, scale, output);
335 |     input += 4;
336 |     group -= 1;
337 |     output += 4;
338 |   }
339 |   if (group % 2) {
340 |     auto offset = group * 4 - 4;
341 |     softmax_group(input + offset, scale, output + offset);
342 |     group -= 1;
343 |   }
344 |   softmax4(input, group, scale, output);
345 | }
346 | */
347 | 
348 | #define c_exp_hi 88.3762626647949f
349 | #define c_exp_lo -88.3762626647949f
350 | 
351 | #define c_cephes_LOG2EF 1.44269504088896341
352 | #define c_cephes_exp_C1 0.693359375
353 | #define c_cephes_exp_C2 -2.12194440e-4
354 | 
355 | #define c_cephes_exp_p0 1.9875691500E-4
356 | #define c_cephes_exp_p1 1.3981999507E-3
357 | #define c_cephes_exp_p2 8.3334519073E-3
358 | #define c_cephes_exp_p3 4.1665795894E-2
359 | #define c_cephes_exp_p4 1.6666665459E-1
360 | #define c_cephes_exp_p5 5.0000001201E-1
361 | 
362 | /* exp() computed for 4 float at once */
363 | v4sf exp_ps(v4sf x) {
364 |   v4sf tmp, fx;
365 | 
366 |   v4sf one = vdupq_n_f32(1);
367 |   x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
368 |   x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
369 | 
370 |   /* express exp(x) as exp(g + n*log(2)) */
371 |   fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
372 | 
373 |   /* perform a floorf */
374 |   tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
375 | 
376 |   /* if greater, substract 1 */
377 |   v4su mask = vcgtq_f32(tmp, fx);
378 |   mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
379 | 
380 | 
381 |   fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
382 | 
383 |   tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
384 |   v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
385 |   x = vsubq_f32(x, tmp);
386 |   x = vsubq_f32(x, z);
387 | 
388 |   static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1,
389 |       c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
390 |   v4sf y = vld1q_dup_f32(cephes_exp_p+0);
391 |   v4sf c1 = vld1q_dup_f32(cephes_exp_p+1);
392 |   v4sf c2 = vld1q_dup_f32(cephes_exp_p+2);
393 |   v4sf c3 = vld1q_dup_f32(cephes_exp_p+3);
394 |   v4sf c4 = vld1q_dup_f32(cephes_exp_p+4);
395 |   v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
396 | 
397 |   y = vmulq_f32(y, x);
398 |   z = vmulq_f32(x, x);
399 |   y = vaddq_f32(y, c1);
400 |   y = vmulq_f32(y, x);
401 |   y = vaddq_f32(y, c2);
402 |   y = vmulq_f32(y, x);
403 |   y = vaddq_f32(y, c3);
404 |   y = vmulq_f32(y, x);
405 |   y = vaddq_f32(y, c4);
406 |   y = vmulq_f32(y, x);
407 |   y = vaddq_f32(y, c5);
408 | 
409 |   y = vmulq_f32(y, z);
410 |   y = vaddq_f32(y, x);
411 |   y = vaddq_f32(y, one);
412 | 
413 |   /* build 2^n */
414 |   int32x4_t mm;
415 |   mm = vcvtq_s32_f32(fx);
416 |   mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
417 |   mm = vshlq_n_s32(mm, 23);
418 |   v4sf pow2n = vreinterpretq_f32_s32(mm);
419 | 
420 |   y = vmulq_f32(y, pow2n);
421 |   return y;
422 | }
423 | 
424 | 
425 | 


--------------------------------------------------------------------------------
/src/prior_boxes.cpp:
--------------------------------------------------------------------------------
  1 | #include "prior_boxes.hpp"
  2 | 
  3 | #include <cmath>
  4 | #include <algorithm>
  5 | 
  6 | namespace deephi {
  7 | 
  8 | using std::sqrt;
  9 | using std::vector;
 10 | using std::make_pair;
 11 | using std::make_shared;
 12 | using std::fill_n;
 13 | using std::copy_n;
 14 | 
 15 | PriorBoxes::PriorBoxes(int image_width, int image_height,
 16 |     int layer_width, int layer_height,
 17 |     const vector<float>& variances,
 18 |     const vector<float>& min_sizes, const vector<float>& max_sizes,
 19 |     const vector<float>& aspect_ratios, float offset,
 20 |     float step_width, float step_height, bool flip, bool clip) :
 21 |         offset_(offset), clip_(clip) {
 22 | 
 23 |   // CHECK_GT(min_sizes.size(), 0);
 24 |   // if (!max_sizes.empty()) CHECK_EQ(min_sizes.size(), max_sizes.size());
 25 | 
 26 |   // Store image dimensions and layer dimensions
 27 |   image_dims_ = make_pair(image_width, image_height);
 28 |   layer_dims_ = make_pair(layer_width, layer_height);
 29 | 
 30 |   // Compute step width and height
 31 |   if (step_width == 0 || step_height == 0) {
 32 |     step_dims_ = make_pair(
 33 |        static_cast<float>(image_dims_.first) / layer_dims_.first,
 34 |        static_cast<float>(image_dims_.second) / layer_dims_.second);
 35 |   } else {
 36 |     step_dims_ = make_pair(step_width, step_height);
 37 |   }
 38 | 
 39 |   // Store box variances
 40 |   if (variances.size() == 4) {
 41 |     variances_ = variances;
 42 |   } else if (variances.size() == 1) {
 43 |     variances_.resize(4);
 44 |     fill_n(variances_.begin(), 4, variances[0]);
 45 |   } else {
 46 |     variances_.resize(4);
 47 |     fill_n(variances_.begin(), 4, 0.1f);
 48 |   }
 49 | 
 50 |   // Generate boxes' dimensions
 51 |   for (auto i = 0; i < min_sizes.size(); ++i) {
 52 |     // first prior: aspect_ratio = 1, size = min_size
 53 |     boxes_dims_.emplace_back(min_sizes[i], min_sizes[i]);
 54 |     // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
 55 |     if (!max_sizes.empty()) {
 56 |       boxes_dims_.emplace_back(
 57 |           sqrt(min_sizes[i] * max_sizes[i]),
 58 |           sqrt(min_sizes[i] * max_sizes[i]));
 59 |     }
 60 |     // rest of priors
 61 |     for (auto ar : aspect_ratios) {
 62 |       float w = min_sizes[i] * sqrt(ar);
 63 |       float h = min_sizes[i] / sqrt(ar);
 64 |       boxes_dims_.emplace_back(w, h);
 65 |       if (flip) boxes_dims_.emplace_back(h, w);
 66 |     }
 67 |   }
 68 | 
 69 |   // automatically create priors
 70 |   CreatePriors();
 71 | 
 72 | }
 73 | 
 74 | void PriorBoxes::CreatePriors() {
 75 | 
 76 |   for (int h = 0; h < layer_dims_.second; ++h) {
 77 |     for (int w = 0; w < layer_dims_.first; ++w) {
 78 |       float center_x = (w + offset_) * step_dims_.first;
 79 |       float center_y = (h + offset_) * step_dims_.second;
 80 |       for (auto& dims : boxes_dims_) {
 81 |         auto box = make_shared<vector<float> >(12);
 82 |         // xmin, ymin, xmax, ymax
 83 |         (*box)[0] = (center_x - dims.first / 2.) / image_dims_.first;
 84 |         (*box)[1] = (center_y - dims.second / 2.) / image_dims_.second;
 85 |         (*box)[2] = (center_x + dims.first / 2.) / image_dims_.first;
 86 |         (*box)[3] = (center_y + dims.second / 2.) / image_dims_.second;
 87 | 
 88 |         if (clip_) {
 89 |           for (int i = 0; i < 4; ++i)
 90 |             (*box)[i] = std::min(std::max((*box)[i], 0.f), 1.f);
 91 |         }
 92 |         // variances
 93 |         copy_n(variances_.begin(), 4, box->data()+4);
 94 |         // centers and dimensions
 95 |         (*box)[8] = 0.5f * ((*box)[0] + (*box)[2]);
 96 |         (*box)[9] = 0.5f * ((*box)[1] + (*box)[3]);
 97 |         (*box)[10] = (*box)[2] - (*box)[0];
 98 |         (*box)[11] = (*box)[3] - (*box)[1];
 99 | 
100 |         priors_.push_back(std::move(box));
101 |       }
102 | 
103 |     }
104 |   }
105 | }
106 | 
107 | }  // namespace deephi
108 | 


--------------------------------------------------------------------------------
/src/prior_boxes.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DEEPHI_PRIORBOXES_HPP_
 2 | #define DEEPHI_PRIORBOXES_HPP_
 3 | 
 4 | #include <vector>
 5 | #include <memory>
 6 | #include <utility>
 7 | 
 8 | namespace deephi {
 9 | 
10 | class PriorBoxes {
11 |  
12 |  public:
13 |   PriorBoxes(int image_width, int image_height, 
14 |       int layer_width, int layer_height,
15 |       const std::vector<float>& variances,
16 |       const std::vector<float>& min_sizes, const std::vector<float>& max_sizes,
17 |       const std::vector<float>& aspect_ratios, float offset,
18 |       float step_width = 0.f, float step_height = 0.f,
19 |       bool flip = true, bool clip = false);
20 | 
21 |   const std::vector<std::shared_ptr<std::vector<float> > >& priors() const {
22 |     return priors_;
23 |   }
24 | 
25 |  protected:
26 |   
27 |   void CreatePriors();
28 | 
29 |   std::vector<std::shared_ptr<std::vector<float> > > priors_;
30 | 
31 |   std::pair<int, int> image_dims_;
32 |   std::pair<int, int> layer_dims_;
33 |   std::pair<float, float> step_dims_;
34 | 
35 |   std::vector<std::pair<float, float> > boxes_dims_;
36 | 
37 |   float offset_;
38 |   bool clip_;
39 | 
40 |   std::vector<float> variances_;
41 | };
42 | 
43 | }
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/ssd_detector.cpp:
--------------------------------------------------------------------------------
  1 | #include "ssd_detector.hpp"
  2 | 
  3 | #include <cmath>
  4 | #include <algorithm>
  5 | #include <functional>
  6 | #include <tuple>
  7 | #include <thread>
  8 | #include <iostream>
  9 | #include <opencv2/core/core.hpp>
 10 | 
 11 | #include "time_helper.hpp"
 12 | 
 13 | namespace deephi {
 14 | 
 15 | using namespace cv;
 16 | using namespace std;
 17 | 
 18 | SSDdetector::SSDdetector(unsigned int num_classes,  // int background_label_id,
 19 |                          CodeType code_type, bool variance_encoded_in_target,
 20 |                          unsigned int keep_top_k,
 21 |                          const vector<float>& confidence_threshold,
 22 |                          unsigned int nms_top_k, float nms_threshold, float eta,
 23 |                          const vector<shared_ptr<vector<float>>>& priors,
 24 |                          float scale, bool clip)
 25 |     : num_classes_(num_classes),
 26 |       // background_label_id_(background_label_id),
 27 |       code_type_(code_type),
 28 |       variance_encoded_in_target_(variance_encoded_in_target),
 29 |       keep_top_k_(keep_top_k),
 30 |       confidence_threshold_(confidence_threshold),
 31 |       nms_top_k_(nms_top_k),
 32 |       nms_threshold_(nms_threshold),
 33 |       eta_(eta),
 34 |       priors_(priors),
 35 |       scale_(scale),
 36 |       clip_(clip) {
 37 |   num_priors_ = priors_.size();
 38 |   nms_confidence_ = *std::min_element(
 39 |       confidence_threshold_.begin() + 1, confidence_threshold_.end());
 40 | }
 41 | 
 42 | template <typename T>
 43 | void SSDdetector::Detect(const T* loc_data, const float* conf_data,
 44 |                          MultiDetObjects* result, int location, int classification) {
 45 |   decoded_bboxes_.clear();
 46 |   const T(*bboxes)[4] = (const T(*)[4])loc_data;
 47 | 
 48 | //  __TIC__(Sort);
 49 |   int num_det = 0;
 50 |   vector<vector<int>> indices(num_classes_);
 51 |   vector<vector<pair<float, int>>> score_index_vec(num_classes_);
 52 | 
 53 | #if 0
 54 |   // Get top_k scores (with corresponding indices).
 55 |   GetMultiClassMaxScoreIndexMT(conf_data, 1, num_classes_ - 1,
 56 |                                &score_index_vec);
 57 | #else
 58 | /*
 59 |   int location = -1;
 60 |   float conf = 0;
 61 |   int classification = -1;
 62 |   for(int i = 0; i < num_priors_; ++i) {
 63 |     for(int j = 1; j < num_classes_; ++j) {
 64 |       int offset = i * num_classes_ + j;
 65 |       if (conf_data[offset] > confidence_threshold_[i] && conf_data[offset] > conf) {
 66 |         conf = conf_data[offset];
 67 |         location = i;
 68 |         classification = j;
 69 |       }
 70 |     }
 71 |   }
 72 |   score_index_vec[0].push_back(make_pair(conf, location));
 73 | */
 74 | 
 75 |   float conf = conf_data[location*num_classes_+classification];
 76 |   if(conf > confidence_threshold_[classification]) {
 77 |     score_index_vec[0].push_back(make_pair(conf, location));
 78 |     ApplyOneClassNMS(bboxes, conf_data, classification, score_index_vec[0], &(indices[0]));
 79 |     num_det = 1;
 80 |   }
 81 | #endif
 82 |   // for (int c = 1; c < num_classes_; ++c) {
 83 |   // if (c == background_label_id_) {
 84 |   //   continue;
 85 |   // }
 86 |   // GetOneClassMaxScoreIndex(conf_data, c, &score_index_vec[c]);
 87 |   // }
 88 | 
 89 | //  __TOC__(Sort);
 90 | //  __TIC__(NMS);
 91 | #if 0
 92 |   for (int c = 1; c < num_classes_; ++c) {
 93 |     // Perform NMS for one class
 94 |     ApplyOneClassNMS(bboxes, conf_data, c, score_index_vec[c], &(indices[c]));
 95 | 
 96 |     num_det += indices[c].size();
 97 |   }
 98 | #endif
 99 | 
100 |   if (keep_top_k_ > 0 && num_det > keep_top_k_) {
101 |     vector<tuple<float, int, int>> score_index_tuples;
102 |     for (auto label = 0; label < num_classes_; ++label) {
103 |       const vector<int>& label_indices = indices[label];
104 |       for (auto j = 0; j < label_indices.size(); ++j) {
105 |         auto idx = label_indices[j];
106 |         auto score = conf_data[idx * num_classes_ + label];
107 |         score_index_tuples.emplace_back(score, label, idx);
108 |       }
109 |     }
110 | 
111 |     // Keep top k results per image.
112 |     std::sort(score_index_tuples.begin(), score_index_tuples.end(),
113 |               [](const tuple<float, int, int>& lhs,
114 |                  const tuple<float, int, int>& rhs) {
115 |                 return get<0>(lhs) > get<0>(rhs);
116 |               });
117 |     score_index_tuples.resize(keep_top_k_);
118 | 
119 |     indices.clear();
120 |     indices.resize(num_classes_);
121 |     for (auto& item : score_index_tuples) {
122 |       indices[get<1>(item)].push_back(get<2>(item));
123 |     }
124 | 
125 |     num_det = keep_top_k_;
126 |   }
127 | 
128 | //  __TOC__(NMS);
129 | 
130 | //  __TIC__(Box);
131 | /*
132 |   for (auto label = 1; label < indices.size(); ++label) {
133 |     for (auto idx : indices[label]) {
134 |       auto score = conf_data[idx * num_classes_ + label];
135 |       if (score < confidence_threshold_[label]) {
136 |         continue;
137 |       }
138 |       auto& bbox = decoded_bboxes_[idx];
139 | 	    bbox[0] = std::max(std::min(bbox[0], 1.f), 0.f);
140 | 	    bbox[1] = std::max(std::min(bbox[1], 1.f), 0.f);
141 | 	    bbox[2] = std::max(std::min(bbox[2], 1.f), 0.f);
142 | 	    bbox[3] = std::max(std::min(bbox[3], 1.f), 0.f);
143 | cout << "index: " << idx << endl;
144 | cout << "score: " << score << endl;
145 | cout << "lol: " << bbox[0] << " " << bbox[1] << " " << bbox[2] << " " << bbox[3] << endl;
146 | 	    auto box_rect = Rect_<float>(Point2f(bbox[0], bbox[1]),
147 |           Point2f(bbox[2], bbox[3]));
148 |       result->emplace_back(label, score, box_rect);
149 |     }
150 |   }
151 | */
152 |   if(conf > confidence_threshold_[classification]) {
153 |     auto& bbox = decoded_bboxes_[location];
154 |     bbox[0] = std::max(std::min(bbox[0], 1.f), 0.f);
155 |     bbox[1] = std::max(std::min(bbox[1], 1.f), 0.f);
156 |     bbox[2] = std::max(std::min(bbox[2], 1.f), 0.f);
157 |     bbox[3] = std::max(std::min(bbox[3], 1.f), 0.f);
158 |     auto box_rect = Rect_<float>(Point2f(bbox[0], bbox[1]),
159 |           Point2f(bbox[2], bbox[3]));
160 |     result->emplace_back(classification, conf, box_rect);
161 |   }
162 | 
163 | //  __TOC__(Box);
164 | }
165 | 
166 | template void SSDdetector::Detect(
167 |     const int* loc_data, const float* conf_data,
168 |     MultiDetObjects* result, int, int);
169 | template void SSDdetector::Detect(
170 |     const int8_t* loc_data, const float* conf_data,
171 |     MultiDetObjects* result, int, int);
172 | 
173 | template <typename T>
174 | void SSDdetector::ApplyOneClassNMS(
175 |     const T (*bboxes)[4], const float* conf_data, int label,
176 |     const vector<pair<float, int>>& score_index_vec, vector<int>* indices) {
177 |   // Get top_k scores (with corresponding indices).
178 |   // vector<pair<float, int> > score_index_vec;
179 |   // GetOneClassMaxScoreIndex(conf_data, label, &score_index_vec);
180 | 
181 |   // Do nms.
182 |   float adaptive_threshold = nms_threshold_;
183 |   indices->clear();
184 |   int i = 0;
185 |   while (i < score_index_vec.size()) {
186 | //	__TIC__(Decode)
187 |     const int idx = score_index_vec[i].second;
188 |     if (decoded_bboxes_.find(idx) == decoded_bboxes_.end()) {
189 |       DecodeBBox(bboxes, idx, true);
190 |     }
191 | //	__TOC__(Decode)
192 | //	__TIC__(OVERLAP)
193 |     bool keep = true;
194 |     for (int k = 0; k < indices->size(); ++k) {
195 |       if (keep) {
196 |         const int kept_idx = (*indices)[k];
197 |         float overlap = JaccardOverlap(bboxes, idx, kept_idx);
198 |         keep = overlap <= adaptive_threshold;
199 |       } else {
200 |         break;
201 |       }
202 |     }
203 |     if (keep) {
204 |       indices->push_back(idx);
205 |     }
206 |     ++i;
207 |     if (keep && eta_ < 1 && adaptive_threshold > 0.5) {
208 |       adaptive_threshold *= eta_;
209 |     }
210 | //	__TOC__(OVERLAP)
211 |   }
212 | }
213 | 
214 | template void SSDdetector::ApplyOneClassNMS(
215 |     const int (*bboxes)[4], const float* conf_data, int label,
216 |     const vector<pair<float, int>>& score_index_vec, vector<int>* indices);
217 | template void SSDdetector::ApplyOneClassNMS(
218 |     const int8_t (*bboxes)[4], const float* conf_data, int label,
219 |     const vector<pair<float, int>>& score_index_vec, vector<int>* indices);
220 | 
221 | void SSDdetector::GetOneClassMaxScoreIndex(
222 |     const float* conf_data, int label,
223 |     vector<pair<float, int>>* score_index_vec) {
224 |   //__TIC__(PUSH2)
225 |   conf_data += label;
226 |   for (int i = 0; i < num_priors_; ++i) {
227 |     auto score = *conf_data;
228 |     if (score > nms_confidence_) {
229 |       score_index_vec->emplace_back(score, i);
230 |     }
231 |     conf_data += num_classes_;
232 |   }
233 |   //__TOC__(PUSH2)
234 |   //__TIC__(SORT2)
235 |   std::stable_sort(
236 |       score_index_vec->begin(), score_index_vec->end(),
237 |       [](const pair<float, int>& lhs, const pair<float, int>& rhs) {
238 |         return lhs.first > rhs.first;
239 |       });
240 |   //__TOC__(SORT2)
241 | 
242 |   if (nms_top_k_ > -1 && nms_top_k_ < score_index_vec->size()) {
243 |     score_index_vec->resize(nms_top_k_);
244 |   }
245 | }
246 | 
247 | void SSDdetector::GetMultiClassMaxScoreIndex(
248 |     const float* conf_data, int start_label, int num_classes,
249 |     vector<vector<pair<float, int>>>* score_index_vec) {
250 |   for (auto i = start_label; i < start_label + num_classes; ++i) {
251 |     GetOneClassMaxScoreIndex(conf_data, i, &((*score_index_vec)[i]));
252 |   }
253 | }
254 | 
255 | void SSDdetector::GetMultiClassMaxScoreIndexMT(
256 |     const float* conf_data, int start_label, int num_classes,
257 |     vector<vector<pair<float, int>>>* score_index_vec, int threads) {
258 |   // CHECK_GT(threads, 0);
259 |   int thread_classes = num_classes / threads;
260 |   int last_thread_classes = num_classes % threads + thread_classes;
261 | 
262 |   vector<std::thread> workers;
263 | 
264 |   auto c = start_label;
265 |   for (auto i = 0; i < threads - 1; ++i) {
266 |     workers.emplace_back(&SSDdetector::GetMultiClassMaxScoreIndex, this,
267 |                          conf_data, c, thread_classes, score_index_vec);
268 |     c += thread_classes;
269 |   }
270 |   workers.emplace_back(&SSDdetector::GetMultiClassMaxScoreIndex, this,
271 |                        conf_data, c, last_thread_classes, score_index_vec);
272 | 
273 |   for (auto& worker : workers)
274 |     if (worker.joinable()) worker.join();
275 | }
276 | 
277 | void BBoxSize(vector<float>& bbox, bool normalized) {
278 |   float width = bbox[2] - bbox[0];
279 |   float height = bbox[3] - bbox[1];
280 |   if (width > 0 && height > 0) {
281 |     if (normalized) {
282 |       bbox[4] = width * height;
283 |     } else {
284 |       bbox[4] = (width + 1) * (height + 1);
285 |     }
286 |   } else {
287 |     bbox[4] = 0.f;
288 |   }
289 | }
290 | 
291 | float IntersectBBoxSize(const vector<float>& bbox1, const vector<float>& bbox2,
292 |                         bool normalized) {
293 |   if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] ||
294 |       bbox2[3] < bbox1[1]) {
295 |     // Return 0 if there is no intersection.
296 |     return 0.f;
297 |   }
298 | 
299 |   vector<float> intersect_bbox(5);
300 |   intersect_bbox[0] = max(bbox1[0], bbox2[0]);
301 |   intersect_bbox[1] = max(bbox1[1], bbox2[1]);
302 |   intersect_bbox[2] = min(bbox1[2], bbox2[2]);
303 |   intersect_bbox[3] = min(bbox1[3], bbox2[3]);
304 |   BBoxSize(intersect_bbox, normalized);
305 |   return intersect_bbox[4];
306 | }
307 | 
308 | /*
309 | void ClipBBox(const NormalizedBBox& bbox, NormalizedBBox* clip_bbox) {
310 |   clip_bbox->set_xmin(std::max(std::min(bbox.xmin(), 1.f), 0.f));
311 |   clip_bbox->set_ymin(std::max(std::min(bbox.ymin(), 1.f), 0.f));
312 |   clip_bbox->set_xmax(std::max(std::min(bbox.xmax(), 1.f), 0.f));
313 |   clip_bbox->set_ymax(std::max(std::min(bbox.ymax(), 1.f), 0.f));
314 |   clip_bbox->clear_size();
315 |   clip_bbox->set_size(BBoxSize(*clip_bbox));
316 |   clip_bbox->set_difficult(bbox.difficult());
317 | }
318 | 
319 | void ClipBBox(const NormalizedBBox& bbox, const float height, const float width,
320 |               NormalizedBBox* clip_bbox) {
321 |   clip_bbox->set_xmin(std::max(std::min(bbox.xmin(), width), 0.f));
322 |   clip_bbox->set_ymin(std::max(std::min(bbox.ymin(), height), 0.f));
323 |   clip_bbox->set_xmax(std::max(std::min(bbox.xmax(), width), 0.f));
324 |   clip_bbox->set_ymax(std::max(std::min(bbox.ymax(), height), 0.f));
325 |   clip_bbox->clear_size();
326 |   clip_bbox->set_size(BBoxSize(*clip_bbox));
327 |   clip_bbox->set_difficult(bbox.difficult());
328 | }
329 | */
330 | 
331 | template <typename T>
332 | float SSDdetector::JaccardOverlap(const T (*bboxes)[4], int idx, int kept_idx,
333 |                                   bool normalized) {
334 |   /*
335 |     if (decoded_bboxes_.find(idx) == decoded_bboxes_.end()) {
336 |       DecodeBBox(bboxes, idx, normalized);
337 |     }
338 |     if (decoded_bboxes_.find(kept_idx) == decoded_bboxes_.end()) {
339 |       DecodeBBox(bboxes, kept_idx, normalized);
340 |     }
341 |   */
342 |   const vector<float>& bbox1 = decoded_bboxes_[idx];
343 |   const vector<float>& bbox2 = decoded_bboxes_[kept_idx];
344 |   float intersect_size = IntersectBBoxSize(bbox1, bbox2, normalized);
345 |   return intersect_size <= 0 ? 0 : intersect_size /
346 |                                        (bbox1[4] + bbox2[4] - intersect_size);
347 | }
348 | 
349 | template float SSDdetector::JaccardOverlap(const int (*bboxes)[4], int idx,
350 |                                            int kept_idx, bool normalized);
351 | template float SSDdetector::JaccardOverlap(const int8_t (*bboxes)[4], int idx,
352 |                                            int kept_idx, bool normalized);
353 | 
354 | template <typename T>
355 | void SSDdetector::DecodeBBox(const T (*bboxes)[4], int idx, bool normalized) {
356 |   vector<float> bbox(5, 0);
357 |   // scale bboxes
358 |   transform(bboxes[idx], bboxes[idx] + 4, bbox.begin(),
359 |             std::bind2nd(multiplies<float>(), scale_));
360 |   for (int i =0; i < 1; i++) {
361 | 	// LOG(INFO) << "lalalal========" << bbox[0] << " " << bbox[1] << " " << bbox[2] << " " << bbox[3] << " " << bbox[4];
362 |   }
363 |   auto& prior_bbox = priors_[idx];
364 | 
365 |   if (code_type_ == CodeType::CORNER) {
366 |     if (variance_encoded_in_target_) {
367 |       // variance is encoded in target, we simply need to add the offset
368 |       // predictions.
369 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin(), bbox.begin(),
370 |                 plus<float>());
371 |     } else {
372 |       // variance is encoded in bbox, we need to scale the offset accordingly.
373 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin() + 4, bbox.begin(),
374 |                 multiplies<float>());
375 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin(), bbox.begin(),
376 |                 plus<float>());
377 |     }
378 |   } else if (code_type_ == CodeType::CENTER_SIZE) {
379 |     float decode_bbox_center_x, decode_bbox_center_y;
380 |     float decode_bbox_width, decode_bbox_height;
381 |     if (variance_encoded_in_target_) {
382 |       // variance is encoded in target, we simply need to retore the offset
383 |       // predictions.
384 |       decode_bbox_center_x = bbox[0] * (*prior_bbox)[10] + (*prior_bbox)[8];
385 |       decode_bbox_center_y = bbox[1] * (*prior_bbox)[11] + (*prior_bbox)[9];
386 |       decode_bbox_width = exp(bbox[2]) * (*prior_bbox)[10];
387 |       decode_bbox_height = exp(bbox[3]) * (*prior_bbox)[11];
388 |     } else {
389 |       // variance is encoded in bbox, we need to scale the offset accordingly.
390 |       decode_bbox_center_x =
391 |           (*prior_bbox)[4] * bbox[0] * (*prior_bbox)[10] + (*prior_bbox)[8];
392 |       decode_bbox_center_y =
393 |           (*prior_bbox)[5] * bbox[1] * (*prior_bbox)[11] + (*prior_bbox)[9];
394 |       decode_bbox_width = exp((*prior_bbox)[6] * bbox[2]) * (*prior_bbox)[10];
395 |       decode_bbox_height = exp((*prior_bbox)[7] * bbox[3]) * (*prior_bbox)[11];
396 |     }
397 | 
398 |     bbox[0] = decode_bbox_center_x - decode_bbox_width / 2.;
399 |     bbox[1] = decode_bbox_center_y - decode_bbox_height / 2.;
400 |     bbox[2] = decode_bbox_center_x + decode_bbox_width / 2.;
401 |     bbox[3] = decode_bbox_center_y + decode_bbox_height / 2.;
402 |   } else if (code_type_ == CodeType::CORNER_SIZE) {
403 |     if (variance_encoded_in_target_) {
404 |       // variance is encoded in target, we simply need to add the offset
405 |       // predictions.
406 |       bbox[0] *= (*prior_bbox)[10];
407 |       bbox[1] *= (*prior_bbox)[11];
408 |       bbox[2] *= (*prior_bbox)[10];
409 |       bbox[3] *= (*prior_bbox)[11];
410 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin(), bbox.begin(),
411 |                 plus<float>());
412 |     } else {
413 |       // variance is encoded in bbox, we need to scale the offset accordingly.
414 |       bbox[0] *= (*prior_bbox)[10];
415 |       bbox[1] *= (*prior_bbox)[11];
416 |       bbox[2] *= (*prior_bbox)[10];
417 |       bbox[3] *= (*prior_bbox)[11];
418 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin() + 4, bbox.begin(),
419 |                 multiplies<float>());
420 |       transform(bbox.begin(), bbox.end(), prior_bbox->begin(), bbox.begin(),
421 |                 plus<float>());
422 |     }
423 |   } else {
424 |     // LOG(FATAL) << "Unknown LocLossType.";
425 |   }
426 | 
427 |   BBoxSize(bbox, normalized);
428 |   // if (clip_) {
429 |   //   ClipBBox(*bbox, normalized);
430 |   // }
431 | 
432 |   decoded_bboxes_.emplace(idx, std::move(bbox));
433 | }
434 | 
435 | template void SSDdetector::DecodeBBox(const int (*bboxes)[4], int idx,
436 |                                       bool normalized);
437 | template void SSDdetector::DecodeBBox(const int8_t (*bboxes)[4], int idx,
438 |                                       bool normalized);
439 | }
440 | 


--------------------------------------------------------------------------------
/src/ssd_detector.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DEEPHI_SSD_DETECTOR_HPP_
 2 | #define DEEPHI_SSD_DETECTOR_HPP_
 3 | 
 4 | #include <vector>
 5 | #include <memory>
 6 | #include <map>
 7 | #include <utility>
 8 | //#include "base/data.hpp"
 9 | #include <tuple>
10 | #include <opencv2/core.hpp>
11 | 
12 | namespace deephi {
13 | 
14 | using SingleDetObject = std::tuple<int, float, cv::Rect_<float> >;
15 | using MultiDetObjects = std::vector<SingleDetObject>;
16 | 
17 | class SSDdetector {
18 | 
19 |  public:
20 |   enum CodeType { CORNER, CENTER_SIZE, CORNER_SIZE };
21 | 
22 |   SSDdetector(unsigned int num_classes,// int background_label_id,
23 |       CodeType code_type, bool variance_encoded_in_target,
24 |       unsigned int keep_top_k,
25 |       const std::vector<float>& confidence_threshold,
26 |       unsigned int nms_top_k, float nms_threshold, float eta,
27 |       const std::vector<std::shared_ptr<std::vector<float> > >& priors,
28 |       float scale = 1.f, bool clip = false);
29 | 
30 |   template <typename T>
31 |   void Detect(const T* loc_data, const float* conf_data,
32 |               MultiDetObjects* result, int, int);
33 | 
34 |   unsigned int num_classes() const { return num_classes_; }
35 |   unsigned int num_priors() const { return priors_.size(); }
36 | 
37 |  protected:
38 | 
39 |   template <typename T>
40 |   void ApplyOneClassNMS(const T (*bboxes)[4], const float* conf_data,
41 |       int label, const std::vector<std::pair<float, int> >& score_index_vec,
42 |       std::vector<int>* indices);
43 | 
44 |   void GetOneClassMaxScoreIndex(const float* conf_data, int label,
45 |       std::vector<std::pair<float, int> >* score_index_vec);
46 | 
47 |   void GetMultiClassMaxScoreIndex(const float* conf_data,
48 |       int start_label, int num_classes,
49 |       std::vector<std::vector<std::pair<float, int> > >* score_index_vec);
50 | 
51 |   void GetMultiClassMaxScoreIndexMT(const float* conf_data,
52 |       int start_label, int num_classes,
53 |       std::vector<std::vector<std::pair<float, int> > >* score_index_vec,
54 |       int threads = 2);
55 | 
56 |   template <typename T>
57 |   float JaccardOverlap(const T (*bboxes)[4], int idx, int kept_idx,
58 |       bool normalized = true);
59 | 
60 |   template <typename T>
61 |   void DecodeBBox(const T (*bboxes)[4], int idx, bool normalized);
62 | 
63 |   std::map<int, std::vector<float> > decoded_bboxes_;
64 | 
65 |   const unsigned int num_classes_;
66 |   // int background_label_id_;
67 |   CodeType code_type_;
68 |   bool variance_encoded_in_target_;
69 |   unsigned int keep_top_k_;
70 |   std::vector<float> confidence_threshold_;
71 |   float nms_confidence_;
72 |   unsigned int nms_top_k_;
73 |   float nms_threshold_;
74 |   float eta_;
75 | 
76 |   const std::vector<std::shared_ptr<std::vector<float> > >& priors_;
77 |   float scale_;
78 | 
79 |   bool clip_;
80 | 
81 |   int num_priors_;
82 | };
83 | 
84 | }
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/time_helper.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __TIME_HELPER_HPP__
 2 | #define __TIME_HELPER_HPP__
 3 | 
 4 | #include <sys/time.h>
 5 | 
 6 | #define __AVE_TIC__(tag) static int ____##tag##_total_time=0; \
 7 |         static int ____##tag##_total_conut=0;\
 8 |         timeval ____##tag##_start_time, ____##tag##_end_time;\
 9 |         gettimeofday(&____##tag##_start_time, 0);
10 | 
11 | #define __AVE_TOC__(tag) gettimeofday(&____##tag##_end_time, 0); \
12 |         ____##tag##_total_conut++; \
13 |         ____##tag##_total_time+=((int)____##tag##_end_time.tv_sec-(int)____##tag##_start_time.tv_sec)*1000000+((int)____##tag##_end_time.tv_usec-(int)____##tag##_start_time.tv_usec); \
14 |         fprintf(stderr,  #tag ": %d us\n", ____##tag##_total_time/____##tag##_total_conut);
15 | 
16 | #define __TIC__(tag) timeval ____##tag##_start_time, ____##tag##_end_time;\
17 |         gettimeofday(&____##tag##_start_time, 0);
18 | 
19 | #define __TOC__(tag) gettimeofday(&____##tag##_end_time, 0); \
20 |         int ____##tag##_total_time=((int)____##tag##_end_time.tv_sec-(int)____##tag##_start_time.tv_sec)*1000000+((int)____##tag##_end_time.tv_usec-(int)____##tag##_start_time.tv_usec); \
21 |         fprintf(stderr,  #tag ": %d us\n", ____##tag##_total_time);
22 | 
23 | #endif // __TIME_HELPER_HPP__


--------------------------------------------------------------------------------
/train/README.md:
--------------------------------------------------------------------------------
1 | Note: Since we use the fixed-point instead of the float-point for data representation during training, we only provide the prototxts with fixed-point layers. 
2 | You can replace those layers with float-point ones and train with [Caffe-SSD](https://github.com/weiliu89/caffe/tree/ssd).
3 | 
4 | 


--------------------------------------------------------------------------------
/train/SSD_tgiif.sh:
--------------------------------------------------------------------------------
1 | #!bin/sh
2 | 
3 | PATH_TO_CAFFE=''
4 | 
5 | $PATH_TO_CAFFE/build/tools/caffe train \
6 |     --solver="solver.prototxt" \
7 |     --weights="VGG16_BN_FIXED_PRETRAINED.caffemodel" \
8 |     --gpu 0,1,2,3 2>&1 | tee logs/log_SSD_tgiif.log
9 | 


--------------------------------------------------------------------------------
/train/model_bnfixed_test.prototxt:
--------------------------------------------------------------------------------
   1 | name: "VGG_tgiif_test"
   2 | layer {
   3 |   name: "data"
   4 |   type: "AnnotatedData"
   5 |   top: "data"
   6 |   top: "label"
   7 |   include {
   8 |     phase: TEST
   9 |   }
  10 |   transform_param {
  11 |     mean_value: 104.0
  12 |     mean_value: 117.0
  13 |     mean_value: 123.0
  14 |     resize_param {
  15 |       prob: 1.0
  16 |       resize_mode: WARP
  17 |       height: 252
  18 |       width: 448
  19 |       interp_mode: LINEAR
  20 |     }
  21 |   }
  22 |   data_param {
  23 |     source: "example_0.7/val_lmdb"
  24 |     batch_size: 1
  25 |     multi_gpu_testing: false
  26 |     backend: LMDB
  27 |   }
  28 |   annotated_data_param {
  29 |     batch_sampler {
  30 |     }
  31 |     label_map_file: "lists/labelmap.prototxt"
  32 |   }
  33 | }
  34 | 
  35 | layer {
  36 |   name: "data_fixed"
  37 |   type: "FixedNeuron"
  38 |   bottom: "data"
  39 |   top: "data"
  40 |   param {
  41 |     lr_mult: 0
  42 |     decay_mult: 0
  43 |   }
  44 |   fixed_param {
  45 |     fixed_method: OVER_FLOW
  46 |     bit_width: 8
  47 |   }
  48 | }
  49 | layer {
  50 |   name: "conv1_1"
  51 |   type: "ConvolutionBNFixed"
  52 |   bottom: "data"
  53 |   top: "conv1_1"
  54 |   param {
  55 |     lr_mult: 1
  56 |     decay_mult: 1
  57 |   }
  58 |   param {
  59 |     lr_mult: 1
  60 |     decay_mult: 0
  61 |   }
  62 |   param {
  63 |     lr_mult: 1
  64 |     decay_mult: 0
  65 |   }
  66 |   param {
  67 |     lr_mult: 0
  68 |     decay_mult: 0
  69 |   }
  70 |   param {
  71 |     lr_mult: 0
  72 |     decay_mult: 0
  73 |   }
  74 |   convolution_param {
  75 |     num_output: 64
  76 |     bias_term: false
  77 |     pad: 1
  78 |     kernel_size: 3
  79 |     weight_filler {
  80 |       type: "msra"
  81 |     }
  82 |     bias_filler {
  83 |       type: "constant"
  84 |       value: 0
  85 |     }
  86 |   }
  87 |   batch_norm_param {
  88 |     scale_filler {
  89 |       type: "constant"
  90 |       value: 1
  91 |     }
  92 |     bias_filler {
  93 |       type: "constant"
  94 |       value: 0
  95 |     }
  96 |   }
  97 |   fixed_param {
  98 |     fixed_method: OVER_FLOW
  99 |     bit_width: 8
 100 |   }
 101 | }
 102 | layer {
 103 |   name: "relu1_1"
 104 |   type: "ReLU"
 105 |   bottom: "conv1_1"
 106 |   top: "conv1_1"
 107 | }
 108 | layer {
 109 |   name: "conv1_1_fixed"
 110 |   type: "FixedNeuron"
 111 |   bottom: "conv1_1"
 112 |   top: "conv1_1"
 113 |   param {
 114 |     lr_mult: 0
 115 |     decay_mult: 0
 116 |   }
 117 |   fixed_param {
 118 |     fixed_method: OVER_FLOW
 119 |     bit_width: 8
 120 |   }
 121 | }
 122 | layer {
 123 |   name: "conv1_2"
 124 |   type: "ConvolutionBNFixed"
 125 |   bottom: "conv1_1"
 126 |   top: "conv1_2"
 127 |   param {
 128 |     lr_mult: 1
 129 |     decay_mult: 1
 130 |   }
 131 |   param {
 132 |     lr_mult: 1
 133 |     decay_mult: 0
 134 |   }
 135 |   param {
 136 |     lr_mult: 1
 137 |     decay_mult: 0
 138 |   }
 139 |   param {
 140 |     lr_mult: 0
 141 |     decay_mult: 0
 142 |   }
 143 |   param {
 144 |     lr_mult: 0
 145 |     decay_mult: 0
 146 |   }
 147 |   convolution_param {
 148 |     num_output: 64
 149 |     bias_term: false
 150 |     pad: 1
 151 |     kernel_size: 3
 152 |     weight_filler {
 153 |       type: "msra"
 154 |     }
 155 |     bias_filler {
 156 |       type: "constant"
 157 |       value: 0
 158 |     }
 159 |   }
 160 |   batch_norm_param {
 161 |     scale_filler {
 162 |       type: "constant"
 163 |       value: 1
 164 |     }
 165 |     bias_filler {
 166 |       type: "constant"
 167 |       value: 0
 168 |     }
 169 |   }
 170 |   fixed_param {
 171 |     fixed_method: OVER_FLOW
 172 |     bit_width: 8
 173 |   }
 174 | }
 175 | layer {
 176 |   name: "relu1_2"
 177 |   type: "ReLU"
 178 |   bottom: "conv1_2"
 179 |   top: "conv1_2"
 180 | }
 181 | layer {
 182 |   name: "pool1"
 183 |   type: "Pooling"
 184 |   bottom: "conv1_2"
 185 |   top: "pool1"
 186 |   pooling_param {
 187 |     pool: MAX
 188 |     kernel_size: 2
 189 |     stride: 2
 190 |   }
 191 | }
 192 | layer {
 193 |   name: "pool1_fixed"
 194 |   type: "FixedNeuron"
 195 |   bottom: "pool1"
 196 |   top: "pool1"
 197 |   param {
 198 |     lr_mult: 0
 199 |     decay_mult: 0
 200 |   }
 201 |   fixed_param {
 202 |     fixed_method: OVER_FLOW
 203 |     bit_width: 8
 204 |   }
 205 | }
 206 | layer {
 207 |   name: "conv2_1"
 208 |   type: "ConvolutionBNFixed"
 209 |   bottom: "pool1"
 210 |   top: "conv2_1"
 211 |   param {
 212 |     lr_mult: 1
 213 |     decay_mult: 1
 214 |   }
 215 |   param {
 216 |     lr_mult: 1
 217 |     decay_mult: 0
 218 |   }
 219 |   param {
 220 |     lr_mult: 1
 221 |     decay_mult: 0
 222 |   }
 223 |   param {
 224 |     lr_mult: 0
 225 |     decay_mult: 0
 226 |   }
 227 |   param {
 228 |     lr_mult: 0
 229 |     decay_mult: 0
 230 |   }
 231 |   convolution_param {
 232 |     num_output: 128
 233 |     bias_term: false
 234 |     pad: 1
 235 |     kernel_size: 3
 236 |     weight_filler {
 237 |       type: "msra"
 238 |     }
 239 |     bias_filler {
 240 |       type: "constant"
 241 |       value: 0
 242 |     }
 243 |   }
 244 |   batch_norm_param {
 245 |     scale_filler {
 246 |       type: "constant"
 247 |       value: 1
 248 |     }
 249 |     bias_filler {
 250 |       type: "constant"
 251 |       value: 0
 252 |     }
 253 |   }
 254 |   fixed_param {
 255 |     fixed_method: OVER_FLOW
 256 |     bit_width: 8
 257 |   }
 258 | }
 259 | layer {
 260 |   name: "relu2_1"
 261 |   type: "ReLU"
 262 |   bottom: "conv2_1"
 263 |   top: "conv2_1"
 264 | }
 265 | layer {
 266 |   name: "conv2_1_fixed"
 267 |   type: "FixedNeuron"
 268 |   bottom: "conv2_1"
 269 |   top: "conv2_1"
 270 |   param {
 271 |     lr_mult: 0
 272 |     decay_mult: 0
 273 |   }
 274 |   fixed_param {
 275 |     fixed_method: OVER_FLOW
 276 |     bit_width: 8
 277 |   }
 278 | }
 279 | layer {
 280 |   name: "conv2_2"
 281 |   type: "ConvolutionBNFixed"
 282 |   bottom: "conv2_1"
 283 |   top: "conv2_2"
 284 |   param {
 285 |     lr_mult: 1
 286 |     decay_mult: 1
 287 |   }
 288 |   param {
 289 |     lr_mult: 1
 290 |     decay_mult: 0
 291 |   }
 292 |   param {
 293 |     lr_mult: 1
 294 |     decay_mult: 0
 295 |   }
 296 |   param {
 297 |     lr_mult: 0
 298 |     decay_mult: 0
 299 |   }
 300 |   param {
 301 |     lr_mult: 0
 302 |     decay_mult: 0
 303 |   }
 304 |   convolution_param {
 305 |     num_output: 128
 306 |     bias_term: false
 307 |     pad: 1
 308 |     kernel_size: 3
 309 |     weight_filler {
 310 |       type: "msra"
 311 |     }
 312 |     bias_filler {
 313 |       type: "constant"
 314 |       value: 0
 315 |     }
 316 |   }
 317 |   batch_norm_param {
 318 |     scale_filler {
 319 |       type: "constant"
 320 |       value: 1
 321 |     }
 322 |     bias_filler {
 323 |       type: "constant"
 324 |       value: 0
 325 |     }
 326 |   }
 327 |   fixed_param {
 328 |     fixed_method: OVER_FLOW
 329 |     bit_width: 8
 330 |   }
 331 | }
 332 | layer {
 333 |   name: "relu2_2"
 334 |   type: "ReLU"
 335 |   bottom: "conv2_2"
 336 |   top: "conv2_2"
 337 | }
 338 | layer {
 339 |   name: "pool2"
 340 |   type: "Pooling"
 341 |   bottom: "conv2_2"
 342 |   top: "pool2"
 343 |   pooling_param {
 344 |     pool: MAX
 345 |     kernel_size: 2
 346 |     stride: 2
 347 |   }
 348 | }
 349 | layer {
 350 |   name: "pool2_fixed"
 351 |   type: "FixedNeuron"
 352 |   bottom: "pool2"
 353 |   top: "pool2"
 354 |   param {
 355 |     lr_mult: 0
 356 |     decay_mult: 0
 357 |   }
 358 |   fixed_param {
 359 |     fixed_method: OVER_FLOW
 360 |     bit_width: 8
 361 |   }
 362 | }
 363 | layer {
 364 |   name: "conv3_1"
 365 |   type: "ConvolutionBNFixed"
 366 |   bottom: "pool2"
 367 |   top: "conv3_1"
 368 |   param {
 369 |     lr_mult: 1
 370 |     decay_mult: 1
 371 |   }
 372 |   param {
 373 |     lr_mult: 1
 374 |     decay_mult: 0
 375 |   }
 376 |   param {
 377 |     lr_mult: 1
 378 |     decay_mult: 0
 379 |   }
 380 |   param {
 381 |     lr_mult: 0
 382 |     decay_mult: 0
 383 |   }
 384 |   param {
 385 |     lr_mult: 0
 386 |     decay_mult: 0
 387 |   }
 388 |   convolution_param {
 389 |     num_output: 256
 390 |     bias_term: false
 391 |     pad: 1
 392 |     kernel_size: 3
 393 |     weight_filler {
 394 |       type: "msra"
 395 |     }
 396 |     bias_filler {
 397 |       type: "constant"
 398 |       value: 0
 399 |     }
 400 |   }
 401 |   batch_norm_param {
 402 |     scale_filler {
 403 |       type: "constant"
 404 |       value: 1
 405 |     }
 406 |     bias_filler {
 407 |       type: "constant"
 408 |       value: 0
 409 |     }
 410 |   }
 411 |   fixed_param {
 412 |     fixed_method: OVER_FLOW
 413 |     bit_width: 8
 414 |   }
 415 | }
 416 | layer {
 417 |   name: "relu3_1"
 418 |   type: "ReLU"
 419 |   bottom: "conv3_1"
 420 |   top: "conv3_1"
 421 | }
 422 | layer {
 423 |   name: "conv3_1_fixed"
 424 |   type: "FixedNeuron"
 425 |   bottom: "conv3_1"
 426 |   top: "conv3_1"
 427 |   param {
 428 |     lr_mult: 0
 429 |     decay_mult: 0
 430 |   }
 431 |   fixed_param {
 432 |     fixed_method: OVER_FLOW
 433 |     bit_width: 8
 434 |   }
 435 | }
 436 | layer {
 437 |   name: "conv3_2"
 438 |   type: "ConvolutionBNFixed"
 439 |   bottom: "conv3_1"
 440 |   top: "conv3_2"
 441 |   param {
 442 |     lr_mult: 1
 443 |     decay_mult: 1
 444 |   }
 445 |   param {
 446 |     lr_mult: 1
 447 |     decay_mult: 0
 448 |   }
 449 |   param {
 450 |     lr_mult: 1
 451 |     decay_mult: 0
 452 |   }
 453 |   param {
 454 |     lr_mult: 0
 455 |     decay_mult: 0
 456 |   }
 457 |   param {
 458 |     lr_mult: 0
 459 |     decay_mult: 0
 460 |   }
 461 |   convolution_param {
 462 |     num_output: 256
 463 |     bias_term: false
 464 |     pad: 1
 465 |     kernel_size: 3
 466 |     weight_filler {
 467 |       type: "msra"
 468 |     }
 469 |     bias_filler {
 470 |       type: "constant"
 471 |       value: 0
 472 |     }
 473 |   }
 474 |   batch_norm_param {
 475 |     scale_filler {
 476 |       type: "constant"
 477 |       value: 1
 478 |     }
 479 |     bias_filler {
 480 |       type: "constant"
 481 |       value: 0
 482 |     }
 483 |   }
 484 |   fixed_param {
 485 |     fixed_method: OVER_FLOW
 486 |     bit_width: 8
 487 |   }
 488 | }
 489 | layer {
 490 |   name: "relu3_2"
 491 |   type: "ReLU"
 492 |   bottom: "conv3_2"
 493 |   top: "conv3_2"
 494 | }
 495 | layer {
 496 |   name: "conv3_2_fixed"
 497 |   type: "FixedNeuron"
 498 |   bottom: "conv3_2"
 499 |   top: "conv3_2"
 500 |   param {
 501 |     lr_mult: 0
 502 |     decay_mult: 0
 503 |   }
 504 |   fixed_param {
 505 |     fixed_method: OVER_FLOW
 506 |     bit_width: 8
 507 |   }
 508 | }
 509 | layer {
 510 |   name: "conv3_3"
 511 |   type: "ConvolutionBNFixed"
 512 |   bottom: "conv3_2"
 513 |   top: "conv3_3"
 514 |   param {
 515 |     lr_mult: 1
 516 |     decay_mult: 1
 517 |   }
 518 |   param {
 519 |     lr_mult: 1
 520 |     decay_mult: 0
 521 |   }
 522 |   param {
 523 |     lr_mult: 1
 524 |     decay_mult: 0
 525 |   }
 526 |   param {
 527 |     lr_mult: 0
 528 |     decay_mult: 0
 529 |   }
 530 |   param {
 531 |     lr_mult: 0
 532 |     decay_mult: 0
 533 |   }
 534 |   convolution_param {
 535 |     num_output: 256
 536 |     bias_term: false
 537 |     pad: 1
 538 |     kernel_size: 3
 539 |     weight_filler {
 540 |       type: "msra"
 541 |     }
 542 |     bias_filler {
 543 |       type: "constant"
 544 |       value: 0
 545 |     }
 546 |   }
 547 |   batch_norm_param {
 548 |     scale_filler {
 549 |       type: "constant"
 550 |       value: 1
 551 |     }
 552 |     bias_filler {
 553 |       type: "constant"
 554 |       value: 0
 555 |     }
 556 |   }
 557 |   fixed_param {
 558 |     fixed_method: OVER_FLOW
 559 |     bit_width: 8
 560 |   }
 561 | }
 562 | layer {
 563 |   name: "relu3_3"
 564 |   type: "ReLU"
 565 |   bottom: "conv3_3"
 566 |   top: "conv3_3"
 567 | }
 568 | layer {
 569 |   name: "pool3"
 570 |   type: "Pooling"
 571 |   bottom: "conv3_3"
 572 |   top: "pool3"
 573 |   pooling_param {
 574 |     pool: MAX
 575 |     kernel_size: 2
 576 |     stride: 2
 577 |   }
 578 | }
 579 | layer {
 580 |   name: "pool3_fixed"
 581 |   type: "FixedNeuron"
 582 |   bottom: "pool3"
 583 |   top: "pool3"
 584 |   param {
 585 |     lr_mult: 0
 586 |     decay_mult: 0
 587 |   }
 588 |   fixed_param {
 589 |     fixed_method: OVER_FLOW
 590 |     bit_width: 8
 591 |   }
 592 | }
 593 | layer {
 594 |   name: "conv4_1"
 595 |   type: "ConvolutionBNFixed"
 596 |   bottom: "pool3"
 597 |   top: "conv4_1"
 598 |   param {
 599 |     lr_mult: 1
 600 |     decay_mult: 1
 601 |   }
 602 |   param {
 603 |     lr_mult: 1
 604 |     decay_mult: 0
 605 |   }
 606 |   param {
 607 |     lr_mult: 1
 608 |     decay_mult: 0
 609 |   }
 610 |   param {
 611 |     lr_mult: 0
 612 |     decay_mult: 0
 613 |   }
 614 |   param {
 615 |     lr_mult: 0
 616 |     decay_mult: 0
 617 |   }
 618 |   convolution_param {
 619 |     num_output: 512
 620 |     bias_term: false
 621 |     pad: 1
 622 |     kernel_size: 3
 623 |     weight_filler {
 624 |       type: "msra"
 625 |     }
 626 |     bias_filler {
 627 |       type: "constant"
 628 |       value: 0
 629 |     }
 630 |   }
 631 |   batch_norm_param {
 632 |     scale_filler {
 633 |       type: "constant"
 634 |       value: 1
 635 |     }
 636 |     bias_filler {
 637 |       type: "constant"
 638 |       value: 0
 639 |     }
 640 |   }
 641 |   fixed_param {
 642 |     fixed_method: OVER_FLOW
 643 |     bit_width: 8
 644 |   }
 645 | }
 646 | layer {
 647 |   name: "relu4_1"
 648 |   type: "ReLU"
 649 |   bottom: "conv4_1"
 650 |   top: "conv4_1"
 651 | }
 652 | layer {
 653 |   name: "conv4_1_fixed"
 654 |   type: "FixedNeuron"
 655 |   bottom: "conv4_1"
 656 |   top: "conv4_1"
 657 |   param {
 658 |     lr_mult: 0
 659 |     decay_mult: 0
 660 |   }
 661 |   fixed_param {
 662 |     fixed_method: OVER_FLOW
 663 |     bit_width: 8
 664 |   }
 665 | }
 666 | layer {
 667 |   name: "conv4_2"
 668 |   type: "ConvolutionBNFixed"
 669 |   bottom: "conv4_1"
 670 |   top: "conv4_2"
 671 |   param {
 672 |     lr_mult: 1
 673 |     decay_mult: 1
 674 |   }
 675 |   param {
 676 |     lr_mult: 1
 677 |     decay_mult: 0
 678 |   }
 679 |   param {
 680 |     lr_mult: 1
 681 |     decay_mult: 0
 682 |   }
 683 |   param {
 684 |     lr_mult: 0
 685 |     decay_mult: 0
 686 |   }
 687 |   param {
 688 |     lr_mult: 0
 689 |     decay_mult: 0
 690 |   }
 691 |   convolution_param {
 692 |     num_output: 512
 693 |     bias_term: false
 694 |     pad: 1
 695 |     kernel_size: 3
 696 |     weight_filler {
 697 |       type: "msra"
 698 |     }
 699 |     bias_filler {
 700 |       type: "constant"
 701 |       value: 0
 702 |     }
 703 |   }
 704 |   batch_norm_param {
 705 |     scale_filler {
 706 |       type: "constant"
 707 |       value: 1
 708 |     }
 709 |     bias_filler {
 710 |       type: "constant"
 711 |       value: 0
 712 |     }
 713 |   }
 714 |   fixed_param {
 715 |     fixed_method: OVER_FLOW
 716 |     bit_width: 8
 717 |   }
 718 | }
 719 | layer {
 720 |   name: "relu4_2"
 721 |   type: "ReLU"
 722 |   bottom: "conv4_2"
 723 |   top: "conv4_2"
 724 | }
 725 | layer {
 726 |   name: "conv4_2_fixed"
 727 |   type: "FixedNeuron"
 728 |   bottom: "conv4_2"
 729 |   top: "conv4_2"
 730 |   param {
 731 |     lr_mult: 0
 732 |     decay_mult: 0
 733 |   }
 734 |   fixed_param {
 735 |     fixed_method: OVER_FLOW
 736 |     bit_width: 8
 737 |   }
 738 | }
 739 | layer {
 740 |   name: "conv4_3"
 741 |   type: "ConvolutionBNFixed"
 742 |   bottom: "conv4_2"
 743 |   top: "conv4_3"
 744 |   param {
 745 |     lr_mult: 1
 746 |     decay_mult: 1
 747 |   }
 748 |   param {
 749 |     lr_mult: 1
 750 |     decay_mult: 0
 751 |   }
 752 |   param {
 753 |     lr_mult: 1
 754 |     decay_mult: 0
 755 |   }
 756 |   param {
 757 |     lr_mult: 0
 758 |     decay_mult: 0
 759 |   }
 760 |   param {
 761 |     lr_mult: 0
 762 |     decay_mult: 0
 763 |   }
 764 |   convolution_param {
 765 |     num_output: 512
 766 |     bias_term: false
 767 |     pad: 1
 768 |     kernel_size: 3
 769 |     weight_filler {
 770 |       type: "msra"
 771 |     }
 772 |     bias_filler {
 773 |       type: "constant"
 774 |       value: 0
 775 |     }
 776 |   }
 777 |   batch_norm_param {
 778 |     scale_filler {
 779 |       type: "constant"
 780 |       value: 1
 781 |     }
 782 |     bias_filler {
 783 |       type: "constant"
 784 |       value: 0
 785 |     }
 786 |   }
 787 |   fixed_param {
 788 |     fixed_method: OVER_FLOW
 789 |     bit_width: 8
 790 |   }
 791 | }
 792 | layer {
 793 |   name: "relu4_3"
 794 |   type: "ReLU"
 795 |   bottom: "conv4_3"
 796 |   top: "conv4_3"
 797 | }
 798 | layer {
 799 |   name: "pool4"
 800 |   type: "Pooling"
 801 |   bottom: "conv4_3"
 802 |   top: "pool4"
 803 |   pooling_param {
 804 |     pool: MAX
 805 |     kernel_size: 2
 806 |     stride: 2
 807 |   }
 808 | }
 809 | layer {
 810 |   name: "pool4_fixed"
 811 |   type: "FixedNeuron"
 812 |   bottom: "pool4"
 813 |   top: "pool4"
 814 |   param {
 815 |     lr_mult: 0
 816 |     decay_mult: 0
 817 |   }
 818 |   fixed_param {
 819 |     fixed_method: OVER_FLOW
 820 |     bit_width: 8
 821 |   }
 822 | }
 823 | layer {
 824 |   name: "conv5_1"
 825 |   type: "ConvolutionBNFixed"
 826 |   bottom: "pool4"
 827 |   top: "conv5_1"
 828 |   param {
 829 |     lr_mult: 1
 830 |     decay_mult: 1
 831 |   }
 832 |   param {
 833 |     lr_mult: 1
 834 |     decay_mult: 0
 835 |   }
 836 |   param {
 837 |     lr_mult: 1
 838 |     decay_mult: 0
 839 |   }
 840 |   param {
 841 |     lr_mult: 0
 842 |     decay_mult: 0
 843 |   }
 844 |   param {
 845 |     lr_mult: 0
 846 |     decay_mult: 0
 847 |   }
 848 |   convolution_param {
 849 |     num_output: 512
 850 |     bias_term: false
 851 |     pad: 1
 852 |     kernel_size: 3
 853 |     weight_filler {
 854 |       type: "msra"
 855 |     }
 856 |     bias_filler {
 857 |       type: "constant"
 858 |       value: 0
 859 |     }
 860 |   }
 861 |   batch_norm_param {
 862 |     scale_filler {
 863 |       type: "constant"
 864 |       value: 1
 865 |     }
 866 |     bias_filler {
 867 |       type: "constant"
 868 |       value: 0
 869 |     }
 870 |   }
 871 |   fixed_param {
 872 |     fixed_method: OVER_FLOW
 873 |     bit_width: 8
 874 |   }
 875 | }
 876 | layer {
 877 |   name: "relu5_1"
 878 |   type: "ReLU"
 879 |   bottom: "conv5_1"
 880 |   top: "conv5_1"
 881 | }
 882 | layer {
 883 |   name: "conv5_1_fixed"
 884 |   type: "FixedNeuron"
 885 |   bottom: "conv5_1"
 886 |   top: "conv5_1"
 887 |   param {
 888 |     lr_mult: 0
 889 |     decay_mult: 0
 890 |   }
 891 |   fixed_param {
 892 |     fixed_method: OVER_FLOW
 893 |     bit_width: 8
 894 |   }
 895 | }
 896 | layer {
 897 |   name: "conv5_2"
 898 |   type: "ConvolutionBNFixed"
 899 |   bottom: "conv5_1"
 900 |   top: "conv5_2"
 901 |   param {
 902 |     lr_mult: 1
 903 |     decay_mult: 1
 904 |   }
 905 |   param {
 906 |     lr_mult: 1
 907 |     decay_mult: 0
 908 |   }
 909 |   param {
 910 |     lr_mult: 1
 911 |     decay_mult: 0
 912 |   }
 913 |   param {
 914 |     lr_mult: 0
 915 |     decay_mult: 0
 916 |   }
 917 |   param {
 918 |     lr_mult: 0
 919 |     decay_mult: 0
 920 |   }
 921 |   convolution_param {
 922 |     num_output: 512
 923 |     bias_term: false
 924 |     pad: 1
 925 |     kernel_size: 3
 926 |     weight_filler {
 927 |       type: "msra"
 928 |     }
 929 |     bias_filler {
 930 |       type: "constant"
 931 |       value: 0
 932 |     }
 933 |   }
 934 |   batch_norm_param {
 935 |     scale_filler {
 936 |       type: "constant"
 937 |       value: 1
 938 |     }
 939 |     bias_filler {
 940 |       type: "constant"
 941 |       value: 0
 942 |     }
 943 |   }
 944 |   fixed_param {
 945 |     fixed_method: OVER_FLOW
 946 |     bit_width: 8
 947 |   }
 948 | }
 949 | layer {
 950 |   name: "relu5_2"
 951 |   type: "ReLU"
 952 |   bottom: "conv5_2"
 953 |   top: "conv5_2"
 954 | }
 955 | layer {
 956 |   name: "conv5_2_fixed"
 957 |   type: "FixedNeuron"
 958 |   bottom: "conv5_2"
 959 |   top: "conv5_2"
 960 |   param {
 961 |     lr_mult: 0
 962 |     decay_mult: 0
 963 |   }
 964 |   fixed_param {
 965 |     fixed_method: OVER_FLOW
 966 |     bit_width: 8
 967 |   }
 968 | }
 969 | layer {
 970 |   name: "conv5_3"
 971 |   type: "ConvolutionBNFixed"
 972 |   bottom: "conv5_2"
 973 |   top: "conv5_3"
 974 |   param {
 975 |     lr_mult: 1
 976 |     decay_mult: 1
 977 |   }
 978 |   param {
 979 |     lr_mult: 1
 980 |     decay_mult: 0
 981 |   }
 982 |   param {
 983 |     lr_mult: 1
 984 |     decay_mult: 0
 985 |   }
 986 |   param {
 987 |     lr_mult: 0
 988 |     decay_mult: 0
 989 |   }
 990 |   param {
 991 |     lr_mult: 0
 992 |     decay_mult: 0
 993 |   }
 994 |   convolution_param {
 995 |     num_output: 512
 996 |     bias_term: false
 997 |     pad: 1
 998 |     kernel_size: 3
 999 |     weight_filler {
1000 |       type: "msra"
1001 |     }
1002 |     bias_filler {
1003 |       type: "constant"
1004 |       value: 0
1005 |     }
1006 |   }
1007 |   batch_norm_param {
1008 |     scale_filler {
1009 |       type: "constant"
1010 |       value: 1
1011 |     }
1012 |     bias_filler {
1013 |       type: "constant"
1014 |       value: 0
1015 |     }
1016 |   }
1017 |   fixed_param {
1018 |     fixed_method: OVER_FLOW
1019 |     bit_width: 8
1020 |   }
1021 | }
1022 | layer {
1023 |   name: "relu5_3"
1024 |   type: "ReLU"
1025 |   bottom: "conv5_3"
1026 |   top: "conv5_3"
1027 | }
1028 | layer {
1029 |   name: "pool5"
1030 |   type: "Pooling"
1031 |   bottom: "conv5_3"
1032 |   top: "pool5"
1033 |   pooling_param {
1034 |     pool: MAX
1035 |     kernel_size: 3
1036 |     stride: 1
1037 |     pad: 1
1038 |   }
1039 | }
1040 | layer {
1041 |   name: "pool5_fixed"
1042 |   type: "FixedNeuron"
1043 |   bottom: "pool5"
1044 |   top: "pool5"
1045 |   param {
1046 |     lr_mult: 0
1047 |     decay_mult: 0
1048 |   }
1049 |   fixed_param {
1050 |     fixed_method: OVER_FLOW
1051 |     bit_width: 8
1052 |   }
1053 | }
1054 | layer {
1055 |   name: "fc6_1"
1056 |   type: "ConvolutionBNFixed"
1057 |   bottom: "pool5"
1058 |   top: "fc6"
1059 |   param {
1060 |     lr_mult: 1
1061 |     decay_mult: 1
1062 |   }
1063 |   param {
1064 |     lr_mult: 1
1065 |     decay_mult: 0
1066 |   }
1067 |   param {
1068 |     lr_mult: 1
1069 |     decay_mult: 0
1070 |   }
1071 |   param {
1072 |     lr_mult: 0
1073 |     decay_mult: 0
1074 |   }
1075 |   param {
1076 |     lr_mult: 0
1077 |     decay_mult: 0
1078 |   }
1079 |   convolution_param {
1080 |     num_output: 1024
1081 |     bias_term: false
1082 |     pad: 1
1083 |     kernel_size: 3
1084 |     weight_filler {
1085 |       type: "msra"
1086 |     }
1087 |     bias_filler {
1088 |       type: "constant"
1089 |       value: 0
1090 |     }
1091 |   }
1092 |   batch_norm_param {
1093 |     scale_filler {
1094 |       type: "constant"
1095 |       value: 1
1096 |     }
1097 |     bias_filler {
1098 |       type: "constant"
1099 |       value: 0
1100 |     }
1101 |   }
1102 |   fixed_param {
1103 |     fixed_method: OVER_FLOW
1104 |     bit_width: 8
1105 |   }
1106 | }
1107 | layer {
1108 |   name: "relu6"
1109 |   type: "ReLU"
1110 |   bottom: "fc6"
1111 |   top: "fc6"
1112 | }
1113 | layer {
1114 |   name: "fc6_fixed"
1115 |   type: "FixedNeuron"
1116 |   bottom: "fc6"
1117 |   top: "fc6"
1118 |   param {
1119 |     lr_mult: 0
1120 |     decay_mult: 0
1121 |   }
1122 |   fixed_param {
1123 |     fixed_method: OVER_FLOW
1124 |     bit_width: 8
1125 |   }
1126 | }
1127 | layer {
1128 |   name: "fc7_2"
1129 |   type: "ConvolutionBNFixed"
1130 |   bottom: "fc6"
1131 |   top: "fc7"
1132 |   param {
1133 |     lr_mult: 1
1134 |     decay_mult: 1
1135 |   }
1136 |   param {
1137 |     lr_mult: 1
1138 |     decay_mult: 0
1139 |   }
1140 |   param {
1141 |     lr_mult: 1
1142 |     decay_mult: 0
1143 |   }
1144 |   param {
1145 |     lr_mult: 0
1146 |     decay_mult: 0
1147 |   }
1148 |   param {
1149 |     lr_mult: 0
1150 |     decay_mult: 0
1151 |   }
1152 |   convolution_param {
1153 |     num_output: 1024
1154 |     bias_term: false
1155 |     kernel_size: 1
1156 |     weight_filler {
1157 |       type: "msra"
1158 |     }
1159 |     bias_filler {
1160 |       type: "constant"
1161 |       value: 0
1162 |     }
1163 |   }
1164 |   batch_norm_param {
1165 |     scale_filler {
1166 |       type: "constant"
1167 |       value: 1
1168 |     }
1169 |     bias_filler {
1170 |       type: "constant"
1171 |       value: 0
1172 |     }
1173 |   }
1174 |   fixed_param {
1175 |     fixed_method: OVER_FLOW
1176 |     bit_width: 8
1177 |   }
1178 | }
1179 | layer {
1180 |   name: "relu7"
1181 |   type: "ReLU"
1182 |   bottom: "fc7"
1183 |   top: "fc7"
1184 | }
1185 | layer {
1186 |   name: "fc7_fixed"
1187 |   type: "FixedNeuron"
1188 |   bottom: "fc7"
1189 |   top: "fc7"
1190 |   param {
1191 |     lr_mult: 0
1192 |     decay_mult: 0
1193 |   }
1194 |   fixed_param {
1195 |     fixed_method: OVER_FLOW
1196 |     bit_width: 8
1197 |   }
1198 | }
1199 | layer {
1200 |   name: "conv6_1"
1201 |   type: "ConvolutionBNFixed"
1202 |   bottom: "fc7"
1203 |   top: "conv6_1"
1204 |   param {
1205 |     lr_mult: 1
1206 |     decay_mult: 1
1207 |   }
1208 |   param {
1209 |     lr_mult: 1
1210 |     decay_mult: 0
1211 |   }
1212 |   param {
1213 |     lr_mult: 1
1214 |     decay_mult: 0
1215 |   }
1216 |   param {
1217 |     lr_mult: 0
1218 |     decay_mult: 0
1219 |   }
1220 |   param {
1221 |     lr_mult: 0
1222 |     decay_mult: 0
1223 |   }
1224 |   convolution_param {
1225 |     num_output: 256
1226 |     bias_term: false
1227 |     pad: 0
1228 |     kernel_size: 1
1229 |     stride: 1
1230 |     weight_filler {
1231 |       type: "msra"
1232 |     }
1233 |     bias_filler {
1234 |       type: "constant"
1235 |       value: 0
1236 |     }
1237 |   }
1238 |   batch_norm_param {
1239 |     scale_filler {
1240 |       type: "constant"
1241 |       value: 1
1242 |     }
1243 |     bias_filler {
1244 |       type: "constant"
1245 |       value: 0
1246 |     }
1247 |   }
1248 |   fixed_param {
1249 |     fixed_method: OVER_FLOW
1250 |     bit_width: 8
1251 |   }
1252 | }
1253 | layer {
1254 |   name: "conv6_1_relu"
1255 |   type: "ReLU"
1256 |   bottom: "conv6_1"
1257 |   top: "conv6_1"
1258 | }
1259 | layer {
1260 |   name: "conv6_1_fixed"
1261 |   type: "FixedNeuron"
1262 |   bottom: "conv6_1"
1263 |   top: "conv6_1"
1264 |   param {
1265 |     lr_mult: 0
1266 |     decay_mult: 0
1267 |   }
1268 |   fixed_param {
1269 |     fixed_method: OVER_FLOW
1270 |     bit_width: 8
1271 |   }
1272 | }
1273 | layer {
1274 |   name: "conv6_2"
1275 |   type: "ConvolutionBNFixed"
1276 |   bottom: "conv6_1"
1277 |   top: "conv6_2"
1278 |   param {
1279 |     lr_mult: 1
1280 |     decay_mult: 1
1281 |   }
1282 |   param {
1283 |     lr_mult: 1
1284 |     decay_mult: 0
1285 |   }
1286 |   param {
1287 |     lr_mult: 1
1288 |     decay_mult: 0
1289 |   }
1290 |   param {
1291 |     lr_mult: 0
1292 |     decay_mult: 0
1293 |   }
1294 |   param {
1295 |     lr_mult: 0
1296 |     decay_mult: 0
1297 |   }
1298 |   convolution_param {
1299 |     num_output: 512
1300 |     bias_term: false
1301 |     pad: 1
1302 |     kernel_size: 3
1303 |     stride: 2
1304 |     weight_filler {
1305 |       type: "msra"
1306 |     }
1307 |     bias_filler {
1308 |       type: "constant"
1309 |       value: 0
1310 |     }
1311 |   }
1312 |   batch_norm_param {
1313 |     scale_filler {
1314 |       type: "constant"
1315 |       value: 1
1316 |     }
1317 |     bias_filler {
1318 |       type: "constant"
1319 |       value: 0
1320 |     }
1321 |   }
1322 |   fixed_param {
1323 |     fixed_method: OVER_FLOW
1324 |     bit_width: 8
1325 |   }
1326 | }
1327 | layer {
1328 |   name: "conv6_2_relu"
1329 |   type: "ReLU"
1330 |   bottom: "conv6_2"
1331 |   top: "conv6_2"
1332 | }
1333 | layer {
1334 |   name: "conv6_2_fixed"
1335 |   type: "FixedNeuron"
1336 |   bottom: "conv6_2"
1337 |   top: "conv6_2"
1338 |   param {
1339 |     lr_mult: 0
1340 |     decay_mult: 0
1341 |   }
1342 |   fixed_param {
1343 |     fixed_method: OVER_FLOW
1344 |     bit_width: 8
1345 |   }
1346 | }
1347 | layer {
1348 |   name: "conv7_1"
1349 |   type: "ConvolutionBNFixed"
1350 |   bottom: "conv6_2"
1351 |   top: "conv7_1"
1352 |   param {
1353 |     lr_mult: 1
1354 |     decay_mult: 1
1355 |   }
1356 |   param {
1357 |     lr_mult: 1
1358 |     decay_mult: 0
1359 |   }
1360 |   param {
1361 |     lr_mult: 1
1362 |     decay_mult: 0
1363 |   }
1364 |   param {
1365 |     lr_mult: 0
1366 |     decay_mult: 0
1367 |   }
1368 |   param {
1369 |     lr_mult: 0
1370 |     decay_mult: 0
1371 |   }
1372 |   convolution_param {
1373 |     num_output: 128
1374 |     bias_term: false
1375 |     pad: 0
1376 |     kernel_size: 1
1377 |     stride: 1
1378 |     weight_filler {
1379 |       type: "msra"
1380 |     }
1381 |     bias_filler {
1382 |       type: "constant"
1383 |       value: 0
1384 |     }
1385 |   }
1386 |   batch_norm_param {
1387 |     scale_filler {
1388 |       type: "constant"
1389 |       value: 1
1390 |     }
1391 |     bias_filler {
1392 |       type: "constant"
1393 |       value: 0
1394 |     }
1395 |   }
1396 |   fixed_param {
1397 |     fixed_method: OVER_FLOW
1398 |     bit_width: 8
1399 |   }
1400 | }
1401 | layer {
1402 |   name: "conv7_1_relu"
1403 |   type: "ReLU"
1404 |   bottom: "conv7_1"
1405 |   top: "conv7_1"
1406 | }
1407 | layer {
1408 |   name: "conv7_1_fixed"
1409 |   type: "FixedNeuron"
1410 |   bottom: "conv7_1"
1411 |   top: "conv7_1"
1412 |   param {
1413 |     lr_mult: 0
1414 |     decay_mult: 0
1415 |   }
1416 |   fixed_param {
1417 |     fixed_method: OVER_FLOW
1418 |     bit_width: 8
1419 |   }
1420 | }
1421 | layer {
1422 |   name: "conv7_2"
1423 |   type: "ConvolutionBNFixed"
1424 |   bottom: "conv7_1"
1425 |   top: "conv7_2"
1426 |   param {
1427 |     lr_mult: 1
1428 |     decay_mult: 1
1429 |   }
1430 |   param {
1431 |     lr_mult: 1
1432 |     decay_mult: 0
1433 |   }
1434 |   param {
1435 |     lr_mult: 1
1436 |     decay_mult: 0
1437 |   }
1438 |   param {
1439 |     lr_mult: 0
1440 |     decay_mult: 0
1441 |   }
1442 |   param {
1443 |     lr_mult: 0
1444 |     decay_mult: 0
1445 |   }
1446 |   convolution_param {
1447 |     num_output: 256
1448 |     bias_term: false
1449 |     pad: 1
1450 |     kernel_size: 3
1451 |     stride: 2
1452 |     weight_filler {
1453 |       type: "msra"
1454 |     }
1455 |     bias_filler {
1456 |       type: "constant"
1457 |       value: 0
1458 |     }
1459 |   }
1460 |   batch_norm_param {
1461 |     scale_filler {
1462 |       type: "constant"
1463 |       value: 1
1464 |     }
1465 |     bias_filler {
1466 |       type: "constant"
1467 |       value: 0
1468 |     }
1469 |   }
1470 |   fixed_param {
1471 |     fixed_method: OVER_FLOW
1472 |     bit_width: 8
1473 |   }
1474 | }
1475 | layer {
1476 |   name: "conv7_2_relu"
1477 |   type: "ReLU"
1478 |   bottom: "conv7_2"
1479 |   top: "conv7_2"
1480 | }
1481 | layer {
1482 |   name: "conv7_2_fixed"
1483 |   type: "FixedNeuron"
1484 |   bottom: "conv7_2"
1485 |   top: "conv7_2"
1486 |   param {
1487 |     lr_mult: 0
1488 |     decay_mult: 0
1489 |   }
1490 |   fixed_param {
1491 |     fixed_method: OVER_FLOW
1492 |     bit_width: 8
1493 |   }
1494 | }
1495 | 
1496 | layer {
1497 |   name: "conv4_3_norm_mbox_loc"
1498 |   type: "ConvolutionBNFixed"
1499 |   bottom: "conv4_3"
1500 |   top: "conv4_3_norm_mbox_loc"
1501 |   param {
1502 |     lr_mult: 1
1503 |     decay_mult: 1
1504 |   }
1505 |   param {
1506 |     lr_mult: 1
1507 |     decay_mult: 0
1508 |   }
1509 |   param {
1510 |     lr_mult: 1
1511 |     decay_mult: 0
1512 |   }
1513 |   param {
1514 |     lr_mult: 0
1515 |     decay_mult: 0
1516 |   }
1517 |   param {
1518 |     lr_mult: 0
1519 |     decay_mult: 0
1520 |   }
1521 |   convolution_param {
1522 |     num_output: 16 # 32
1523 |     bias_term: false
1524 |     pad: 1
1525 |     kernel_size: 3
1526 |     stride: 1
1527 |     weight_filler {
1528 |       type: "msra"
1529 |     }
1530 |     bias_filler {
1531 |       type: "constant"
1532 |       value: 0
1533 |     }
1534 |   }
1535 |   batch_norm_param {
1536 |     scale_filler {
1537 |       type: "constant"
1538 |       value: 1
1539 |     }
1540 |     bias_filler {
1541 |       type: "constant"
1542 |       value: 0
1543 |     }
1544 |   }
1545 |   fixed_param {
1546 |     fixed_method: OVER_FLOW
1547 |     bit_width: 8
1548 |   }
1549 | }
1550 | layer {
1551 |   name: "conv4_3_norm_mbox_loc_perm"
1552 |   type: "Permute"
1553 |   bottom: "conv4_3_norm_mbox_loc"
1554 |   top: "conv4_3_norm_mbox_loc_perm"
1555 |   permute_param {
1556 |     order: 0
1557 |     order: 2
1558 |     order: 3
1559 |     order: 1
1560 |   }
1561 | }
1562 | layer {
1563 |   name: "conv4_3_norm_mbox_loc_flat"
1564 |   type: "Flatten"
1565 |   bottom: "conv4_3_norm_mbox_loc_perm"
1566 |   top: "conv4_3_norm_mbox_loc_flat"
1567 |   flatten_param {
1568 |     axis: 1
1569 |   }
1570 | }
1571 | layer {
1572 |   name: "conv4_3_norm_mbox_conf"
1573 |   type: "ConvolutionBNFixed"
1574 |   bottom: "conv4_3"
1575 |   top: "conv4_3_norm_mbox_conf"
1576 |   param {
1577 |     lr_mult: 1
1578 |     decay_mult: 1
1579 |   }
1580 |   param {
1581 |     lr_mult: 1
1582 |     decay_mult: 0
1583 |   }
1584 |   param {
1585 |     lr_mult: 1
1586 |     decay_mult: 0
1587 |   }
1588 |   param {
1589 |     lr_mult: 0
1590 |     decay_mult: 0
1591 |   }
1592 |   param {
1593 |     lr_mult: 0
1594 |     decay_mult: 0
1595 |   }
1596 |   convolution_param {
1597 |     num_output: 396 # 792 #32
1598 |     bias_term: false
1599 |     pad: 1
1600 |     kernel_size: 3
1601 |     stride: 1
1602 |     weight_filler {
1603 |       type: "msra"
1604 |     }
1605 |     bias_filler {
1606 |       type: "constant"
1607 |       value: 0
1608 |     }
1609 |   }
1610 |   batch_norm_param {
1611 |     scale_filler {
1612 |       type: "constant"
1613 |       value: 1
1614 |     }
1615 |     bias_filler {
1616 |       type: "constant"
1617 |       value: 0
1618 |     }
1619 |   }
1620 |   fixed_param {
1621 |     fixed_method: OVER_FLOW
1622 |     bit_width: 8
1623 |   }
1624 | }
1625 | layer {
1626 |   name: "conv4_3_norm_mbox_conf_perm"
1627 |   type: "Permute"
1628 |   bottom: "conv4_3_norm_mbox_conf"
1629 |   top: "conv4_3_norm_mbox_conf_perm"
1630 |   permute_param {
1631 |     order: 0
1632 |     order: 2
1633 |     order: 3
1634 |     order: 1
1635 |   }
1636 | }
1637 | layer {
1638 |   name: "conv4_3_norm_mbox_conf_flat"
1639 |   type: "Flatten"
1640 |   bottom: "conv4_3_norm_mbox_conf_perm"
1641 |   top: "conv4_3_norm_mbox_conf_flat"
1642 |   flatten_param {
1643 |     axis: 1
1644 |   }
1645 | }
1646 | layer {
1647 |   name: "conv4_3_norm_mbox_priorbox"
1648 |   type: "PriorBox"
1649 |   bottom: "conv4_3"
1650 |   bottom: "data"
1651 |   top: "conv4_3_norm_mbox_priorbox"
1652 |   prior_box_param {
1653 |     # min_size: 15
1654 |     min_size: 30
1655 |     # max_size: 33
1656 |     max_size: 66
1657 |     aspect_ratio: 2
1658 |     flip: true
1659 |     clip: false
1660 |     variance: 0.1
1661 |     variance: 0.1
1662 |     variance: 0.2
1663 |     variance: 0.2
1664 |     step: 8
1665 |     offset: 0.5
1666 |   }
1667 | }
1668 | layer {
1669 |   name: "fc7_mbox_loc"
1670 |   type: "ConvolutionBNFixed"
1671 |   bottom: "fc7"
1672 |   top: "fc7_mbox_loc"
1673 |   param {
1674 |     lr_mult: 1
1675 |     decay_mult: 1
1676 |   }
1677 |   param {
1678 |     lr_mult: 1
1679 |     decay_mult: 0
1680 |   }
1681 |   param {
1682 |     lr_mult: 1
1683 |     decay_mult: 0
1684 |   }
1685 |   param {
1686 |     lr_mult: 0
1687 |     decay_mult: 0
1688 |   }
1689 |   param {
1690 |     lr_mult: 0
1691 |     decay_mult: 0
1692 |   }
1693 |   convolution_param {
1694 |     num_output: 24
1695 |     bias_term: false
1696 |     pad: 1
1697 |     kernel_size: 3
1698 |     stride: 1
1699 |     weight_filler {
1700 |       type: "msra"
1701 |     }
1702 |     bias_filler {
1703 |       type: "constant"
1704 |       value: 0
1705 |     }
1706 |   }
1707 |   batch_norm_param {
1708 |     scale_filler {
1709 |       type: "constant"
1710 |       value: 1
1711 |     }
1712 |     bias_filler {
1713 |       type: "constant"
1714 |       value: 0
1715 |     }
1716 |   }
1717 |   fixed_param {
1718 |     fixed_method: OVER_FLOW
1719 |     bit_width: 8
1720 |   }
1721 | }
1722 | layer {
1723 |   name: "fc7_mbox_loc_perm"
1724 |   type: "Permute"
1725 |   bottom: "fc7_mbox_loc"
1726 |   top: "fc7_mbox_loc_perm"
1727 |   permute_param {
1728 |     order: 0
1729 |     order: 2
1730 |     order: 3
1731 |     order: 1
1732 |   }
1733 | }
1734 | layer {
1735 |   name: "fc7_mbox_loc_flat"
1736 |   type: "Flatten"
1737 |   bottom: "fc7_mbox_loc_perm"
1738 |   top: "fc7_mbox_loc_flat"
1739 |   flatten_param {
1740 |     axis: 1
1741 |   }
1742 | }
1743 | layer {
1744 |   name: "fc7_mbox_conf"
1745 |   type: "ConvolutionBNFixed"
1746 |   bottom: "fc7"
1747 |   top: "fc7_mbox_conf"
1748 |   param {
1749 |     lr_mult: 1
1750 |     decay_mult: 1
1751 |   }
1752 |   param {
1753 |     lr_mult: 1
1754 |     decay_mult: 0
1755 |   }
1756 |   param {
1757 |     lr_mult: 1
1758 |     decay_mult: 0
1759 |   }
1760 |   param {
1761 |     lr_mult: 0
1762 |     decay_mult: 0
1763 |   }
1764 |   param {
1765 |     lr_mult: 0
1766 |     decay_mult: 0
1767 |   }
1768 |   convolution_param {
1769 |     num_output: 594 # 24
1770 |     bias_term: false
1771 |     pad: 1
1772 |     kernel_size: 3
1773 |     stride: 1
1774 |     weight_filler {
1775 |       type: "msra"
1776 |     }
1777 |     bias_filler {
1778 |       type: "constant"
1779 |       value: 0
1780 |     }
1781 |   }
1782 |   batch_norm_param {
1783 |     scale_filler {
1784 |       type: "constant"
1785 |       value: 1
1786 |     }
1787 |     bias_filler {
1788 |       type: "constant"
1789 |       value: 0
1790 |     }
1791 |   }
1792 |   fixed_param {
1793 |     fixed_method: OVER_FLOW
1794 |     bit_width: 8
1795 |   }
1796 | }
1797 | layer {
1798 |   name: "fc7_mbox_conf_perm"
1799 |   type: "Permute"
1800 |   bottom: "fc7_mbox_conf"
1801 |   top: "fc7_mbox_conf_perm"
1802 |   permute_param {
1803 |     order: 0
1804 |     order: 2
1805 |     order: 3
1806 |     order: 1
1807 |   }
1808 | }
1809 | layer {
1810 |   name: "fc7_mbox_conf_flat"
1811 |   type: "Flatten"
1812 |   bottom: "fc7_mbox_conf_perm"
1813 |   top: "fc7_mbox_conf_flat"
1814 |   flatten_param {
1815 |     axis: 1
1816 |   }
1817 | }
1818 | layer {
1819 |   name: "fc7_mbox_priorbox"
1820 |   type: "PriorBox"
1821 |   bottom: "fc7"
1822 |   bottom: "data"
1823 |   top: "fc7_mbox_priorbox"
1824 |   prior_box_param {
1825 |     min_size: 66
1826 |     max_size: 127
1827 |     aspect_ratio: 2
1828 |     aspect_ratio: 3
1829 |     flip: true
1830 |     clip: false
1831 |     variance: 0.1
1832 |     variance: 0.1
1833 |     variance: 0.2
1834 |     variance: 0.2
1835 |     step: 16
1836 |     offset: 0.5
1837 |   }
1838 | }
1839 | layer {
1840 |   name: "conv6_2_mbox_loc"
1841 |   type: "ConvolutionBNFixed"
1842 |   bottom: "conv6_2"
1843 |   top: "conv6_2_mbox_loc"
1844 |   param {
1845 |     lr_mult: 1
1846 |     decay_mult: 1
1847 |   }
1848 |   param {
1849 |     lr_mult: 1
1850 |     decay_mult: 0
1851 |   }
1852 |   param {
1853 |     lr_mult: 1
1854 |     decay_mult: 0
1855 |   }
1856 |   param {
1857 |     lr_mult: 0
1858 |     decay_mult: 0
1859 |   }
1860 |   param {
1861 |     lr_mult: 0
1862 |     decay_mult: 0
1863 |   }
1864 |   convolution_param {
1865 |     num_output: 24
1866 |     bias_term: false
1867 |     pad: 1
1868 |     kernel_size: 3
1869 |     stride: 1
1870 |     weight_filler {
1871 |       type: "msra"
1872 |     }
1873 |     bias_filler {
1874 |       type: "constant"
1875 |       value: 0
1876 |     }
1877 |   }
1878 |   batch_norm_param {
1879 |     scale_filler {
1880 |       type: "constant"
1881 |       value: 1
1882 |     }
1883 |     bias_filler {
1884 |       type: "constant"
1885 |       value: 0
1886 |     }
1887 |   }
1888 |   fixed_param {
1889 |     fixed_method: OVER_FLOW
1890 |     bit_width: 8
1891 |   }
1892 | }
1893 | layer {
1894 |   name: "conv6_2_mbox_loc_perm"
1895 |   type: "Permute"
1896 |   bottom: "conv6_2_mbox_loc"
1897 |   top: "conv6_2_mbox_loc_perm"
1898 |   permute_param {
1899 |     order: 0
1900 |     order: 2
1901 |     order: 3
1902 |     order: 1
1903 |   }
1904 | }
1905 | layer {
1906 |   name: "conv6_2_mbox_loc_flat"
1907 |   type: "Flatten"
1908 |   bottom: "conv6_2_mbox_loc_perm"
1909 |   top: "conv6_2_mbox_loc_flat"
1910 |   flatten_param {
1911 |     axis: 1
1912 |   }
1913 | }
1914 | layer {
1915 |   name: "conv6_2_mbox_conf"
1916 |   type: "ConvolutionBNFixed"
1917 |   bottom: "conv6_2"
1918 |   top: "conv6_2_mbox_conf"
1919 |   param {
1920 |     lr_mult: 1
1921 |     decay_mult: 1
1922 |   }
1923 |   param {
1924 |     lr_mult: 1
1925 |     decay_mult: 0
1926 |   }
1927 |   param {
1928 |     lr_mult: 1
1929 |     decay_mult: 0
1930 |   }
1931 |   param {
1932 |     lr_mult: 0
1933 |     decay_mult: 0
1934 |   }
1935 |   param {
1936 |     lr_mult: 0
1937 |     decay_mult: 0
1938 |   }
1939 |   convolution_param {
1940 |     num_output: 594 # 24
1941 |     bias_term: false
1942 |     pad: 1
1943 |     kernel_size: 3
1944 |     stride: 1
1945 |     weight_filler {
1946 |       type: "msra"
1947 |     }
1948 |     bias_filler {
1949 |       type: "constant"
1950 |       value: 0
1951 |     }
1952 |   }
1953 |   batch_norm_param {
1954 |     scale_filler {
1955 |       type: "constant"
1956 |       value: 1
1957 |     }
1958 |     bias_filler {
1959 |       type: "constant"
1960 |       value: 0
1961 |     }
1962 |   }
1963 |   fixed_param {
1964 |     fixed_method: OVER_FLOW
1965 |     bit_width: 8
1966 |   }
1967 | }
1968 | layer {
1969 |   name: "conv6_2_mbox_conf_perm"
1970 |   type: "Permute"
1971 |   bottom: "conv6_2_mbox_conf"
1972 |   top: "conv6_2_mbox_conf_perm"
1973 |   permute_param {
1974 |     order: 0
1975 |     order: 2
1976 |     order: 3
1977 |     order: 1
1978 |   }
1979 | }
1980 | layer {
1981 |   name: "conv6_2_mbox_conf_flat"
1982 |   type: "Flatten"
1983 |   bottom: "conv6_2_mbox_conf_perm"
1984 |   top: "conv6_2_mbox_conf_flat"
1985 |   flatten_param {
1986 |     axis: 1
1987 |   }
1988 | }
1989 | layer {
1990 |   name: "conv6_2_mbox_priorbox"
1991 |   type: "PriorBox"
1992 |   bottom: "conv6_2"
1993 |   bottom: "data"
1994 |   top: "conv6_2_mbox_priorbox"
1995 |   prior_box_param {
1996 |     min_size: 127
1997 |     max_size: 188
1998 |     aspect_ratio: 2
1999 |     aspect_ratio: 3
2000 |     flip: true
2001 |     clip: false
2002 |     variance: 0.1
2003 |     variance: 0.1
2004 |     variance: 0.2
2005 |     variance: 0.2
2006 |     step: 32
2007 |     offset: 0.5
2008 |   }
2009 | }
2010 | layer {
2011 |   name: "conv7_2_mbox_loc"
2012 |   type: "ConvolutionBNFixed"
2013 |   bottom: "conv7_2"
2014 |   top: "conv7_2_mbox_loc"
2015 |   param {
2016 |     lr_mult: 1
2017 |     decay_mult: 1
2018 |   }
2019 |   param {
2020 |     lr_mult: 1
2021 |     decay_mult: 0
2022 |   }
2023 |   param {
2024 |     lr_mult: 1
2025 |     decay_mult: 0
2026 |   }
2027 |   param {
2028 |     lr_mult: 0
2029 |     decay_mult: 0
2030 |   }
2031 |   param {
2032 |     lr_mult: 0
2033 |     decay_mult: 0
2034 |   }
2035 |   convolution_param {
2036 |     num_output: 24
2037 |     bias_term: false
2038 |     pad: 1
2039 |     kernel_size: 3
2040 |     stride: 1
2041 |     weight_filler {
2042 |       type: "msra"
2043 |     }
2044 |     bias_filler {
2045 |       type: "constant"
2046 |       value: 0
2047 |     }
2048 |   }
2049 |   batch_norm_param {
2050 |     scale_filler {
2051 |       type: "constant"
2052 |       value: 1
2053 |     }
2054 |     bias_filler {
2055 |       type: "constant"
2056 |       value: 0
2057 |     }
2058 |   }
2059 |   fixed_param {
2060 |     fixed_method: OVER_FLOW
2061 |     bit_width: 8
2062 |   }
2063 | }
2064 | layer {
2065 |   name: "conv7_2_mbox_loc_perm"
2066 |   type: "Permute"
2067 |   bottom: "conv7_2_mbox_loc"
2068 |   top: "conv7_2_mbox_loc_perm"
2069 |   permute_param {
2070 |     order: 0
2071 |     order: 2
2072 |     order: 3
2073 |     order: 1
2074 |   }
2075 | }
2076 | layer {
2077 |   name: "conv7_2_mbox_loc_flat"
2078 |   type: "Flatten"
2079 |   bottom: "conv7_2_mbox_loc_perm"
2080 |   top: "conv7_2_mbox_loc_flat"
2081 |   flatten_param {
2082 |     axis: 1
2083 |   }
2084 | }
2085 | layer {
2086 |   name: "conv7_2_mbox_conf"
2087 |   type: "ConvolutionBNFixed"
2088 |   bottom: "conv7_2"
2089 |   top: "conv7_2_mbox_conf"
2090 |   param {
2091 |     lr_mult: 1
2092 |     decay_mult: 1
2093 |   }
2094 |   param {
2095 |     lr_mult: 1
2096 |     decay_mult: 0
2097 |   }
2098 |   param {
2099 |     lr_mult: 1
2100 |     decay_mult: 0
2101 |   }
2102 |   param {
2103 |     lr_mult: 0
2104 |     decay_mult: 0
2105 |   }
2106 |   param {
2107 |     lr_mult: 0
2108 |     decay_mult: 0
2109 |   }
2110 |   convolution_param {
2111 |     num_output: 594 # 24
2112 |     bias_term: false
2113 |     pad: 1
2114 |     kernel_size: 3
2115 |     stride: 1
2116 |     weight_filler {
2117 |       type: "msra"
2118 |     }
2119 |     bias_filler {
2120 |       type: "constant"
2121 |       value: 0
2122 |     }
2123 |   }
2124 |   batch_norm_param {
2125 |     scale_filler {
2126 |       type: "constant"
2127 |       value: 1
2128 |     }
2129 |     bias_filler {
2130 |       type: "constant"
2131 |       value: 0
2132 |     }
2133 |   }
2134 |   fixed_param {
2135 |     fixed_method: OVER_FLOW
2136 |     bit_width: 8
2137 |   }
2138 | }
2139 | layer {
2140 |   name: "conv7_2_mbox_conf_perm"
2141 |   type: "Permute"
2142 |   bottom: "conv7_2_mbox_conf"
2143 |   top: "conv7_2_mbox_conf_perm"
2144 |   permute_param {
2145 |     order: 0
2146 |     order: 2
2147 |     order: 3
2148 |     order: 1
2149 |   }
2150 | }
2151 | layer {
2152 |   name: "conv7_2_mbox_conf_flat"
2153 |   type: "Flatten"
2154 |   bottom: "conv7_2_mbox_conf_perm"
2155 |   top: "conv7_2_mbox_conf_flat"
2156 |   flatten_param {
2157 |     axis: 1
2158 |   }
2159 | }
2160 | layer {
2161 |   name: "conv7_2_mbox_priorbox"
2162 |   type: "PriorBox"
2163 |   bottom: "conv7_2"
2164 |   bottom: "data"
2165 |   top: "conv7_2_mbox_priorbox"
2166 |   prior_box_param {
2167 |     min_size: 188
2168 |     max_size: 249
2169 |     aspect_ratio: 2
2170 |     aspect_ratio: 3
2171 |     flip: true
2172 |     clip: false
2173 |     variance: 0.1
2174 |     variance: 0.1
2175 |     variance: 0.2
2176 |     variance: 0.2
2177 |     step: 64
2178 |     offset: 0.5
2179 |   }
2180 | }
2181 | 
2182 | layer {
2183 |   name: "mbox_loc"
2184 |   type: "Concat"
2185 |   bottom: "conv4_3_norm_mbox_loc_flat"
2186 |   bottom: "fc7_mbox_loc_flat"
2187 |   bottom: "conv6_2_mbox_loc_flat"
2188 |   bottom: "conv7_2_mbox_loc_flat"
2189 |   top: "mbox_loc"
2190 |   concat_param {
2191 |     axis: 1
2192 |   }
2193 | }
2194 | layer {
2195 |   name: "mbox_loc_fixed"
2196 |   type: "FixedNeuron"
2197 |   bottom: "mbox_loc"
2198 |   top: "mbox_loc"
2199 |   param {
2200 |     lr_mult: 0
2201 |     decay_mult: 0
2202 |   }
2203 |   fixed_param {
2204 |     fixed_method: OVER_FLOW
2205 |     bit_width: 8
2206 |   }
2207 | }
2208 | layer {
2209 |   name: "mbox_conf"
2210 |   type: "Concat"
2211 |   bottom: "conv4_3_norm_mbox_conf_flat"
2212 |   bottom: "fc7_mbox_conf_flat"
2213 |   bottom: "conv6_2_mbox_conf_flat"
2214 |   bottom: "conv7_2_mbox_conf_flat"
2215 |   top: "mbox_conf"
2216 |   concat_param {
2217 |     axis: 1
2218 |   }
2219 | }
2220 | layer {
2221 |   name: "mbox_conf_fixed"
2222 |   type: "FixedNeuron"
2223 |   bottom: "mbox_conf"
2224 |   top: "mbox_conf"
2225 |   param {
2226 |     lr_mult: 0
2227 |     decay_mult: 0
2228 |   }
2229 |   fixed_param {
2230 |     fixed_method: OVER_FLOW
2231 |     bit_width: 8
2232 |   }
2233 | }
2234 | layer {
2235 |   name: "mbox_priorbox"
2236 |   type: "Concat"
2237 |   bottom: "conv4_3_norm_mbox_priorbox"
2238 |   bottom: "fc7_mbox_priorbox"
2239 |   bottom: "conv6_2_mbox_priorbox"
2240 |   bottom: "conv7_2_mbox_priorbox"
2241 |   top: "mbox_priorbox"
2242 |   concat_param {
2243 |     axis: 2
2244 |   }
2245 | }
2246 | 
2247 | 
2248 | 
2249 | 
2250 | layer {
2251 |   name: "mbox_conf_reshape"
2252 |   type: "Reshape"
2253 |   bottom: "mbox_conf"
2254 |   top: "mbox_conf_reshape"
2255 |   reshape_param {
2256 |     shape {
2257 |       dim: 0
2258 |       dim: -1
2259 |       dim: 99
2260 |     }
2261 |   }
2262 | }
2263 | layer {
2264 |   name: "mbox_conf_softmax"
2265 |   type: "Softmax"
2266 |   bottom: "mbox_conf_reshape"
2267 |   top: "mbox_conf_softmax"
2268 |   softmax_param {
2269 |     axis: 2
2270 |   }
2271 | }
2272 | layer {
2273 |   name: "mbox_conf_flatten"
2274 |   type: "Flatten"
2275 |   bottom: "mbox_conf_softmax"
2276 |   top: "mbox_conf_flatten"
2277 |   flatten_param {
2278 |     axis: 1
2279 |   }
2280 | }
2281 | layer {
2282 |   name: "detection_out"
2283 |   type: "DetectionOutput"
2284 |   bottom: "mbox_loc"
2285 |   bottom: "mbox_conf_flatten"
2286 |   bottom: "mbox_priorbox"
2287 |   top: "detection_out"
2288 |   include {
2289 |     phase: TEST
2290 |   }
2291 |   detection_output_param {
2292 |     num_classes: 99
2293 |     share_location: true
2294 |     background_label_id: 0
2295 |     nms_param {
2296 |       nms_threshold: 0.449999988079
2297 |       top_k: 400
2298 |     }
2299 |     save_output_param {
2300 |       output_directory: "/home/chenweicong/code/tgiif/results/SSD_tgiif"
2301 |       output_name_prefix: "comp4_det_test_"
2302 |       output_format: "VOC"
2303 |       label_map_file: "lists/labelmap.prototxt"
2304 |       name_size_file: "lists/tgiif_name_size_0.7.txt"
2305 |       num_test_image: 8903
2306 |     }
2307 |     code_type: CENTER_SIZE
2308 |     keep_top_k: 200
2309 |     confidence_threshold: 0.00999999977648
2310 |   }
2311 | }
2312 | layer {
2313 |   name: "detection_eval"
2314 |   type: "DetectionEvaluate"
2315 |   bottom: "detection_out"
2316 |   bottom: "label"
2317 |   top: "detection_eval"
2318 |   top: "iou_eval"
2319 |   include {
2320 |     phase: TEST
2321 |   }
2322 |   detection_evaluate_param {
2323 |     num_classes: 99
2324 |     background_label_id: 0
2325 |     overlap_threshold: 0.5
2326 |     evaluate_difficult_gt: false
2327 |     name_size_file: "lists/tgiif_name_size_0.7.txt"
2328 |   }
2329 | }
2330 | 


--------------------------------------------------------------------------------
/train/solver.prototxt:
--------------------------------------------------------------------------------
 1 | train_net: "model_bnfixed_train.prototxt"
 2 | test_net: "model_bnfixed_test.prototxt"
 3 | test_iter: 2226
 4 | test_interval: 10000
 5 | base_lr: 0.01 
 6 | display: 40
 7 | max_iter: 80000
 8 | lr_policy: "multistep"
 9 | gamma: 0.10000000149
10 | momentum: 0.899999976158
11 | weight_decay: 0.000500000023749
12 | snapshot: 10000
13 | snapshot_prefix: "models/SSD_tgiif"
14 | solver_mode: GPU
15 | device_id: 0
16 | debug_info: false
17 | snapshot_after_train: true
18 | test_initialization: true
19 | average_loss: 10
20 | stepvalue: 20000
21 | stepvalue: 30000
22 | stepvalue: 40000
23 | stepvalue: 50000
24 | iter_size: 1
25 | type: "SGD"
26 | eval_type: "detection"
27 | ap_version: "11point"
28 | 
29 | 


--------------------------------------------------------------------------------