├── .gitignore
├── LICENSE
├── __init__.py
├── anypacking
│   ├── dsp_packing.py
│   └── quant_module.py
├── cifar
│   ├── export_hls.py
│   ├── hls
│   │   └── config_simd_pe.txt
│   ├── main_train.py
│   ├── models.py
│   ├── search_train.py
│   ├── simulate_hw.py
│   ├── test_acc.py
│   └── train_normal.py
├── dacsdc
│   ├── datasets.py
│   ├── export_hls.py
│   ├── export_hls_skynet.py
│   ├── hls
│   │   └── config_simd_pe.txt
│   ├── main_train.py
│   ├── mymodel.py
│   ├── pareto_train.py
│   ├── quant_dorefa.py
│   ├── search_train.py
│   ├── simulate_hw.py
│   ├── test.py
│   ├── train_old.py
│   └── yolo_utils.py
├── readme.md
└── utils
    ├── __init__.py
    ├── torch_utils.py
    └── view_pt.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.vscode
2 | __pycache__
3 | *.pt
4 | test_result
5 | results
6 | /train_log
7 | /weights/*
8 | */hls/*
9 | !*/hls/config_simd_pe.txt
10 | localconfig.py
11 | /*.txt
12 | _logs
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Haitong Huang, Erjing Luo, Cheng Liu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fffasttime/AnyPackingNet/1d740bf0071bec024a745adc3bcd31426b29f601/__init__.py
--------------------------------------------------------------------------------
/anypacking/dsp_packing.py:
--------------------------------------------------------------------------------
1 | factors_k11=[
2 | [12,8,8,6,6,4,4],
3 | [10,8,6,6,4,4,4],
4 | [8,6,6,4,4,4,3],
5 | [6,6,4,4,4,4,2],
6 | [6,4,4,4,2,2,2],
7 | [4,4,4,4,2,2,2],
8 | [4,4,3,2,2,2,2],
9 | ]
10 |
11 | factors_k33=[
12 | [18,15,12,7.5,7.5,6,6],
13 | [15,12,7.5,6,6,6,3],
14 | [12,7.5,6,6,6,6,3],
15 | [9,6,6,6,6,3,3],
16 | [7.5,6,6,4.5,3,3,3],
17 | [6,6,4.5,3,3,3,2.25],
18 | [6,3,3,3,3,3,2],
19 | ]
20 |
21 | factors_k55=[
22 | [20,15,10,7.5,7.5,5,5],
23 | [12.5,10,6.67,5,5,5,3.33],
24 | [10,7.5,5,5,5,5,3.33],
25 | [7.5,6.67,5,5,5,3.33,3.33],
26 | [6.67,5,5,5,3.33,2.5,2.5],
27 | [5,5,5,3.33,2.5,2.5,2.5],
28 | [5,3.33,3.33,3.33,2.5,2.5,2],
29 | ]
--------------------------------------------------------------------------------
/anypacking/quant_module.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.nn.parameter import Parameter
7 |
8 | # load dsp packing factors
9 | from .dsp_packing import *
10 |
11 | gaussian_steps = {1: 1.596, 2: 0.996, 3: 0.586, 4: 0.336, 5: 0.190, 6: 0.106, 7: 0.059, 8: 0.032}
12 | hwgq_steps = {1: 0.799, 2: 0.538, 3: 0.3217, 4: 0.185, 5: 0.104, 6: 0.058, 7: 0.033, 8: 0.019}
13 |
14 | class _gauss_quantize_sym(torch.autograd.Function):
15 |
16 | @staticmethod
17 | def forward(ctx, x, step, bit):
18 | lvls = 2 ** bit / 2
19 | alpha = x.std().item()
20 | step *= alpha
21 | y = (torch.round(x/step+0.5)-0.5) * step
22 | thr = (lvls-0.5)*step
23 | y = y.clamp(min=-thr, max=thr)
24 | return y
25 |
26 | @staticmethod
27 | def backward(ctx, grad_output):
28 | return grad_output, None, None
29 |
30 |
31 | class _gauss_quantize_resclaed_step_sym(torch.autograd.Function):
32 |
33 | @staticmethod
34 | def forward(ctx, x, step, bit):
35 | lvls = 2 ** bit / 2
36 | y = (torch.round(x/step+0.5)-0.5) * step
37 | thr = (lvls-0.5)*step
38 | y = y.clamp(min=-thr, max=thr)
39 | return y
40 |
41 | @staticmethod
42 | def backward(ctx, grad_output):
43 | return grad_output, None, None
44 |
45 |
46 | class _gauss_quantize(torch.autograd.Function):
47 |
48 | @staticmethod
49 | def forward(ctx, x, step, bit):
50 | lvls = 2 ** bit / 2
51 | alpha = x.std().item()
52 | step *= alpha
53 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1) * step
54 | return y
55 |
56 | @staticmethod
57 | def backward(ctx, grad_output):
58 | return grad_output, None, None
59 |
60 | def _gauss_quantize_export(x, step, bit):
61 | lvls = 2 ** bit / 2
62 | alpha = x.std().item()
63 | step *= alpha
64 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1)
65 | return y.cpu().detach().int().numpy(), step
66 |
67 | class _gauss_quantize_resclaed_step(torch.autograd.Function):
68 |
69 | @staticmethod
70 | def forward(ctx, x, step, bit):
71 | lvls = 2 ** bit / 2
72 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1) * step
73 | return y
74 |
75 | @staticmethod
76 | def backward(ctx, grad_output):
77 | return grad_output, None, None
78 |
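# --------------------------------------------------------------------------
# Added note (illustration, not part of the original file): the autograd
# Functions above are straight-through-estimator (STE) quantizers. forward()
# snaps a tensor onto a uniform grid whose step is the tabulated Gaussian
# step scaled by the tensor's std; backward() passes gradients through
# unchanged. A minimal sketch of what _gauss_quantize computes for bit=3:
#
#   import torch
#   w = torch.randn(64, 64)
#   bit = 3
#   step = gaussian_steps[bit] * w.std().item()   # rescaled per-tensor step
#   lvls = 2 ** bit / 2                           # 4 -> integer levels -4..3
#   q = torch.clamp(torch.round(w / step), -lvls, lvls - 1) * step
#
# so q equals _gauss_quantize.apply(w, gaussian_steps[bit], bit).
# --------------------------------------------------------------------------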
79 | class _hwgq(torch.autograd.Function):
80 |
81 | @staticmethod
82 | def forward(ctx, x, step):
83 | y = torch.round(x / step) * step
84 | return y
85 |
86 | @staticmethod
87 | def backward(ctx, grad_output):
88 | return grad_output, None
89 |
90 |
91 | class HWGQ(nn.Module):
92 | def __init__(self, bit=2):
93 | super(HWGQ, self).__init__()
94 | self.bit = bit
95 | if bit < 32:
96 | self.step = hwgq_steps[bit]
97 | else:
98 | self.step = None
99 |
100 | def forward(self, x):
101 | if self.bit >= 32:
102 | return x.clamp(min=0.0)
103 | lvls = float(2 ** self.bit - 1)
104 | clip_thr = self.step * lvls
105 | y = x.clamp(min=0.0, max=clip_thr)
106 | out = _hwgq.apply(y, self.step)
107 | return out
108 |
109 | class ImageInputQ(nn.Module):
110 | '''
111 | Assume image inputs are discrete values [0/256, 1/256, 2/256, ..., 255/256]
112 | '''
113 | def __init__(self, bit = 8):
114 | super(ImageInputQ, self).__init__()
115 | self.bit = bit
116 | self.step = 1/2**bit
117 |
118 | def forward(self, x):
119 | if self.bit >= 32: # full-precision input, bypass quantization
120 | return x
121 | out = torch.floor(x/self.step) * self.step # [!] There will be no gradient on x
122 | return out
123 |
124 | class QuantConv2d(nn.Conv2d):
125 |
126 | def __init__(self, *kargs, **kwargs):
127 | self.bit = kwargs.pop('bit', 1)
128 | super(QuantConv2d, self).__init__(*kargs, **kwargs)
129 | assert self.bit > 0
130 | self.step = None if self.bit==32 else gaussian_steps[self.bit]
131 |
132 | def forward(self, input):
133 | # quantized conv, otherwise regular
134 | if self.bit < 32:
135 | quant_weight = _gauss_quantize.apply(self.weight, self.step, self.bit)
136 | out = F.conv2d(
137 | input, quant_weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
138 | else:
139 | out = F.conv2d(
140 | input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
141 | return out
142 |
143 | def export_quant(self):
144 | return _gauss_quantize_export(self.weight, self.step, self.bit)
145 |
146 | class QuantLinear(nn.Linear):
147 |
148 | def __init__(self, *kargs, **kwargs):
149 | self.bit = kwargs.pop('bit', 1)
150 | super(QuantLinear, self).__init__(*kargs, **kwargs)
151 | assert self.bit > 0
152 | self.step = gaussian_steps[self.bit]
153 |
154 | def forward(self, input):
155 | # quantized linear, otherwise regular
156 | if self.bit < 32:
157 | # assert self.bias is None
158 | quant_weight = _gauss_quantize.apply(self.weight, self.step, self.bit)
159 | out = F.linear(input, quant_weight, self.bias)
160 | else:
161 | out = F.linear(input, self.weight, self.bias)
162 | return out
163 |
164 | def export_quant(self):
165 | return _gauss_quantize_export(self.weight, self.step, self.bit)
166 |
167 | class QuantActivConv2d(nn.Module):
168 |
169 | def __init__(self, inplane, outplane, wbit=1, abit=2, ActQ = HWGQ, **kwargs):
170 | super(QuantActivConv2d, self).__init__()
171 | self.abit = abit
172 | self.wbit = wbit
173 | self.activ = ActQ(abit)
174 | self.conv = QuantConv2d(inplane, outplane, bit=wbit, **kwargs)
175 | # complexities
176 | stride = kwargs['stride'] if 'stride' in kwargs else 1
177 | if isinstance(kwargs['kernel_size'], tuple):
178 | kernel_size = kwargs['kernel_size'][0] * kwargs['kernel_size'][1]
179 | else:
180 | kernel_size = kwargs['kernel_size'] * kwargs['kernel_size']
181 | self.kernel_size = kwargs['kernel_size']
182 | if 'groups' in kwargs: groups = kwargs['groups']
183 | else: groups = 1
184 | self.inplane = inplane
185 | self.outplane = outplane
186 | self.groups = groups
187 | self.param_size = inplane * outplane *
kernel_size * 1e-6 / groups 188 | self.filter_size = self.param_size / float(stride ** 2.0) 189 | self.register_buffer('size_product', torch.tensor(0, dtype=torch.float)) 190 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 191 | self.register_buffer('in_width', torch.tensor(0, dtype=torch.float)) 192 | 193 | def forward(self, input): 194 | in_shape = input.shape 195 | tmp = torch.tensor(in_shape[1] * in_shape[2] * in_shape[3] * 1e-3, dtype=torch.float) 196 | self.memory_size.copy_(tmp) 197 | tmp = torch.tensor(self.filter_size * in_shape[-1] * in_shape[-2], dtype=torch.float) 198 | self.size_product.copy_(tmp) 199 | out = self.activ(input) 200 | tmp = torch.tensor(input.shape[3], dtype=torch.float) 201 | self.in_width.copy_(tmp) 202 | ## print('ii',input[0,0,:,0]/self.activ.step) 203 | ## print('convi', torch.round(out[0,0,:,0]/self.activ.step).int()) 204 | ## wstd = self.conv.weight.std() 205 | out = self.conv(out) 206 | ## print('convo', torch.round(out[0,0,:,0]/(self.activ.step*self.conv.step*wstd)).int()) 207 | return out 208 | 209 | 210 | class QuantActivLinear(nn.Module): 211 | 212 | def __init__(self, inplane, outplane, wbit=1, abit=2, **kwargs): 213 | super(QuantActivLinear, self).__init__() 214 | self.abit = abit 215 | self.wbit = wbit 216 | self.activ = HWGQ(abit) 217 | self.linear = QuantLinear(inplane, outplane, bit=wbit, **kwargs) 218 | # complexities 219 | self.param_size = inplane * outplane * 1e-6 220 | self.register_buffer('size_product', torch.tensor(self.param_size, dtype=torch.float)) 221 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 222 | 223 | def forward(self, input): 224 | tmp = torch.tensor(input.shape[1] * 1e-3, dtype=torch.float) 225 | self.memory_size.copy_(tmp) 226 | out = self.activ(input) 227 | ## print('ii',input[0,0,:,0]/self.activ.step) 228 | ## print('lineari', torch.round(out[0,:]/self.activ.step).int()) 229 | ## wstd = self.linear.weight.std() 230 | out = self.linear(out) 231 | ## print('linearo', torch.round(out[0,:]/(self.activ.step*self.linear.step*wstd)).int()) 232 | return out 233 | 234 | 235 | class MixQuantActiv(nn.Module): 236 | 237 | def __init__(self, bits, ActQ = HWGQ): 238 | super(MixQuantActiv, self).__init__() 239 | self.bits = bits 240 | self.alpha_activ = Parameter(torch.Tensor(len(self.bits))) 241 | self.alpha_activ.data.fill_(0.01) 242 | self.mix_activ = nn.ModuleList() 243 | for bit in self.bits: 244 | self.mix_activ.append(ActQ(bit=bit)) 245 | 246 | def forward(self, input): 247 | outs = [] 248 | sw = F.softmax(self.alpha_activ, dim=0) 249 | for i, branch in enumerate(self.mix_activ): 250 | outs.append(branch(input) * sw[i]) 251 | activ = sum(outs) 252 | return activ 253 | 254 | 255 | class MixQuantConv2d(nn.Module): 256 | 257 | def __init__(self, inplane, outplane, bits, **kwargs): 258 | super(MixQuantConv2d, self).__init__() 259 | assert not kwargs['bias'] 260 | self.bits = bits 261 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 262 | self.alpha_weight.data.fill_(0.01) 263 | self.conv_list = nn.ModuleList() 264 | self.steps = [] 265 | for bit in self.bits: 266 | assert 0 < bit < 32 267 | self.conv_list.append(nn.Conv2d(inplane, outplane, **kwargs)) 268 | self.steps.append(gaussian_steps[bit]) 269 | 270 | def forward(self, input): 271 | mix_quant_weight = [] 272 | sw = F.softmax(self.alpha_weight, dim=0) 273 | for i, bit in enumerate(self.bits): 274 | weight = self.conv_list[i].weight 275 | weight_std = weight.std().item() 276 | step = self.steps[i] * weight_std 
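# (added note) Each branch i quantizes its own weight copy with the step for
# bit-width bits[i], and the branches are blended by the softmax weights `sw`:
# the mixed weight is a differentiable expectation over bit-width choices (a
# DARTS-style relaxation). SharedMixQuantConv2d below is the variant where all
# branches share one float weight tensor; fetch_best_arch keeps the argmax.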
277 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 278 | scaled_quant_weight = quant_weight * sw[i] 279 | mix_quant_weight.append(scaled_quant_weight) 280 | mix_quant_weight = sum(mix_quant_weight) 281 | conv = self.conv_list[0] 282 | out = F.conv2d( 283 | input, mix_quant_weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups) 284 | return out 285 | 286 | 287 | class SharedMixQuantConv2d(nn.Module): 288 | 289 | def __init__(self, inplane, outplane, bits, **kwargs): 290 | super(SharedMixQuantConv2d, self).__init__() 291 | # assert not kwargs['bias'] 292 | self.bits = bits 293 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 294 | self.alpha_weight.data.fill_(0.01) 295 | self.conv = nn.Conv2d(inplane, outplane, **kwargs) 296 | self.steps = [] 297 | for bit in self.bits: 298 | assert 0 < bit < 32 299 | self.steps.append(gaussian_steps[bit]) 300 | 301 | def forward(self, input): 302 | mix_quant_weight = [] 303 | sw = F.softmax(self.alpha_weight, dim=0) 304 | conv = self.conv 305 | weight = conv.weight 306 | # save repeated std computation for shared weights 307 | weight_std = weight.std().item() 308 | for i, bit in enumerate(self.bits): 309 | step = self.steps[i] * weight_std 310 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 311 | scaled_quant_weight = quant_weight * sw[i] 312 | mix_quant_weight.append(scaled_quant_weight) 313 | mix_quant_weight = sum(mix_quant_weight) 314 | out = F.conv2d( 315 | input, mix_quant_weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups) 316 | return out 317 | 318 | 319 | class MixActivConv2d(nn.Module): 320 | 321 | def __init__(self, inplane, outplane, wbits=None, abits=None, share_weight=False, ActQ = HWGQ, **kwargs): 322 | super(MixActivConv2d, self).__init__() 323 | if wbits is None: 324 | self.wbits = [1, 2] 325 | else: 326 | self.wbits = wbits 327 | if abits is None: 328 | self.abits = [1, 2] 329 | else: 330 | self.abits = abits 331 | # build mix-precision branches 332 | self.mix_activ = MixQuantActiv(self.abits, ActQ = ActQ) 333 | self.share_weight = share_weight 334 | if share_weight: 335 | self.mix_weight = SharedMixQuantConv2d(inplane, outplane, self.wbits, **kwargs) 336 | else: 337 | self.mix_weight = MixQuantConv2d(inplane, outplane, self.wbits, **kwargs) 338 | # complexities 339 | stride = kwargs['stride'] if 'stride' in kwargs else 1 340 | if isinstance(kwargs['kernel_size'], tuple): 341 | kernel_size = kwargs['kernel_size'][0] * kwargs['kernel_size'][1] 342 | else: 343 | kernel_size = kwargs['kernel_size'] * kwargs['kernel_size'] 344 | self.kernel_size = kwargs['kernel_size'] 345 | 346 | if 'groups' in kwargs: groups = kwargs['groups'] 347 | else: groups = 1 348 | self.inplane = inplane 349 | self.outplane = outplane 350 | self.groups = groups 351 | self.param_size = inplane * outplane * kernel_size * 1e-6 / groups 352 | self.filter_size = self.param_size / float(stride ** 2.0) 353 | self.register_buffer('size_product', torch.tensor(0, dtype=torch.float)) 354 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 355 | self.register_buffer('in_width', torch.tensor(0, dtype=torch.float)) 356 | 357 | def forward(self, input): 358 | in_shape = input.shape 359 | tmp = torch.tensor(in_shape[1] * in_shape[2] * in_shape[3] * 1e-3, dtype=torch.float) 360 | self.memory_size.copy_(tmp) 361 | tmp = torch.tensor(self.filter_size * in_shape[-1] * in_shape[-2], dtype=torch.float) 362 | self.size_product.copy_(tmp) 363 | tmp = 
torch.tensor(input.shape[3], dtype=torch.float) 364 | self.in_width.copy_(tmp) 365 | out = self.mix_activ(input) 366 | out = self.mix_weight(out) 367 | return out 368 | 369 | def complexity_loss_trivial(self): 370 | sw = F.softmax(self.mix_activ.alpha_activ, dim=0) 371 | mix_abit = 0 372 | abits = self.mix_activ.bits 373 | for i in range(len(abits)): 374 | mix_abit += sw[i] * abits[i] 375 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 376 | mix_wbit = 0 377 | wbits = self.mix_weight.bits 378 | for i in range(len(wbits)): 379 | mix_wbit += sw[i] * wbits[i] 380 | complexity = self.size_product.item() * mix_abit * mix_wbit 381 | return complexity 382 | 383 | def complexity_loss(self): 384 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 385 | abits = self.mix_activ.bits 386 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 387 | mix_scale = 0 388 | wbits = self.mix_weight.bits 389 | 390 | if self.kernel_size == 1: 391 | factors = factors_k11 392 | elif self.kernel_size == 3: 393 | factors = factors_k33 394 | elif self.kernel_size == 5: 395 | factors = factors_k55 396 | else: 397 | raise NotImplementedError 398 | for i in range(len(wbits)): 399 | for j in range(len(abits)): 400 | mix_scale += sw[i] * sa[j] / factors[wbits[i]-2][abits[j]-2] 401 | complexity = self.size_product.item() * 64 * mix_scale 402 | return complexity 403 | 404 | def bram_loss(self): 405 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 406 | abits = self.mix_activ.bits 407 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 408 | wbits = self.mix_weight.bits 409 | 410 | if self.kernel_size == 1: 411 | bram_sw = 2 * self.in_width.item() * self.inplane 412 | else: # sliding window size 413 | bram_sw = (self.kernel_size+1)*self.in_width.item()*self.inplane 414 | bram_sw *= 1e-3 415 | 416 | mix_wbit, mix_abit = 0, 0 417 | for i in range(len(wbits)): 418 | mix_wbit += sw[i] * wbits[i] 419 | for i in range(len(abits)): 420 | mix_abit += sa[i] * abits[i] 421 | 422 | bram_weight = self.param_size * 1e3 * mix_wbit # kbit 423 | bram_cache = bram_sw * mix_abit # kbit 424 | 425 | bram = (bram_weight + bram_cache) * 64 426 | return bram 427 | 428 | def fetch_best_arch(self, layer_idx): 429 | size_product = float(self.size_product.cpu().numpy()) 430 | memory_size = float(self.memory_size.cpu().numpy()) 431 | prob_activ = F.softmax(self.mix_activ.alpha_activ, dim=0) 432 | prob_activ = prob_activ.detach().cpu().numpy() 433 | best_activ = prob_activ.argmax() 434 | mix_abit = 0 435 | abits = self.mix_activ.bits 436 | for i in range(len(abits)): 437 | mix_abit += prob_activ[i] * abits[i] 438 | prob_weight = F.softmax(self.mix_weight.alpha_weight, dim=0) 439 | prob_weight = prob_weight.detach().cpu().numpy() 440 | best_weight = prob_weight.argmax() 441 | mix_wbit = 0 442 | wbits = self.mix_weight.bits 443 | for i in range(len(wbits)): 444 | mix_wbit += prob_weight[i] * wbits[i] 445 | if self.share_weight: 446 | weight_shape = list(self.mix_weight.conv.weight.shape) 447 | else: 448 | weight_shape = list(self.mix_weight.conv_list[0].weight.shape) 449 | 450 | if self.kernel_size == 1: 451 | bram_sw = 2 * self.in_width.item() * self.inplane 452 | else: 453 | bram_sw = (self.kernel_size+1)*self.in_width.item()*self.inplane*self.outplane/self.groups 454 | bram_sw *= 1e-3 455 | 456 | print('idx {} with shape {}, activ alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 457 | 'memory: {:.3f}K * {:.3f}, cache: {:.3f}K'.format(layer_idx, weight_shape, prob_activ, size_product, 458 | mix_abit, mix_wbit, memory_size, mix_abit, bram_sw)) 459 
| print('idx {} with shape {}, weight alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 460 | 'param: {:.3f}M * {:.3f}'.format(layer_idx, weight_shape, prob_weight, size_product, 461 | mix_abit, mix_wbit, self.param_size, mix_wbit)) 462 | best_arch = {'best_activ': [best_activ], 'best_weight': [best_weight]} 463 | bitops = size_product * abits[best_activ] * wbits[best_weight] 464 | bita = memory_size * abits[best_activ] 465 | bitw = self.param_size * wbits[best_weight] 466 | 467 | if self.kernel_size == 1: 468 | factors = factors_k11 469 | elif self.kernel_size == 3: 470 | factors = factors_k33 471 | elif self.kernel_size == 5: 472 | factors = factors_k55 473 | else: 474 | raise NotImplementedError 475 | dsps = size_product / factors[wbits[best_weight]-2][abits[best_activ]-2] 476 | mixbitops = size_product * mix_abit * mix_wbit 477 | mixbita = memory_size * mix_abit 478 | mixbitw = self.param_size * mix_wbit 479 | mixdsps = 0 480 | for i in range(len(wbits)): 481 | for j in range(len(abits)): 482 | mixdsps += prob_weight[i] * prob_activ[j] / factors[wbits[i]-2][abits[j]-2] 483 | mixdsps *= size_product 484 | mixbram_weight = self.param_size * 1e3 * mix_wbit # kbit 485 | mixbram_cache = bram_sw * mix_abit # kbit 486 | 487 | return best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache 488 | 489 | 490 | class SharedMixQuantLinear(nn.Module): 491 | 492 | def __init__(self, inplane, outplane, bits, **kwargs): 493 | super(SharedMixQuantLinear, self).__init__() 494 | # assert not kwargs['bias'] 495 | self.bits = bits 496 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 497 | self.alpha_weight.data.fill_(0.01) 498 | self.linear = nn.Linear(inplane, outplane, **kwargs) 499 | self.steps = [] 500 | for bit in self.bits: 501 | assert 0 < bit < 32 502 | self.steps.append(gaussian_steps[bit]) 503 | 504 | def forward(self, input): 505 | mix_quant_weight = [] 506 | sw = F.softmax(self.alpha_weight, dim=0) 507 | linear = self.linear 508 | weight = linear.weight 509 | # save repeated std computation for shared weights 510 | weight_std = weight.std().item() 511 | for i, bit in enumerate(self.bits): 512 | step = self.steps[i] * weight_std 513 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 514 | scaled_quant_weight = quant_weight * sw[i] 515 | mix_quant_weight.append(scaled_quant_weight) 516 | mix_quant_weight = sum(mix_quant_weight) 517 | out = F.linear(input, mix_quant_weight, linear.bias) 518 | return out 519 | 520 | class MixActivLinear(nn.Module): 521 | def __init__(self, inplane, outplane, wbits=None, abits=None, share_weight=True, **kwargs): 522 | super(MixActivLinear, self).__init__() 523 | if wbits is None: 524 | self.wbits = [1, 2] 525 | else: 526 | self.wbits = wbits 527 | if abits is None: 528 | self.abits = [1, 2] 529 | else: 530 | self.abits = abits 531 | # build mix-precision branches 532 | self.mix_activ = MixQuantActiv(self.abits) 533 | assert share_weight 534 | self.share_weight = share_weight 535 | self.mix_weight = SharedMixQuantLinear(inplane, outplane, self.wbits, **kwargs) 536 | # complexities 537 | self.param_size = inplane * outplane * 1e-6 538 | self.register_buffer('size_product', torch.tensor(self.param_size, dtype=torch.float)) 539 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 540 | 541 | def forward(self, input): 542 | tmp = torch.tensor(input.shape[1] * 1e-3, dtype=torch.float) 543 | self.memory_size.copy_(tmp) 544 | out = self.mix_activ(input) 545 | out = 
self.mix_weight(out) 546 | return out 547 | 548 | def complexity_loss_old(self): 549 | sw = F.softmax(self.mix_activ.alpha_activ, dim=0) 550 | mix_abit = 0 551 | abits = self.mix_activ.bits 552 | for i in range(len(abits)): 553 | mix_abit += sw[i] * abits[i] 554 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 555 | mix_wbit = 0 556 | wbits = self.mix_weight.bits 557 | for i in range(len(wbits)): 558 | mix_wbit += sw[i] * wbits[i] 559 | complexity = self.size_product.item() * mix_abit * mix_wbit 560 | return complexity 561 | 562 | def complexity_loss(self): 563 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 564 | abits = self.mix_activ.bits 565 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 566 | mix_scale = 0 567 | wbits = self.mix_weight.bits 568 | for i in range(len(wbits)): 569 | for j in range(len(abits)): 570 | mix_scale += sw[i] * sa[j] / factors_k11[wbits[i]-2][abits[j]-2] 571 | complexity = self.size_product.item() * 64 * mix_scale 572 | return complexity 573 | 574 | def fetch_best_arch(self, layer_idx): 575 | size_product = float(self.size_product.cpu().numpy()) 576 | memory_size = float(self.memory_size.cpu().numpy()) 577 | prob_activ = F.softmax(self.mix_activ.alpha_activ, dim=0) 578 | prob_activ = prob_activ.detach().cpu().numpy() 579 | best_activ = prob_activ.argmax() 580 | mix_abit = 0 581 | abits = self.mix_activ.bits 582 | for i in range(len(abits)): 583 | mix_abit += prob_activ[i] * abits[i] 584 | prob_weight = F.softmax(self.mix_weight.alpha_weight, dim=0) 585 | prob_weight = prob_weight.detach().cpu().numpy() 586 | best_weight = prob_weight.argmax() 587 | mix_wbit = 0 588 | wbits = self.mix_weight.bits 589 | for i in range(len(wbits)): 590 | mix_wbit += prob_weight[i] * wbits[i] 591 | weight_shape = list(self.mix_weight.linear.weight.shape) 592 | print('idx {} with shape {}, activ alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 593 | 'memory: {:.3f}K * {:.3f}'.format(layer_idx, weight_shape, prob_activ, size_product, 594 | mix_abit, mix_wbit, memory_size, mix_abit)) 595 | print('idx {} with shape {}, weight alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 596 | 'param: {:.3f}M * {:.3f}'.format(layer_idx, weight_shape, prob_weight, size_product, 597 | mix_abit, mix_wbit, self.param_size, mix_wbit)) 598 | best_arch = {'best_activ': [best_activ], 'best_weight': [best_weight]} 599 | bitops = size_product * abits[best_activ] * wbits[best_weight] 600 | bita = memory_size * abits[best_activ] 601 | bitw = self.param_size * wbits[best_weight] 602 | dsps = size_product / factors_k11[wbits[best_weight]-2][abits[best_activ]-2] 603 | mixbitops = size_product * mix_abit * mix_wbit 604 | mixbita = memory_size * mix_abit 605 | mixbitw = self.param_size * mix_wbit 606 | mixdsps = 0 607 | for i in range(len(wbits)): 608 | for j in range(len(abits)): 609 | mixdsps += prob_weight[i] * prob_activ[j] / factors_k11[wbits[i]-2][abits[j]-2] 610 | mixdsps *= size_product 611 | return best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps 612 | -------------------------------------------------------------------------------- /cifar/export_hls.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from typing import Dict, List 4 | import torch 5 | import numpy as np 6 | import sys 7 | import os 8 | 9 | import sys 10 | sys.path.append('..') 11 | import models 12 | from utils.view_pt import select_weight_file 13 | from anypacking.quant_module import HWGQ, QuantConv2d, ImageInputQ, QuantLinear 14 | 15 
| class ConvParam: ... 16 | 17 | def write_hls_config(model_param, path): 18 | name_mapping = { 19 | 'k': 'K', 20 | #'s': 'S', 21 | #'p': 'P', 22 | 'ich': 'IFM_CH', 23 | 'irow': 'IFM_ROW', 24 | 'icol': 'IFM_COL', 25 | 'och': 'OFM_CH', 26 | 'orow': 'OFM_ROW', 27 | 'ocol': 'OFM_COL', 28 | 'abit': 'IN_BIT', 29 | 'wbit': 'W_BIT', 30 | 'incbit': 'INC_BIT', 31 | 'biasbit': 'BIAS_BIT', 32 | 'simd': 'SIMD', 33 | 'pe': 'PE', 34 | 'lshift': 'L_SHIFT' 35 | } 36 | content = f'''/******************************************************************************** 37 | * Filename: config.h 38 | * Date: {time.ctime()} 39 | * Description: This file is generated by {parser.prog} 40 | * ptfilename: {opt.weight} 41 | ********************************************************************************/ 42 | 43 | #ifndef _CONFIG_H_ 44 | #define _CONFIG_H_ 45 | 46 | ''' 47 | for n, conv_param in enumerate(model_param): 48 | content += f'// {conv_param.type}_{n}\n' 49 | for k, v in name_mapping.items(): 50 | if hasattr(conv_param, k): # e.g. conv_last has no incbit 51 | content += f'#define {conv_param.type.upper()}_{n}_{v} {getattr(conv_param, k)}\n' 52 | content += '\n' 53 | content += '#endif' 54 | 55 | with open(path + 'config.h', 'w') as f: 56 | print(content, file=f) 57 | 58 | def extract_model(in_shape): 59 | model_param: List[ConvParam] = [] 60 | feature_map_shape = in_shape 61 | conv_cnt = 0 62 | conv_cur = None 63 | for sub_module in model.modules(): 64 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 65 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 66 | print(' Detected ActQ Layer', end='') 67 | if conv_cur is None: conv_cur = ConvParam() 68 | 69 | conv_cur.abit = sub_module.bit 70 | conv_cur.astep = sub_module.step 71 | 72 | conv_cur.actq_class = type(sub_module).__name__ 73 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 74 | 75 | if conv_cnt: # previous.obit = cur.abit 76 | model_param[conv_cnt-1].obit = conv_cur.abit 77 | model_param[conv_cnt-1].ostep = conv_cur.astep 78 | 79 | elif isinstance(sub_module, torch.nn.Conv2d): 80 | if conv_cur is None: conv_cur = ConvParam() 81 | conv_cur.n = conv_cnt 82 | print('Extract conv_%d'%conv_cnt, end='') 83 | 84 | conv_cur.k = sub_module.kernel_size[0] 85 | conv_cur.s = sub_module.stride[0] 86 | conv_cur.p = sub_module.padding[0] 87 | conv_cur.ich = sub_module.in_channels 88 | conv_cur.och = sub_module.out_channels 89 | conv_cur.irow = feature_map_shape[1] 90 | conv_cur.icol = feature_map_shape[2] 91 | 92 | feature_map_shape[0] = sub_module.out_channels 93 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 94 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 95 | conv_cur.orow = feature_map_shape[1] 96 | conv_cur.ocol = feature_map_shape[2] 97 | 98 | assert sub_module.bias is None, 'inner conv has no bias in this model' 99 | if isinstance(sub_module, QuantConv2d): # New quant 100 | conv_cur.wbit = sub_module.bit 101 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 102 | else: 103 | raise NotImplementedError(sub_module) 104 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur))) 105 | 106 | conv_cur.type = 'conv' 107 | model_param.append(conv_cur) 108 | conv_cur = None 109 | conv_cnt 
+= 1
110 |
111 | elif isinstance(sub_module, torch.nn.Linear):
112 | if conv_cur is None: conv_cur = ConvParam() # TODO: independent type for linear layer
113 | conv_cur.n = conv_cnt
114 | print('Extract layer %d (linear layer)'%conv_cnt, end='')
115 |
116 | conv_cur.ich = sub_module.in_features
117 | conv_cur.och = sub_module.out_features
118 | conv_cur.irow = feature_map_shape[1]
119 | conv_cur.icol = feature_map_shape[2]
120 |
121 | if sub_module.bias is not None:
122 | conv_cur.convbias = sub_module.bias.detach().numpy()
123 | print(', +bias', end='')
124 |
125 | if isinstance(sub_module, QuantLinear): # New quant
126 | conv_cur.wbit = sub_module.bit
127 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantLinear.step because of alpha
128 |
129 | print(', ich {ich}, och {och}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur)))
130 |
131 | conv_cur.type = 'linear'
132 | model_param.append(conv_cur)
133 | conv_cur = None
134 | conv_cnt += 1
135 |
136 | elif isinstance(sub_module, torch.nn.BatchNorm2d):
137 | print(' Detected BatchNorm2d')
138 | gamma = sub_module.weight
139 | beta = sub_module.bias
140 | mean = sub_module.running_mean
141 | var = sub_module.running_var
142 | eps = sub_module.eps
143 |
144 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy()
145 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy()
146 |
147 | elif isinstance(sub_module, torch.nn.MaxPool2d):
148 | print(' Detected MaxPool2d')
149 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size
150 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size
151 |
152 | assert hasattr(model_param[0], 'abit')
153 |
154 | return model_param
155 |
156 | def process_batchnorm(model_param):
157 | '''process_batchnorm(model_param)
158 | Merge wstep, astep, ostep scale into batchnorm, then quantize.
159 |
160 | Method:
161 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b,
162 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep.
163 |
164 | outq = (MAC*BN_w + BN_b) / ostep
165 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep
166 | = MACq * inc_raw + bias_raw
167 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq))
168 |
169 | Quantization of inc_raw & bias_raw:
170 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T
171 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor
172 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!] the 2**(T-1) bias is done by hls code
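    Example (illustrative numbers, not taken from a real run):
        wbit=4, abit=4, lshift=16  =>  T = 16+4+4-1 = 23
        inc_raw = 0.003   =>  inc  = round(0.003*2**23)  = 25166      (incbit = 16)
        bias_raw = -1.25  =>  bias = round(-1.25*2**23)  = -10485760  (biasbit = 25)
        outq ~= (MACq*25166 - 10485760 + 2**22) >> 23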
173 |
174 | Params:
175 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal
176 | MBIT = wbit+abit+ceil(log2(sum_number))
177 | incbit = len(bit(inc)); biasbit = len(bit(bias))
178 | larger lshift is better, but MBIT+incbit<48
179 | '''
180 | lshift = 16
181 |
182 | for conv in model_param[:-1]:
183 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ')
184 |
185 | # Merge step to BN
186 | conv.lshift = lshift
187 | MACstep = conv.wstep * conv.astep
188 | ostep = conv.ostep
189 | inc_raw = conv.bn_w * MACstep / ostep
190 | bias_raw = conv.bn_b / ostep
191 | conv.inc_raw = inc_raw
192 | conv.bias_raw = bias_raw
193 |
194 | # Quantization
195 | T = lshift+conv.wbit+conv.abit-1
196 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64)
197 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64)
198 | conv.lshift_T = T
199 | # Get bitlength
200 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length()
201 | conv.incbit = bitlength(conv.inc)
202 | conv.biasbit = bitlength(conv.bias)
203 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}')
204 |
205 | conv_last = model_param[-1] # process lastbias
206 | conv_last.inc = None
207 | conv_last.div = 1/(conv_last.wstep * conv_last.astep)
208 | conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64)
209 | conv_last.biasbit = bitlength(conv_last.bias)
210 | print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}')
211 |
212 | def reorder_weight(model_param, layers_simd, layers_pe):
213 | '''reorder_weight(model_param)
214 | Reorder arrays for the HLS code.
215 | '''
216 |
217 | for conv in model_param:
218 | if conv.type == 'linear': #new reorder
219 | pe_l = 1
220 | simd_l = 1
221 | in_pe_l = 8
222 | w = conv.w.reshape(10, -1, 4, 4)
223 | w = w.reshape(10 // (2 * pe_l), pe_l, 2, 256 // in_pe_l, in_pe_l // simd_l, simd_l, 4, 4) #[OUT_CH/2PE, PE, 2, IN_CH/IN_PE, IN_PE/SIMD, SIMD, H, W]
224 | w = w.transpose(1, 6, 3, 7, 0, 4, 5, 2) #[PE, H, IN_CH/IN_PE, W, OUT_CH/2PE, IN_PE/SIMD, SIMD, 2]
225 | w = w.reshape(w.shape[0], w.shape[1], w.shape[2], w.shape[3], w.shape[4], w.shape[5], -1) #[PE, H, IN_CH/IN_PE, W, OUT_CH/2PE, IN_PE/SIMD, SIMD * 2]
226 | print(w.shape)
227 | conv.w = w
228 | continue
229 |
230 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='')
231 | conv.simd = layers_simd[conv.n]
232 | conv.pe = layers_pe[conv.n]
233 |
234 | # process batchnorm
235 | if conv.inc is not None:
236 | conv.inc = conv.inc.reshape(conv.och//conv.pe, conv.pe).T
237 | if conv.bias is not None:
238 | conv.bias = conv.bias.reshape(conv.och//conv.pe, conv.pe).T
239 |
240 | # process conv weight
241 | w = conv.w # [och, ich, kr, kc]
242 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}"
243 | assert conv.k*conv.ich%conv.simd == 0, f"conv_{conv.n}, ich {conv.ich}, k {conv.k}, simd {conv.simd}"
244 |
245 | # if conv.n==0: # first layer is different
246 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich]
247 | # else:
248 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich]
249 |
250 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*conv.ich//conv.simd, conv.simd)
251 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd]
252 | w = w.reshape(conv.pe, conv.k, -1, conv.simd) # hls format [pe, k, och/pe*k*ich/simd, simd]
253 |
254 | if conv.k == 1: # kernel size=1
255 | w = w.reshape(conv.pe, -1, conv.simd)
256 | print(' ->', w.shape)
257 |
258 | conv.w = w
259 |
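# (added worked example, shapes assumed) reorder_weight turns a conv weight
# [och, ich, kr, kc] into the HLS layout [pe, k, och/pe * k*ich/simd, simd].
# With och=64, ich=64, k=3, pe=8, simd=8:
#   [64, 64, 3, 3] --transpose(0,3,2,1)--> [64, 3, 3, 64]
#   --reshape--> [8, 8, 3, 24, 8] --transpose(1,2,0,3,4)--> [8, 3, 8, 24, 8]
#   --reshape--> [8, 3, 192, 8]
# i.e. each PE gets its own stream of SIMD-wide words, ordered kernel-column
# first, so write_hls_weights below can pack each size-8 row into one ap_uint.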
260 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0):
261 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop:
262 | print(str_func(arr), file=file, end='')
263 | return
264 | ends = '' if (len(arr.shape)==stop+1) else '\n'
265 | print('{', file=file, end='')
266 | for i, item in enumerate(arr):
267 | print_ndarray_recursion(item, str_func, file, stop)
268 | if i!=len(arr)-1: print(',', file=file, end=ends)
269 | print(ends+'}', file=file, end='')
270 |
271 | def write_hls_linearlayer(layer, f):
272 | n = layer.n
273 | print(f"// layer: {n}, wbit: {layer.wbit}", file=f)
274 | hex_str = lambda x: '"' + hex(x) + '"'
275 | print(f"const ap_int<{layer.wbit}> linear_{n}_w[{layer.och}][{layer.ich}]=", file=f)
276 | print_ndarray_recursion(layer.w, hex_str, f)
277 | print(';', file=f)
278 |
279 | if layer.bias is not None:
280 | print(f"const ap_int<{layer.biasbit}> linear_{n}_bias[{layer.och}]=", file=f)
281 | print_ndarray_recursion(layer.bias, hex_str, f)
282 | print(';', file=f)
283 |
284 | def write_hls_linearlayer_reorder(layer, d0, d1, d2, d3, d4, d5, d6, f):
285 | n = layer.n
286 | print(f"// layer: {n}, wbit: {layer.wbit}", file=f)
287 | hex_str = lambda x: '"' + hex(x) + '"'
288 | def pack1d_str(arr): # arr: 1d-array
289 | x = 0
290 | # print(arr.shape)
291 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation
292 | v = int(v) # use python bignumber, not np.int
293 | assert -1<<(layer.wbit-1) <= v < 1<<(layer.wbit-1)
294 | x = x * 2**layer.wbit + (v & (2**layer.wbit - 1))
295 | return '"' + hex(x) + '"'
296 | print(f"const ap_uint<{layer.wbit * d6}> linear_{n}_w[{d0}][{d1}][{d2}][{d3}][{d4}][{d5}]=", file=f)
297 | print_ndarray_recursion(layer.w, pack1d_str, f, stop=1)
298 | print(';', file=f)
299 |
300 | if layer.bias is not None:
301 | print(f"const ap_int<{layer.biasbit}> linear_{n}_bias[{layer.och}]=", file=f)
302 | print_ndarray_recursion(layer.bias, hex_str, f)
303 | print(';', file=f)
304 |
305 | def write_hls_weights(model_param, path):
306 | '''write_hls_weights(model_param, path)
307 | Write hls weights+inc+bias array code according to numpy shape.
308 | '''
309 | f = open(path + 'weights.hpp', 'w')
310 |
311 | print(f'''/********************************************************************************
312 | * Filename: weights.hpp
313 | * Date: {time.ctime()}
314 | * Description: This file is generated by {parser.prog}
315 | * ptfilename: {opt.weight}
316 | ********************************************************************************/
317 |
318 | #ifndef _WEIGHTS_HPP_
319 | #define _WEIGHTS_HPP_
320 | #include <ap_int.h>
321 | ''', file=f)
322 |
323 | for conv in model_param:
324 | if conv.type == 'linear':
325 | pe_pr = conv.w.shape[0]
326 | h_pr = conv.w.shape[1]
327 | inch_inpe_pr = conv.w.shape[2]
328 | w_pr = conv.w.shape[3]
329 | outch_2pe_pr = conv.w.shape[4]
330 | inpe_simd_pr = conv.w.shape[5]
331 | simd2_pr = conv.w.shape[6]
332 | write_hls_linearlayer_reorder(conv, pe_pr, h_pr, inch_inpe_pr, w_pr, outch_2pe_pr, inpe_simd_pr, simd2_pr, f)
333 | continue
334 |
335 | n = conv.n
336 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, wbit {conv.wbit}")
337 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, wbit: {conv.wbit}", file=f)
338 |
339 | # print conv weight, merge [SIMD] value into one ap_uint
340 | if conv.k>1:
341 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f)
342 | else:
343 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f)
344 | hex_str = lambda x: '"' + hex(x) + '"'
345 | def pack1d_str(arr): # arr: 1d-array
346 | x = 0
347 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation
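# (added example, values assumed) packing SIMD=3 values with wbit=4 and
# arr=[3, -2, 5]: the reversed iteration order is 5, -2, 3, giving
#   x = ((5 & 0xF)*16 + (-2 & 0xF))*16 + (3 & 0xF) = 0x5e3
# so the literal written to weights.hpp is "0x5e3" (an ap_uint<12>).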
348 | v = int(v) # use python bignumber, not np.int
349 | assert -1<<(conv.wbit-1) <= v < 1<<(conv.wbit-1)
350 | x = x * 2**conv.wbit + (v & (2**conv.wbit - 1))
351 | return '"' + hex(x) + '"'
352 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1)
353 | print(';', file=f)
354 |
355 | # print inc, bias
356 | if conv.inc is not None:
357 | print(f"const ap_int<{conv.incbit}> conv_{n}_inc[{conv.pe}][{conv.och//conv.pe}]=", file=f)
358 | print_ndarray_recursion(conv.inc, hex_str, f)
359 | print(';', file=f)
360 | if conv.bias is not None:
361 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.pe}][{conv.och//conv.pe}]=", file=f)
362 | print_ndarray_recursion(conv.bias, hex_str, f)
363 | print(';', file=f)
364 |
365 | print('#endif', file=f)
366 | f.close()
367 |
368 | def adjust_weight(model_param):
369 | # special_wa_bit = ((5,6), (7,3)) # These packings can't quantize to -2**(wbit-1)
370 | special_wa_bit = ((4, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (7, 2), (7, 3)) # These packings can't quantize to -2**(wbit-1)
371 | for conv in model_param:
372 | if (conv.wbit, conv.abit) in special_wa_bit:
373 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}')
374 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1)
375 |
376 | if __name__=='__main__':
377 | parser = argparse.ArgumentParser()
378 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/')
379 | parser.add_argument('-m', '--model', default='VGG_tiny_FixQ', help = 'model class name in models.py')
380 | parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe', help = '.txt file in ./hls/')
381 | opt = parser.parse_args()
382 | if opt.weight is None: opt.weight = select_weight_file()
383 |
384 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1)
385 | dir_output = 'hls/' + opt.weight + '/'
386 | if not os.path.exists(dir_output): os.makedirs(dir_output)
387 |
388 | # load model and state_dict
389 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu')
390 | model = getattr(models, opt.model)(**ptfile.setdefault('model_params', {}))
391 | model.load_state_dict(ptfile['model'], strict = False)
392 |
393 | # process
394 | model_param = extract_model([1, 32, 32])
395 | adjust_weight(model_param)
396 | process_batchnorm(model_param) # get bn param before write hls config
397 | torch.save(model_param, dir_output + 'model_param.pkl')
398 |
399 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1]) # get pe, simd param before write hls config
400 | write_hls_config(model_param, dir_output)
401 | write_hls_weights(model_param, dir_output)
402 |
--------------------------------------------------------------------------------
/cifar/hls/config_simd_pe.txt:
--------------------------------------------------------------------------------
1 | simd pe
2 | 3 4
3 | 8 8
4 | 8 8
5 | 8 8
6 | 8 8
7 | 8 8
8 |
--------------------------------------------------------------------------------
/cifar/main_train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import torch
5 | import torch.nn as nn
6 | import torchvision
7 | import torchvision.transforms as transforms
8 | import torch.optim as optim
9 | import numpy as np
10 | from tqdm import tqdm
11 |
12 | import sys
13 | sys.path.append('..')
14 |
15 | from localconfig import data_path
16 | import models
17 | from test_acc import test
18 | from utils import torch_utils
19 |
20 | transform_train = transforms.Compose([
21 | transforms.RandomCrop(32, padding=4),
22 | transforms.RandomHorizontalFlip(),
23 | transforms.ToTensor(),
24 | models.InputFactor(),
25 | ])
26 |
27 | trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,
28 | download=False,
transform=transform_train) 29 | classes = ('plane', 'car', 'bird', 'cat', 30 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 31 | 32 | 33 | def train(): 34 | torch_utils.init_seeds() 35 | 36 | model = models.VGG_tiny_FixQ(bitw = opt.bitw, bita = opt.bita) 37 | model.to(device) 38 | if opt.weights is not None: 39 | weights_file = 'weights/' + opt.weights + '.pt' 40 | chkpt = torch.load(weights_file, map_location=device) 41 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if 42 | model.state_dict()[k].numel() == v.numel()} 43 | model.load_state_dict(chkpt['model'], strict=False) 44 | 45 | results_file = 'results/%s.txt'%opt.name 46 | 47 | criterion = nn.CrossEntropyLoss() 48 | optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4) 49 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 50 | optimizer, T_max=opt.epochs, eta_min=opt.lr*0.01) 51 | 52 | model.train() 53 | 54 | start_epoch, epochs = 0, opt.epochs 55 | train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, shuffle=True, num_workers=2) 56 | test_best_acc = 0.0 57 | 58 | test(model, device) 59 | bops, bita, bitw, dsps = model.fetch_arch_info() 60 | print('model with bops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format(bops, bita, bitw, dsps)) 61 | 62 | for epoch in range(start_epoch, epochs): 63 | model.train() 64 | mloss = macc = 0. 65 | pbar = tqdm(enumerate(train_loader), total=len(train_loader)) 66 | for i, (inputs, labels) in pbar: 67 | inputs, labels = inputs.to(device), labels.to(device) 68 | 69 | optimizer.zero_grad() 70 | 71 | outputs = model(inputs) 72 | _, predicted = torch.max(outputs.data, 1) 73 | correct = (predicted == labels).sum().item() 74 | loss = criterion(outputs, labels) 75 | loss.backward() 76 | optimizer.step() 77 | 78 | mloss = (mloss*i + loss.item()) / (i+1) 79 | macc = (macc*i + correct/opt.batch_size) / (i+1) 80 | s = '%10s%10.2f%10.3g'%('%d/%d'%(epoch,epochs-1), macc*100, mloss) 81 | pbar.set_description(s) 82 | 83 | scheduler.step() 84 | results = test(model, device) 85 | with open(results_file, 'a') as f: 86 | f.write(s + '%10.2f%10.3g'% results + '\n') 87 | test_acc = results[0] 88 | test_best_acc = max(test_best_acc, test_acc) 89 | 90 | final_epoch = epoch == epochs-1 91 | if True or final_epoch: 92 | with open(results_file, 'r') as f: 93 | chkpt = {'epoch': epoch, 94 | 'training_results': f.read(), 95 | 'model': model.module.state_dict() if type( 96 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 97 | 'optimizer': None if final_epoch else optimizer.state_dict(), 98 | 'model_params': model.model_params, # arch param 99 | 'extra': {'time': time.ctime(), 'name': opt.name}} 100 | # Save last checkpoint 101 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 102 | 103 | if test_acc == test_best_acc: 104 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 105 | 106 | print('Finished Training') 107 | 108 | with open('results.csv', 'a') as f: 109 | print("fixed,%s,%d/%d, , ,%s,%s,%.1f,%.1f, , , ,%d, ,%.3f, "% 110 | (opt.name,epochs-1,epochs,opt.bitw,opt.bita,macc*100,(test_acc+test_best_acc)/2, 111 | int(round(bops)), dsps), file=f) 112 | 113 | # torch.save(net.state_dict(), 'lenet_cifar10.pth') 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('-n', '--name', default='VGG_tiny_FixQ', help='result and weight file name') 118 | parser.add_argument('-w', '--weights', default=None, help='weights path') 119 | parser.add_argument('-e', '--epochs', 
type=int, default=200) 120 | parser.add_argument('--batch-size', type=int, default=128) 121 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 122 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 123 | parser.add_argument('--lr', type=float, default=0.03) 124 | parser.add_argument('--mixm', type=str) 125 | parser.add_argument('--bitw', type=str, default='') 126 | parser.add_argument('--bita', type=str, default='') 127 | 128 | opt = parser.parse_args() 129 | 130 | if opt.mixm is not None: 131 | wmix = torch.load('weights/%s.pt'%opt.mixm) 132 | opt.bitw = wmix['extra']['bestw'] 133 | opt.bita = wmix['extra']['besta'] 134 | del wmix 135 | 136 | print(opt) 137 | 138 | wdir = 'weights' + os.sep # weights dir 139 | last = wdir + '%s_last.pt'%opt.name 140 | 141 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 142 | 143 | train() 144 | -------------------------------------------------------------------------------- /cifar/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from anypacking import quant_module as qm 5 | 6 | class InputFactor: 7 | def __call__(self, pic): 8 | return pic * 255.0 / 256.0 9 | 10 | class LeNet(nn.Module): 11 | def __init__(self): 12 | super(LeNet,self).__init__() 13 | conv=nn.Conv2d 14 | self.conv1 = conv(3,6,5) 15 | self.conv2 = conv(6,16,5) 16 | self.fc1 = nn.Linear(16*5*5,120) 17 | self.fc2 = nn.Linear(120,84) 18 | self.fc3 = nn.Linear(84,10) 19 | 20 | def forward(self,x): 21 | x = F.max_pool2d(F.relu(self.conv1(x)),(2,2)) 22 | x = F.max_pool2d(F.relu(self.conv2(x)),2) 23 | x = x.view(x.size()[0],-1) 24 | x = F.relu(self.fc1(x)) 25 | x = F.relu(self.fc2(x)) 26 | x = self.fc3(x) 27 | return x 28 | 29 | class VGG_small(nn.Module): 30 | def __init__(self, num_classes=10): 31 | super(VGG_small, self).__init__() 32 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 33 | self.nonlinear = nn.ReLU(inplace=True) 34 | 35 | self.layers = nn.Sequential( 36 | nn.Conv2d(3, 128, kernel_size=3, padding=1, bias=False), # 0 37 | nn.BatchNorm2d(128), 38 | self.nonlinear, 39 | 40 | nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False), # 1 41 | self.pooling, 42 | nn.BatchNorm2d(128), 43 | self.nonlinear, 44 | 45 | nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False), # 2 46 | nn.BatchNorm2d(256), 47 | self.nonlinear, 48 | 49 | nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False), # 3 50 | self.pooling, 51 | nn.BatchNorm2d(256), 52 | self.nonlinear, 53 | 54 | nn.Conv2d(256, 512, kernel_size=3, padding=1, bias=False), # 4 55 | nn.BatchNorm2d(512), 56 | self.nonlinear, 57 | 58 | nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False), # 5 59 | self.pooling, 60 | nn.BatchNorm2d(512), 61 | self.nonlinear, 62 | 63 | nn.Flatten(), 64 | nn.Linear(512*4*4, num_classes) 65 | ) 66 | 67 | 68 | def forward(self, x): 69 | return self.layers(x) 70 | 71 | class VGG_tiny(nn.Module): 72 | def __init__(self, num_classes=10): 73 | super(VGG_tiny, self).__init__() 74 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 75 | self.nonlinear = nn.ReLU(inplace=True) 76 | 77 | self.layers = nn.Sequential( 78 | nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False), # 0 79 | nn.BatchNorm2d(64), 80 | self.nonlinear, 81 | 82 | nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False), # 1 83 | self.pooling, 84 | nn.BatchNorm2d(64), 85 | self.nonlinear, 86 | 87 | nn.Conv2d(64, 128, 
kernel_size=3, padding=1, bias=False), # 2 88 | nn.BatchNorm2d(128), 89 | self.nonlinear, 90 | 91 | nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False), # 3 92 | self.pooling, 93 | nn.BatchNorm2d(128), 94 | self.nonlinear, 95 | 96 | nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False), # 4 97 | nn.BatchNorm2d(256), 98 | self.nonlinear, 99 | 100 | nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False), # 5 101 | self.pooling, 102 | nn.BatchNorm2d(256), 103 | self.nonlinear, 104 | 105 | nn.Flatten(), 106 | nn.Linear(256*4*4, num_classes) 107 | ) 108 | 109 | def forward(self, x): 110 | return self.layers(x) 111 | 112 | 113 | class VGG_tiny_MixQ(nn.Module): 114 | def __init__(self, num_classes=10, share_weight = True): 115 | super(VGG_tiny_MixQ, self).__init__() 116 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 117 | self.conv_func = qm.MixActivConv2d 118 | conv_func = self.conv_func 119 | 120 | conv_kwargs = {'kernel_size':3, 'stride':1, 'padding':1, 'bias':False} 121 | qspace = {'wbits':[2,3,4,5,6,7,8], 'abits':[2,3,4,5,6,7,8], 'share_weight': share_weight} 122 | 123 | self.layers = nn.Sequential( 124 | conv_func(3, 64, ActQ = qm.ImageInputQ, **conv_kwargs, **qspace), # 0 125 | nn.BatchNorm2d(64), 126 | 127 | conv_func(64, 64, **conv_kwargs, **qspace), # 1 128 | nn.BatchNorm2d(64), 129 | self.pooling, 130 | 131 | conv_func(64, 128, **conv_kwargs, **qspace), # 2 132 | nn.BatchNorm2d(128), 133 | 134 | conv_func(128, 128, **conv_kwargs, **qspace), # 3 135 | nn.BatchNorm2d(128), 136 | self.pooling, 137 | 138 | conv_func(128, 256, **conv_kwargs, **qspace), # 4 139 | nn.BatchNorm2d(256), 140 | 141 | conv_func(256, 256, **conv_kwargs, **qspace), # 5 142 | nn.BatchNorm2d(256), 143 | self.pooling, 144 | 145 | nn.Flatten(), 146 | qm.QuantActivLinear(256*4*4, num_classes, bias=True, wbit=8, abit=8) 147 | ) 148 | 149 | def forward(self, x): 150 | return self.layers(x) 151 | 152 | def fetch_best_arch(self): 153 | sum_bitops, sum_bita, sum_bitw, sum_dsps = 0, 0, 0, 0 154 | sum_mixbitops, sum_mixbita, sum_mixbitw, sum_mixdsps = 0, 0, 0, 0 155 | layer_idx = 0 156 | best_arch = None 157 | for m in self.modules(): 158 | if isinstance(m, self.conv_func): 159 | layer_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = m.fetch_best_arch(layer_idx) 160 | if best_arch is None: 161 | best_arch = layer_arch 162 | else: 163 | for key in layer_arch.keys(): 164 | if key not in best_arch: 165 | best_arch[key] = layer_arch[key] 166 | else: 167 | best_arch[key].append(layer_arch[key][0]) 168 | sum_bitops += bitops 169 | sum_bita += bita 170 | sum_bitw += bitw 171 | sum_mixbitops += mixbitops 172 | sum_mixbita += mixbita 173 | sum_mixbitw += mixbitw 174 | sum_dsps += dsps 175 | sum_mixdsps += mixdsps 176 | layer_idx += 1 177 | return best_arch, sum_bitops, sum_bita, sum_bitw, sum_mixbitops, sum_mixbita, sum_mixbitw, sum_dsps, sum_mixdsps 178 | 179 | def complexity_loss(self): 180 | size_product = [] 181 | loss = 0 182 | for m in self.modules(): 183 | if isinstance(m, self.conv_func): 184 | loss += m.complexity_loss() 185 | size_product += [m.size_product] 186 | normalizer = size_product[0].item() 187 | loss /= normalizer 188 | return loss 189 | 190 | def complexity_loss_trivial(self): 191 | size_product = [] 192 | loss = 0 193 | for m in self.modules(): 194 | if isinstance(m, self.conv_func): 195 | loss += m.complexity_loss_trivial() 196 | size_product += [m.size_product] 197 | normalizer = size_product[0].item() 198 | loss /= normalizer 199 | return 
loss
200 |
201 | class VGG_tiny_FixQ(nn.Module):
202 | def __init__(self, num_classes=10, bitw = '444444', bita = '844444'):
203 | super(VGG_tiny_FixQ, self).__init__()
204 | self.conv_func = qm.QuantActivConv2d
205 | conv_func = self.conv_func
206 |
207 | assert(len(bitw)==0 or len(bitw)==6)
208 | assert(len(bita)==0 or len(bita)==6)
209 | if isinstance(bitw, str):
210 | bitw=list(map(int, bitw))
211 | if isinstance(bita, str):
212 | bita=list(map(int, bita))
213 |
214 | self.bitw = bitw
215 | self.bita = bita
216 | self.model_params = {'bitw': bitw, 'bita': bita}
217 |
218 | conv_kwargs = {'kernel_size':3, 'stride':1, 'padding':1, 'bias':False}
219 |
220 | self.layers = nn.Sequential(
221 | conv_func(3, 64, ActQ = qm.ImageInputQ, **conv_kwargs, wbit=bitw[0], abit=bita[0]), # 0
222 | nn.BatchNorm2d(64),
223 |
224 | conv_func(64, 64, **conv_kwargs, wbit=bitw[1], abit=bita[1]), # 1
225 | nn.BatchNorm2d(64),
226 | nn.MaxPool2d(kernel_size=2, stride=2),
227 |
228 | conv_func(64, 128, **conv_kwargs, wbit=bitw[2], abit=bita[2]), # 2
229 | nn.BatchNorm2d(128),
230 |
231 | conv_func(128, 128, **conv_kwargs, wbit=bitw[3], abit=bita[3]), # 3
232 | nn.BatchNorm2d(128),
233 | nn.MaxPool2d(kernel_size=2, stride=2),
234 |
235 | conv_func(128, 256, **conv_kwargs, wbit=bitw[4], abit=bita[4]), # 4
236 | nn.BatchNorm2d(256),
237 |
238 | conv_func(256, 256, **conv_kwargs, wbit=bitw[5], abit=bita[5]), # 5
239 | nn.BatchNorm2d(256),
240 | nn.MaxPool2d(kernel_size=2, stride=2),
241 |
242 | nn.Flatten(),
243 | qm.QuantActivLinear(256*4*4, num_classes, bias=True, wbit=8, abit=8)
244 | )
245 |
246 | def forward(self, x):
247 | return self.layers(x)
248 |
249 | def fetch_arch_info(self):
250 | sum_bitops, sum_bita, sum_bitw, sum_dsps = 0, 0, 0, 0
251 | layer_idx = 0
252 | for m in self.modules():
253 | if isinstance(m, self.conv_func):
254 | size_product = m.size_product.item()
255 | memory_size = m.memory_size.item()
256 | bitops = size_product * m.abit * m.wbit
257 | bita = m.memory_size.item() * m.abit
258 | bitw = m.param_size * m.wbit
259 | dsps = size_product / qm.factors_k33[m.wbit-2][m.abit-2]
260 | weight_shape = list(m.conv.weight.shape)
261 | print('idx {} with shape {}, bitops: {:.3f}M * {} * {}, memory: {:.3f}K * {}, '
262 | 'param: {:.3f}M * {}, dsps: {:.3f}M'.format(layer_idx, weight_shape, size_product, m.abit,
263 | m.wbit, memory_size, m.abit, m.param_size, m.wbit, dsps))
264 | sum_bitops += bitops
265 | sum_bita += bita
266 | sum_bitw += bitw
267 | sum_dsps += dsps
268 | layer_idx += 1
269 | return sum_bitops, sum_bita, sum_bitw, sum_dsps
270 |
--------------------------------------------------------------------------------
/cifar/search_train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import torch
5 | import torch.nn as nn
6 | import torchvision
7 | import torchvision.transforms as transforms
8 | import torch.optim as optim
9 | import numpy as np
10 | from tqdm import tqdm
11 |
12 | import sys
13 | sys.path.append('..')
14 |
15 | from localconfig import data_path
16 | import models
17 | from utils import torch_utils
18 | from test_acc import test
19 |
20 | transform_train = transforms.Compose([
21 | transforms.RandomCrop(32, padding=4),
22 | transforms.RandomHorizontalFlip(),
23 | transforms.ToTensor(),
24 | models.InputFactor(),
25 | ])
26 |
27 | trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,
28 | download=False, transform=transform_train)
29 | classes = ('plane', 'car',
'bird', 'cat', 30 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 31 | 32 | 33 | def train(): 34 | torch_utils.init_seeds() 35 | 36 | model = models.VGG_tiny_MixQ(10, not opt.noshare) 37 | model.to(device) 38 | 39 | results_file = 'results/%s.txt'%opt.name 40 | 41 | criterion = nn.CrossEntropyLoss() 42 | 43 | params, alpha_params = [], [] 44 | for name, param in model.named_parameters(): 45 | if 'alpha' in name: 46 | alpha_params += [param] 47 | else: 48 | params += [param] 49 | optimizer = optim.SGD(params, lr=opt.lr, momentum=0.9, weight_decay=5e-4) 50 | arch_optimizer = torch.optim.SGD(alpha_params, opt.lra, momentum=0.9, weight_decay=5e-4) 51 | 52 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 53 | optimizer, T_max=opt.epochs, eta_min=opt.lr*0.01) 54 | arch_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 55 | arch_optimizer, T_max=opt.epochs, eta_min=opt.lr*0.3) 56 | 57 | model.train() 58 | 59 | start_epoch, epochs = 0, opt.epochs 60 | train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, shuffle=True, num_workers=2) 61 | test_best_acc = 0.0 62 | 63 | for epoch in range(start_epoch, epochs): 64 | model.train() 65 | mloss = macc = 0. 66 | pbar = tqdm(enumerate(train_loader), total=len(train_loader)) 67 | for i, (inputs, labels) in pbar: 68 | inputs, labels = inputs.to(device), labels.to(device) 69 | 70 | optimizer.zero_grad() 71 | arch_optimizer.zero_grad() 72 | 73 | outputs = model(inputs) 74 | _, predicted = torch.max(outputs.data, 1) 75 | correct = (predicted == labels).sum().item() 76 | loss = criterion(outputs, labels) 77 | 78 | if opt.complexity_decay != 0 or opt.complexity_decay_trivial!=0: 79 | loss_complexity = opt.complexity_decay * model.complexity_loss() + \ 80 | opt.complexity_decay_trivial * model.complexity_loss_trivial() 81 | loss += loss_complexity 82 | 83 | loss.backward() 84 | optimizer.step() 85 | arch_optimizer.step() 86 | 87 | mloss = (mloss*i + loss.item()) / (i+1) 88 | macc = (macc*i + correct/opt.batch_size) / (i+1) 89 | s = '%10s%10.2f%10.3g'%('%d/%d'%(epoch,epochs-1), macc*100, mloss) 90 | pbar.set_description(s) 91 | 92 | print('========= architecture =========') 93 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps = model.fetch_best_arch() 94 | print('best model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format( 95 | bitops, bita, bitw, dsps)) 96 | print('expected model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format( 97 | mixbitops, mixbita, mixbitw, mixdsps)) 98 | bestw_str = "".join([str(x+2) for x in best_arch["best_weight"]]) 99 | besta_str = "".join([str(x+2) for x in best_arch["best_activ"]]) 100 | print(f'best_weight: {best_arch["best_weight"]}') 101 | print(f'best_activ: {best_arch["best_activ"]}') 102 | 103 | scheduler.step() 104 | arch_scheduler.step() 105 | 106 | results = test(model, device) 107 | with open(results_file, 'a') as f: 108 | f.write(s + '%10.2f%10.3g'% results + '\n') 109 | test_acc = results[0] 110 | test_best_acc = max(test_best_acc, test_acc) 111 | 112 | final_epoch = epoch == epochs-1 113 | if True or final_epoch: 114 | with open(results_file, 'r') as f: 115 | chkpt = {'epoch': epoch, 116 | 'training_results': f.read(), 117 | 'model': model.module.state_dict() if type( 118 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 119 | 'optimizer': None if final_epoch else optimizer.state_dict(), 120 | 'arch_optimizer': None if final_epoch else arch_optimizer.state_dict(), 121 | 'extra': 
{'time': time.ctime(), 'name': opt.name, 'bestw': bestw_str, 'besta': besta_str}} 122 | # Save last checkpoint 123 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 124 | 125 | if test_acc == test_best_acc: 126 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 127 | 128 | print('Finished Training') 129 | 130 | with open('results.csv', 'a') as f: 131 | print("mixed,%s,%d/%d, , , , ,%.1f,%.1f, ,%s,%s,%d,%d,%.3f,%.3f"% 132 | (opt.name,epochs-1,epochs,macc*100,(test_acc+test_best_acc)/2, 133 | bestw_str,besta_str, 134 | int(round(bitops)), int(round(mixbitops)), dsps, mixdsps), file=f) 135 | 136 | # torch.save(net.state_dict(), 'lenet_cifar10.pth') 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--epochs', type=int, default=40) 141 | parser.add_argument('--batch-size', type=int, default=128) 142 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 143 | parser.add_argument('--lr', type=float, default=0.1) 144 | parser.add_argument('--name', default='', help='result and weight file name') 145 | parser.add_argument('--noshare', action='store_true', help='no share weight') 146 | parser.add_argument('--complexity-decay', '--cd', default=0, type=float, metavar='W', help='complexity decay (default: 0)') 147 | parser.add_argument('--complexity-decay-trivial', '--cdt', default=0, type=float, metavar='W', help='complexity decay w/o hardware-aware') 148 | parser.add_argument('--lra', '--learning-rate-alpha', default=0.1, type=float, metavar='LR', help='initial alpha learning rate') 149 | 150 | opt = parser.parse_args() 151 | print(opt) 152 | wdir = 'weights' + os.sep # weights dir 153 | last = wdir + '%s_last.pt'%opt.name 154 | 155 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 156 | 157 | train() 158 | -------------------------------------------------------------------------------- /cifar/simulate_hw.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from export_hls import ConvParam 8 | from test_acc import testset 9 | from utils.view_pt import select_weight_file 10 | 11 | class QConvLayer: 12 | def __init__(self, conv_param): 13 | self.conv = conv_param 14 | self.w = torch.tensor(self.conv.w, dtype = torch.int64) 15 | 16 | def __call__(self, x: torch.Tensor, downsampling): 17 | if self.conv.icol < x.shape[-1]: # Maxpool. 
Note: Order of Maxpool and BN is IMPORTANT when BN.inc can be negative 18 | assert (self.conv.irow*2, self.conv.icol*2) == tuple(x.shape[2:]) 19 | x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 20 | 21 | if self.conv.type == 'linear': 22 | x = x.flatten(1) 23 | x = F.linear(x, self.w) 24 | x += self.conv.bias 25 | return x 26 | # print('convi', self.conv.n, x[0,0,:,:]) 27 | 28 | x = F.conv2d(x, self.w, bias=None, stride=self.conv.s, padding=self.conv.p) # [N, OCH, OROW, OCOL] 29 | # print('convo', self.conv.n, x[0,0,:,:]) 30 | #if downsampling: # Maxpool 31 | # x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 32 | och = x.shape[1] 33 | if True: 34 | if self.conv.inc is not None: 35 | inc_ch = self.conv.inc.reshape((1, och, 1, 1)) 36 | x *= inc_ch 37 | if hasattr(self.conv, 'bias'): 38 | bias_ch = self.conv.bias.reshape((1, och, 1, 1)) 39 | x += bias_ch 40 | 41 | # print('biaso', self.conv.n, x[0,0,:,:]/2**self.conv.lshift_T) 42 | if hasattr(self.conv, 'lshift'): 43 | x += 1 << self.conv.lshift_T-1 44 | x >>= self.conv.lshift_T 45 | 46 | else: ## no inc/bias quantization 47 | if self.conv.inc is not None: 48 | inc_ch = self.conv.inc_raw.reshape((1, och, 1, 1)) 49 | x *= inc_ch 50 | if hasattr(self.conv, 'bias'): 51 | bias_ch = self.conv.bias_raw.reshape((1, och, 1, 1)) 52 | x += bias_ch 53 | # print('biaso', self.conv.n, x[0,0,:,:]) 54 | x = torch.round(x).to(dtype = torch.int64) 55 | 56 | if hasattr(self.conv, 'obit'): 57 | x.clip_(0, 2**(self.conv.obit)-1) 58 | return x 59 | 60 | class HWModel: 61 | def __init__(self, model_param): 62 | self.layers = [QConvLayer(conv_param) for conv_param in model_param] 63 | 64 | def __call__(self, x): 65 | assert len(x.shape) == 4 and x.dtype == torch.int64 66 | img_size = x.shape[-2:] 67 | 68 | if self.layers[0].conv.abit<8: # ImageInputQ 69 | x=x>>(8-self.layers[0].conv.abit) 70 | 71 | for i, layer in enumerate(self.layers): 72 | x = layer(x, self.layers[i+1].conv.icol
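# --- Editor's sketch (not part of the original simulate_hw.py): the note in
# QConvLayer.__call__ above says the MaxPool/BN order matters. The reason: max()
# commutes with a positive per-channel scale, but not with a negative folded-BN
# scale (conv.inc can be negative), so the integer model must pool at the same
# point as the hardware. Minimal numeric check:
def _demo_maxpool_scale_order():
    import torch
    import torch.nn.functional as F
    x = torch.tensor([[[[1., 4.], [2., 3.]]]])  # 1x1x2x2 feature map
    inc = -1.0                                   # a negative per-channel BN scale
    pool_then_scale = F.max_pool2d(x, 2) * inc   # -> -4
    scale_then_pool = F.max_pool2d(x * inc, 2)   # -> -1
    assert not torch.equal(pool_then_scale, scale_then_pool)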
-------------------------------------------------------------------------------- /dacsdc/export_hls.py: -------------------------------------------------------------------------------- 65 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 66 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ) or isinstance(sub_module, activation_quantize_fn): 67 | print(' Detected ActQ Layer', end='') 68 | if conv_cur is None: conv_cur = ConvParam() 69 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 70 | conv_cur.abit = sub_module.bit 71 | conv_cur.astep = sub_module.step 72 | else: 73 | conv_cur.abit = sub_module.a_bit 74 | conv_cur.astep = 1/2**conv_cur.abit 75 | 76 | conv_cur.actq_class = type(sub_module).__name__ 77 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 78 | 79 | if conv_cnt: # previous.obit = cur.abit 80 | model_param[conv_cnt-1].obit = conv_cur.abit 81 | model_param[conv_cnt-1].ostep = conv_cur.astep 82 | 83 | elif isinstance(sub_module, torch.nn.Conv2d): 84 | if conv_cur is None: conv_cur = ConvParam() 85 | conv_cur.n = conv_cnt 86 | print('Extract conv_%d'%conv_cnt, end='') 87 | 88 | conv_cur.k = sub_module.kernel_size[0] 89 | conv_cur.s = sub_module.stride[0] 90 | conv_cur.p = sub_module.padding[0] 91 | conv_cur.ich = sub_module.in_channels 92 | conv_cur.och = sub_module.out_channels 93 | conv_cur.irow = feature_map_shape[1] 94 | conv_cur.icol = feature_map_shape[2] 95 | 96 | feature_map_shape[0] = sub_module.out_channels 97 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 98 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 99 | conv_cur.orow = feature_map_shape[1] 100 | conv_cur.ocol = feature_map_shape[2] 101 | 102 | if sub_module.bias is not None: 103 | conv_cur.convbias = sub_module.bias.detach().numpy() 104 | print(', +bias', end='') 105 | 106 | if isinstance(sub_module, QuantConv2d): # New quant 107 | conv_cur.wbit = sub_module.bit 108 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 109 | 110 | elif type(sub_module).__name__ == 'Conv2d_Q': # Old dorefa quant 111 | conv_cur.wbit = sub_module.w_bit 112 | conv_cur.wstep = 1/2**(conv_cur.wbit-1) 113 | weight = np.tanh(sub_module.weight.detach().numpy()) 114 | weight = weight / np.max(np.abs(weight)) 115 | n = 2**(conv_cur.wbit-1) 116 | weight_q = weight * n 117 | weight_q = np.clip(np.round(weight_q),-n, n-1) 118 | weight_q = weight_q.astype(np.int32) 119 | conv_cur.w = weight_q 120 | else: 121 | raise NotImplementedError(sub_module) 122 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur))) 123 | 124 | model_param.append(conv_cur) 125 | conv_cur = None 126 | conv_cnt += 1 127 | 128 | elif isinstance(sub_module, torch.nn.BatchNorm2d): 129 | print(' Detected BatchNorm2d') 130 | gamma = sub_module.weight 131 | beta = sub_module.bias 132 | mean = sub_module.running_mean 133 | var = sub_module.running_var 134 | eps = sub_module.eps 135 | 136 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy() 137 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy() 138 | 139 | elif isinstance(sub_module, torch.nn.MaxPool2d): 140 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size 141 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size 142 | model_param[-1].max_pool = True 143 | 144 | if not hasattr(model_param[0], 'abit'): # train code rescaled [0,255] to [0,1) by /256 default 145 | model_param[0].abit = 8 146 | if not hasattr(model_param[0], 'astep'): 147 | model_param[0].astep = 1/256 148 | 149 | return model_param 150 | 151 | def process_batchnorm(model_param): 152 | '''process_batchnorm(model_param) 153 | Merge wstep, astep, ostep scale into batchnorm, then quantize. 154 | 155 | Method: 156 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b, 157 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep. 158 | 159 | outq = (MAC*BN_w + BN_b) / ostep 160 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep 161 | = MACq * inc_raw + bias_raw 162 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq)) 163 | 164 | Quantization of inc_raw & bias_raw: 165 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T 166 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor 167 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!] the 2**(T-1) bias is done by hls code 168 | 169 | Params: 170 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal 171 | MBIT = wbit+abit+ceil(log2(sum_number)) 172 | incbit = len(bit(inc)); biasbit = len(bit(bias)) 173 | larger lshift is better, but MBIT+incbit<48 174 | ''' 175 | lshift = 16 176 | 177 | for conv in model_param[:-1]: 178 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ') 179 | 180 | # Merge step to BN 181 | conv.lshift = lshift 182 | MACstep = conv.wstep * conv.astep 183 | ostep = conv.ostep 184 | inc_raw = conv.bn_w * MACstep / ostep 185 | bias_raw = conv.bn_b / ostep 186 | conv.inc_raw = inc_raw 187 | conv.bias_raw = bias_raw 188 | 189 | # Quantization 190 | T = lshift+conv.wbit+conv.abit-1 191 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64) 192 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64) 193 | conv.lshift_T = T 194 | # Get bitlength 195 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length() 196 | conv.incbit = bitlength(conv.inc) 197 | conv.biasbit = bitlength(conv.bias) 198 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}') 199 | 200 | conv_last = model_param[-1] # process last-layer bias 201 | conv_last.inc = None 202 | conv_last.div = 1/(conv_last.wstep * conv_last.astep) 203 | conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64) 204 | conv_last.bias_raw = conv_last.convbias * conv_last.div 205 | conv_last.biasbit = bitlength(conv_last.bias) 206 | print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}') 207 |
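# --- Editor's sketch (not part of the original export_hls.py): numeric check of
# the rounding identity in the process_batchnorm docstring above, i.e.
# round(MACq*inc_raw + bias_raw) computed as an integer add-then-shift. The
# constants are arbitrary example values:
def _demo_inc_bias_rounding():
    import numpy as np
    T = 16                                    # example shift amount (lshift_T)
    inc_raw, bias_raw, MACq = 0.37, -1.25, 1234
    inc = int(np.round(inc_raw * 2**T))
    bias = int(np.round(bias_raw * 2**T))
    hw = (MACq * inc + bias + (1 << (T - 1))) >> T              # hardware form
    ref = int(np.floor(MACq * inc / 2**T + bias / 2**T + 0.5))  # round-half-up
    assert hw == ref == 455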
208 | def reorder_weight(model_param, layers_simd, layers_pe): 209 | '''reorder_weight(model_param) 210 | Reorder arrays for the HLS code. 211 | ''' 212 | 213 | for conv, simd, pe in zip(model_param, layers_simd, layers_pe): 214 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='') 215 | conv.simd = simd 216 | conv.pe = pe 217 | 218 | # process batchnorm 219 | if conv.inc is not None: 220 | conv.inc = conv.inc.reshape(conv.och//conv.pe, conv.pe).T 221 | if conv.bias is not None: 222 | conv.bias = conv.bias.reshape(conv.och//conv.pe, conv.pe).T 223 | 224 | # process conv weight 225 | w = conv.w # [och, ich, kr, kc] 226 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}" 227 | assert conv.k*conv.ich%simd == 0, f"conv_{conv.n}, ich {conv.ich}, k {conv.k}, simd {conv.simd}" 228 | 229 | # if conv.n==0: # first layer is different 230 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich] 231 | # else: 232 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich] 233 | 234 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*conv.ich//simd, simd) 235 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd] 236 | w = w.reshape(conv.pe, conv.k, -1, simd) # hls format [pe, k, och/pe*k*ich/simd, simd] 237 | 238 | if conv.k == 1: # kernel size=1 239 | w = w.reshape(conv.pe, -1, simd) 240 | print(' ->', w.shape) 241 | 242 | conv.w = w 243 |
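# --- Editor's sketch (not part of the original export_hls.py): shape-only
# walkthrough of the reorder above for an example k=3 layer with och=64, ich=64,
# simd=8, pe=4 (sizes are illustrative):
def _demo_reorder_shapes():
    import numpy as np
    och, ich, k, simd, pe = 64, 64, 3, 8, 4
    w = np.zeros((och, ich, k, k), dtype=np.int8)  # [och, ich, kr, kc]
    w = w.transpose(0, 3, 2, 1)                    # [och, kc, kr, ich]
    w = w.reshape(och//pe, pe, k, k*ich//simd, simd)
    w = w.transpose(1, 2, 0, 3, 4)                 # [pe, k, och/pe, k*ich/simd, simd]
    w = w.reshape(pe, k, -1, simd)                 # [pe, k, och/pe*k*ich/simd, simd]
    assert w.shape == (4, 3, 384, 8)               # 384 = (64/4)*(3*64/8)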
244 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0): 245 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop: 246 | print(str_func(arr), file=file, end='') 247 | return 248 | ends = '' if (len(arr.shape)==stop+1) else '\n' 249 | print('{', file=file, end='') 250 | for i, item in enumerate(arr): 251 | print_ndarray_recursion(item, str_func, file, stop) 252 | if i!=len(arr)-1: print(',', file=file, end=ends) 253 | print(ends+'}', file=file, end='') 254 | 255 | def write_hls_weights(model_param, path): 256 | '''write_hls_weights(model_param, path) 257 | Write hls weights+inc+bias array code according to numpy shape. 258 | ''' 259 | f = open(path + 'weights.hpp', 'w') 260 | 261 | print(f'''/******************************************************************************** 262 | * Filename: weights.hpp 263 | * Date: {time.ctime()} 264 | * Description: This file is generated by {parser.prog} 265 | * ptfilename: {opt.weight} 266 | ********************************************************************************/ 267 | 268 | #ifndef _WEIGHTS_HPP_ 269 | #define _WEIGHTS_HPP_ 270 | #include <ap_int.h> 271 | ''', file=f) 272 | 273 | for conv in model_param: 274 | n = conv.n 275 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, wbit {conv.wbit}") 276 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, wbit: {conv.wbit}", file=f) 277 | 278 | # print conv weight, merge [SIMD] value into one ap_uint 279 | if conv.k>1: 280 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f) 281 | else: 282 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f) 283 | hex_str = lambda x: '"' + hex(x) + '"' 284 | def pack1d_str(arr): # arr: 1d-array 285 | x = 0 286 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation 287 | v = int(v) # use python bignumber, not np.int 288 | assert -1< conv_{n}_inc[{conv.pe}][{conv.och//conv.pe}]=", file=f) 297 | print_ndarray_recursion(conv.inc, hex_str, f) 298 | print(';', file=f) 299 | if conv.bias is not None: 300 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.pe}][{conv.och//conv.pe}]=", file=f) 301 | print_ndarray_recursion(conv.bias, hex_str, f) 302 | print(';', file=f) 303 | 304 | print('#endif', file=f) 305 | f.close() 306 | 307 | def adjust_weight(model_param): 308 | special_wa_bit = ((4,2),(5,3),(5,4),(5,5),(5,6),(5,7),(5,8),(7,2),(7,3)) 309 | # These packings can't quantize to -2**(wbit-1) 310 | for conv in model_param: 311 | if (conv.wbit, conv.abit) in special_wa_bit: 312 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}') 313 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1) 314 | 315 | if __name__=='__main__': 316 | parser = argparse.ArgumentParser() 317 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/') 318 | parser.add_argument('-m', '--model', default='UltraNet_FixQ', help = 'model class name in mymodel.py') 319 | parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe', help = '.txt file in ./hls/') 320 | opt = parser.parse_args() 321 | if opt.weight is None: opt.weight = select_weight_file() 322 | 323 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1) 324 | dir_output = 'hls/' + opt.weight + '/' 325 | if not os.path.exists(dir_output): os.makedirs(dir_output) 326 | 327 | # load model and state_dict 328 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 329 | model = getattr(mymodel, opt.model)(**ptfile.setdefault('model_params', {})) 330 | model.load_state_dict(ptfile['model']) 331 | 332 | # process 333 | model_param = extract_model([1, 160, 320]) 334 | adjust_weight(model_param) 335 | process_batchnorm(model_param) # get bn param before write hls config 336 | torch.save(model_param, dir_output + 'model_param.pkl') 337 | 338 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1]) # get pe, simd param before write hls config 339 | write_hls_config(model_param, dir_output) 340 | write_hls_weights(model_param, dir_output) 341 |
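# --- Editor's sketch (not part of the original repo): a stand-alone version of
# the reversed SIMD packing that pack1d_str above writes into weights.hpp.
# arr[0] ends up in the least-significant wbit bits of the packed literal (two's
# complement per element); wbit=4 and the sample values are assumed examples:
def _demo_simd_pack(arr=(-1, 2, -3, 4), wbit=4):
    x = 0
    for v in reversed(arr):                  # reverse pack: last value -> high bits
        x = (x << wbit) | (int(v) & ((1 << wbit) - 1))
    return hex(x)                            # (-1, 2, -3, 4) -> '0x4d2f'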
-------------------------------------------------------------------------------- /dacsdc/export_hls_skynet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from typing import Dict, List 4 | import torch 5 | import numpy as np 6 | import sys 7 | import os 8 | 9 | import sys 10 | sys.path.append('..') 11 | import mymodel 12 | from utils.view_pt import select_weight_file 13 | from quant_dorefa import activation_quantize_fn 14 | from anypacking.quant_module import HWGQ, QuantConv2d, ImageInputQ 15 | 16 | class ConvParam: ... 17 | 18 | def write_hls_config(model_param, path): 19 | name_mapping = { 20 | 'k': 'K', 21 | #'s': 'S', 22 | #'p': 'P', 23 | 'ich': 'IFM_CH', 24 | 'irow': 'IFM_ROW', 25 | 'icol': 'IFM_COL', 26 | 'och': 'OFM_CH', 27 | 'orow': 'OFM_ROW', 28 | 'ocol': 'OFM_COL', 29 | 'abit': 'IN_BIT', 30 | 'wbit': 'W_BIT', 31 | 'incbit': 'INC_BIT', 32 | 'biasbit': 'BIAS_BIT', 33 | 'simd': 'SIMD', 34 | 'pe': 'PE', 35 | 'lshift': 'L_SHIFT' 36 | } 37 | content = f'''/******************************************************************************** 38 | * Filename: config.h 39 | * Date: {time.ctime()} 40 | * Description: This file is generated by {parser.prog} 41 | * ptfilename: {opt.weight} 42 | ********************************************************************************/ 43 | 44 | #ifndef _CONFIG_H_ 45 | #define _CONFIG_H_ 46 | 47 | ''' 48 | for n, conv_param in enumerate(model_param): 49 | content += f'// conv_{n}\n' 50 | for k, v in name_mapping.items(): 51 | if hasattr(conv_param, k): # e.g. conv_last has no incbit 52 | content += f'#define CONV_{n}_{v} {getattr(conv_param, k)}\n' 53 | content += '\n' 54 | content += '#endif' 55 | 56 | with open(path + 'config.h', 'w') as f: 57 | print(content, file=f) 58 | 59 | def extract_model(in_shape): 60 | model_param: List[ConvParam] = [] 61 | feature_map_shape = in_shape 62 | conv_cnt = 0 63 | conv_cur = None 64 | for sub_module in model.modules(): 65 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 66 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ) or isinstance(sub_module, activation_quantize_fn): 67 | print(' Detected ActQ Layer', end='') 68 | if conv_cur is None: conv_cur = ConvParam() 69 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 70 | conv_cur.abit = sub_module.bit 71 | conv_cur.astep = sub_module.step 72 | else: 73 | conv_cur.abit = sub_module.a_bit 74 | conv_cur.astep = 1/2**conv_cur.abit 75 | 76 | conv_cur.actq_class = type(sub_module).__name__ 77 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 78 | 79 | if conv_cnt: # previous.obit = cur.abit 80 | model_param[conv_cnt-1].obit = conv_cur.abit 81 | model_param[conv_cnt-1].ostep = conv_cur.astep 82 | 83 | elif isinstance(sub_module, torch.nn.Conv2d): 84 | if conv_cur is None: conv_cur = ConvParam() 85 | conv_cur.n = conv_cnt 86 | print('Extract conv_%d'%conv_cnt, end='') 87 | 88 | conv_cur.k = sub_module.kernel_size[0] 89 | conv_cur.s = sub_module.stride[0] 90 | conv_cur.p = sub_module.padding[0] 91 | conv_cur.ich = sub_module.in_channels 92 | conv_cur.och = sub_module.out_channels 93 | conv_cur.groups = sub_module.groups if hasattr(sub_module, 'groups') else 1 94 | conv_cur.irow = feature_map_shape[1] 95 | conv_cur.icol = feature_map_shape[2] 96 | 97 | feature_map_shape[0] = sub_module.out_channels 98 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - 
sub_module.kernel_size[0]) // sub_module.stride[0] + 1 99 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 100 | conv_cur.orow = feature_map_shape[1] 101 | conv_cur.ocol = feature_map_shape[2] 102 | 103 | if sub_module.bias is not None: 104 | conv_cur.convbias = sub_module.bias.detach().numpy() 105 | print(', +bias', end='') 106 | 107 | if isinstance(sub_module, QuantConv2d): # New quant 108 | conv_cur.wbit = sub_module.bit 109 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 110 | 111 | elif type(sub_module).__name__ == 'Conv2d_Q': # Old dorefa quant 112 | conv_cur.wbit = sub_module.w_bit 113 | conv_cur.wstep = 1/2**(conv_cur.wbit-1) 114 | weight = np.tanh(sub_module.weight.detach().numpy()) 115 | weight = weight / np.max(np.abs(weight)) 116 | n = 2**(conv_cur.wbit-1) 117 | weight_q = weight * n 118 | weight_q = np.clip(np.round(weight_q),-n, n-1) 119 | weight_q = weight_q.astype(np.int32) 120 | conv_cur.w = weight_q 121 | else: 122 | raise NotImplementedError(sub_module) 123 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}, g {groups}'.format(**vars(conv_cur))) 124 | 125 | model_param.append(conv_cur) 126 | conv_cur = None 127 | conv_cnt += 1 128 | 129 | elif isinstance(sub_module, torch.nn.BatchNorm2d): 130 | print(' Detected BatchNorm2d') 131 | gamma = sub_module.weight 132 | beta = sub_module.bias 133 | mean = sub_module.running_mean 134 | var = sub_module.running_var 135 | eps = sub_module.eps 136 | 137 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy() 138 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy() 139 | 140 | elif isinstance(sub_module, torch.nn.MaxPool2d): 141 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size 142 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size 143 | model_param[-1].max_pool = True 144 | 145 | if not hasattr(model_param[0], 'abit'): # train code rescaled [0,255] to [0,1) by /256 default 146 | model_param[0].abit = 8 147 | if not hasattr(model_param[0], 'astep'): 148 | model_param[0].astep = 1/256 149 | 150 | return model_param 151 | 152 | def process_batchnorm(model_param): 153 | '''process_batchnorm(model_param) 154 | Merge wstep, astep, ostep scale into batchnorm, then quantize. 155 | 156 | Method: 157 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b, 158 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep. 159 | 160 | outq = (MAC*BN_w + BN_b) / ostep 161 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep 162 | = MACq * inc_raw + bias_raw 163 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq)) 164 | 165 | Quantization of inc_raw & bias_raw: 166 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T 167 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor 168 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!]
the 2**(T-1) bias is done by hls code 169 | 170 | Params: 171 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal 172 | MBIT = wbit+abit+ceil(log2(sum_number)) 173 | incbit = len(bit(inc)); biasbit = len(bit(bias)) 174 | larger lshift is better, but MBIT+incbit<48 175 | ''' 176 | lshift = 6 177 | 178 | for conv in model_param[:-1]: 179 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ') 180 | 181 | # Merge step to BN 182 | conv.lshift = lshift 183 | MACstep = conv.wstep * conv.astep 184 | ostep = conv.ostep 185 | inc_raw = conv.bn_w * MACstep / ostep 186 | bias_raw = conv.bn_b / ostep 187 | conv.inc_raw = inc_raw 188 | conv.bias_raw = bias_raw 189 | 190 | # Quantization 191 | T = lshift+conv.wbit+conv.abit-1 192 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64) 193 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64) 194 | conv.lshift_T = T 195 | # Get bitlength 196 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length() 197 | conv.incbit = bitlength(conv.inc) 198 | conv.biasbit = bitlength(conv.bias) 199 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}') 200 | 201 | conv_last = model_param[-1] # process last-layer bias 202 | conv_last.inc = None 203 | conv_last.div = 1/(conv_last.wstep * conv_last.astep) 204 | #conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64) 205 | #conv_last.bias_raw = conv_last.convbias * conv_last.div 206 | #conv_last.biasbit = bitlength(conv_last.bias) 207 | #print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}') 208 | 209 | def reorder_weight(model_param, layers_simd, layers_pe, layers_actp, layers_pep): 210 | '''reorder_weight(model_param) 211 | Reorder arrays for the HLS code. 212 | ''' 213 | 214 | for conv, simd, pe, actp, pep in zip(model_param, layers_simd, layers_pe, layers_actp, layers_pep): 215 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='') 216 | conv.simd = simd 217 | conv.pe = pe 218 | conv.actp = actp 219 | conv.pep = pep 220 | 221 | # process batchnorm 222 | if conv.inc is not None: 223 | conv.inc = conv.inc.reshape(conv.och//conv.actp, conv.actp).T 224 | if hasattr(conv, 'bias') and conv.bias is not None: 225 | conv.bias = conv.bias.reshape(conv.och//conv.actp, conv.actp).T 226 | 227 | # process conv weight 228 | if conv.k == 1: 229 | w = conv.w # [och, ich, kr, kc] 230 | g_ich = w.shape[1] 231 | assert conv.och%(conv.pe * conv.pep) == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}, pep {conv.pep}" 232 | assert g_ich%simd == 0, f"conv_{conv.n}, ich {g_ich}, simd {conv.simd}" 233 | 234 | w = w.reshape(conv.och//(conv.pe * conv.pep), conv.pe, conv.pep, g_ich//simd, simd) # [och / (pe * pep), pe, pep, ich / simd, simd] 235 | w = w.transpose(1,0,3,4,2) #[pe, och / (pe * pep), ich / simd, simd, pep] 236 | w = w.reshape(conv.pe, -1, g_ich//simd, simd*conv.pep) # [pe, och / (pe * pep), ich / simd, simd * pep] 237 | w = w.reshape(conv.pe, -1, simd*conv.pep) # hls format [pe, och/(pe * pep) * ich/simd, simd * pep] 238 | else: 239 | w = conv.w # [och, ich, kr, kc] 240 | g_ich = w.shape[1] 241 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}" 242 | assert conv.k*g_ich%simd == 0, f"conv_{conv.n}, ich {g_ich}, k {conv.k}, simd {conv.simd}" 243 | 244 | # if conv.n==0: # first layer is different 245 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich] 246 | # else: 247 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich] 248 | 249 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*g_ich//simd, simd) 250 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd] 251 | w = w.reshape(conv.pe, conv.k, -1, simd) # hls format [pe, k, och/pe*k*ich/simd, simd] 252 | 253 | print(' ->', w.shape) 254 | 255 | conv.w = w 256 |
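# --- Editor's sketch (not part of the original export_hls_skynet.py): shape-only
# check of the k=1 branch above, which additionally folds pep output channels
# into each SIMD word (och=48, ich=96, simd=8, pe=3, pep=2 are illustrative sizes):
def _demo_reorder_shapes_1x1():
    import numpy as np
    och, ich, simd, pe, pep = 48, 96, 8, 3, 2
    w = np.zeros((och, ich, 1, 1), dtype=np.int8)
    w = w.reshape(och//(pe*pep), pe, pep, ich//simd, simd)
    w = w.transpose(1, 0, 3, 4, 2)        # [pe, och/(pe*pep), ich/simd, simd, pep]
    w = w.reshape(pe, -1, ich//simd, simd*pep)
    w = w.reshape(pe, -1, simd*pep)       # [pe, och/(pe*pep)*ich/simd, simd*pep]
    assert w.shape == (3, 96, 16)         # 96 = (48/6)*(96/8)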
257 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0): 258 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop: 259 | print(str_func(arr), file=file, end='') 260 | return 261 | ends = '' if (len(arr.shape)==stop+1) else '\n' 262 | print('{', file=file, end='') 263 | for i, item in enumerate(arr): 264 | print_ndarray_recursion(item, str_func, file, stop) 265 | if i!=len(arr)-1: print(',', file=file, end=ends) 266 | print(ends+'}', file=file, end='') 267 | 268 | def write_hls_weights(model_param, path): 269 | '''write_hls_weights(model_param, path) 270 | Write hls weights+inc+bias array code according to numpy shape. 271 | ''' 272 | f = open(path + 'weights.hpp', 'w') 273 | 274 | print(f'''/******************************************************************************** 275 | * Filename: weights.hpp 276 | * Date: {time.ctime()} 277 | * Description: This file is generated by {parser.prog} 278 | * ptfilename: {opt.weight} 279 | ********************************************************************************/ 280 | 281 | #ifndef _WEIGHTS_HPP_ 282 | #define _WEIGHTS_HPP_ 283 | #include <ap_int.h> 284 | ''', file=f) 285 | 286 | for conv in model_param: 287 | n = conv.n 288 | def pack1d_str(arr): # arr: 1d-array 289 | x = 0 290 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation 291 | v = int(v) # use python bignumber, not np.int 292 | assert -1< conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f) 302 | hex_str = lambda x: '"' + hex(x) + '"' 303 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1) 304 | print(';', file=f) 305 | else: 306 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, actp {conv.actp}, wbit {conv.wbit}") 307 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, ACTP: {conv.actp}, wbit: {conv.wbit}", file=f) 308 | 309 | # print conv weight, merge [SIMD] value into one ap_uint 310 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f) 311 | hex_str = lambda x: '"' + hex(x) + '"' 312 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1) 313 | print(';', file=f) 314 | 315 | # print inc, bias 316 | if conv.inc is not None: 317 | print(f"const ap_int<{conv.incbit}> conv_{n}_inc[{conv.actp}][{conv.och//conv.actp}]=", file=f) 318 | print_ndarray_recursion(conv.inc, hex_str, f) 319 | print(';', file=f) 320 | if hasattr(conv, 'bias') and conv.bias is not None: 321 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.actp}][{conv.och//conv.actp}]=", file=f) 322 | print_ndarray_recursion(conv.bias, hex_str, f) 323 | print(';', file=f) 324 | 325 | print('#endif', file=f) 326 | f.close() 327 | 328 | def adjust_weight(model_param): 329 | special_wa_bit = ((4,2),(5,3),(5,4),(5,5),(5,6),(5,7),(5,8),(7,2),(7,3)) 330 | # These packings can't quantize to -2**(wbit-1) 331 | for conv in model_param: 332 | if (conv.wbit, conv.abit) in special_wa_bit: 333 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}') 334 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1) 335 | 336 | if __name__=='__main__': 337 | parser = argparse.ArgumentParser() 338 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/') 339 | parser.add_argument('-m', '--model', default='SkyNet_FixQ', help = 'model class name in mymodel.py') 340 |
parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe_skynet', help = '.txt file in ./hls/') 341 | opt = parser.parse_args() 342 | if opt.weight is None: opt.weight = select_weight_file() 343 | 344 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1) 345 | dir_output = 'hls/' + opt.weight + '/' 346 | if not os.path.exists(dir_output): os.makedirs(dir_output) 347 | 348 | # load model and state_dict 349 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 350 | model = getattr(mymodel, opt.model)(**ptfile.setdefault('model_params', {})) 351 | model.load_state_dict(ptfile['model']) 352 | 353 | # process 354 | model_param = extract_model([1, 160, 320]) 355 | # adjust_weight(model_param) 356 | process_batchnorm(model_param) # get bn param before write hls config 357 | torch.save(model_param, dir_output + 'model_param.pkl') 358 | 359 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1], simd_pe[:,2], simd_pe[:,3]) # get pe, simd, actp, pep param before write hls config 360 | write_hls_config(model_param, dir_output) 361 | write_hls_weights(model_param, dir_output) 362 | -------------------------------------------------------------------------------- /dacsdc/hls/config_simd_pe.txt: -------------------------------------------------------------------------------- 1 | simd pe 2 | 3 16 3 | 16 4 4 | 8 8 5 | 8 4 6 | 4 2 7 | 4 2 8 | 4 2 9 | 4 2 10 | 4 2 -------------------------------------------------------------------------------- /dacsdc/main_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch.distributed as dist 4 | import torch.optim as optim 5 | import torch.optim.lr_scheduler as lr_scheduler 6 | import time 7 | 8 | import sys 9 | sys.path.append('..') 10 | import localconfig 11 | import test 12 | from datasets import * 13 | from yolo_utils import * 14 | 15 | from mymodel import * 16 | import mymodel 17 | 18 | wdir = 'weights' + os.sep # weights dir 19 | 20 | # Hyperparameters (results68: 59.9 mAP@0.5 yolov3-spp-416) https://github.com/ultralytics/yolov3/issues/310 21 | 22 | hyp = {'giou': 3.54, # giou loss gain 23 | 'cls': 37.4, # cls loss gain 24 | 'cls_pw': 1.0, # cls BCELoss positive_weight 25 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 26 | 'obj_pw': 1.0, # obj BCELoss positive_weight 27 | 'iou_t': 0.225, # iou training threshold 28 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 29 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 30 | 'momentum': 0.937, # SGD momentum 31 | 'weight_decay': 0.000484, # optimizer weight decay 32 | 'fl_gamma': 0.5, # focal loss gamma 33 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 34 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 35 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 36 | 'degrees': 1.98, # image rotation (+/- deg) 37 | 'translate': 0.05, # image translation (+/- fraction) 38 | 'scale': 0.05, # image scale (+/- gain) 39 | 'shear': 0.641} # image shear (+/- deg) 40 | 41 | # Overwrite hyp with hyp*.txt (optional) 42 | f = glob.glob('hyp*.txt') 43 | if f: 44 | print('Using %s' % f[0]) 45 | for k, v in zip(hyp.keys(), np.loadtxt(f[0])): 46 | hyp[k] = v 47 | 48 | def train(): 49 | img_size, img_size_test = opt.img_size if len(opt.img_size) == 2 else opt.img_size * 2 # train, test sizes 50 | epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs 51 | batch_size = opt.batch_size 52 |
accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 53 | weights = opt.weights # initial training weights 54 | 55 | # Initialize 56 | init_seeds() 57 | 58 | # Configure run 59 | train_path = localconfig.train_path 60 | test_path = localconfig.test_path 61 | nc = 1 62 | 63 | results_file = 'results/%s.txt'%opt.name 64 | 65 | # Initialize model 66 | if opt.model != '': 67 | model = getattr(mymodel, opt.model)(opt.bitw, opt.bita).to(device) 68 | else: 69 | if opt.bypass: 70 | model = UltraNetBypass_FixQ(opt.bitw, opt.bita).to(device) 71 | else: 72 | model = UltraNet_FixQ(opt.bitw, opt.bita).to(device) 73 | 74 | # Optimizer 75 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 76 | for k, v in dict(model.named_parameters()).items(): 77 | if '.bias' in k: 78 | pg2 += [v] # biases 79 | elif 'Conv2d.weight' in k: 80 | pg1 += [v] # apply weight_decay 81 | else: 82 | pg0 += [v] # all else 83 | 84 | if opt.adam: 85 | # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) 86 | optimizer = optim.Adam(pg0, lr=hyp['lr0']) 87 | else: 88 | optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) 89 | optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay 90 | optimizer.add_param_group({'params': pg2}) # add pg2 (biases) 91 | optimizer.param_groups[2]['lr'] *= 2.0 # bias lr 92 | 93 | del pg0, pg1, pg2 94 | 95 | start_epoch = 0 96 | test_best_iou = 0.0 97 | 98 | # load weights 99 | if weights.endswith('.pt'): # pytorch format 100 | # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 101 | chkpt = torch.load(weights, map_location=device) 102 | 103 | # load model 104 | try: 105 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} 106 | model.load_state_dict(chkpt['model'], strict=False) 107 | except KeyError as e: 108 | s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. 
" % (opt.weights, opt.cfg, opt.weights) 109 | raise KeyError(s) from e 110 | 111 | if opt.resume: 112 | # load optimizer 113 | if chkpt['optimizer'] is not None: 114 | optimizer.load_state_dict(chkpt['optimizer']) 115 | best_fitness = chkpt['best_fitness'] 116 | 117 | # load results 118 | if chkpt.get('training_results') is not None: 119 | with open(results_file, 'w') as file: 120 | file.write(chkpt['training_results']) # write results.txt 121 | 122 | start_epoch = chkpt['epoch'] + 1 123 | 124 | del chkpt 125 | 126 | # Scheduler https://github.com/ultralytics/yolov3/issues/238 127 | lf = lambda x: (1 + math.cos(x * math.pi / epochs)) / 2 * 0.999 + 0.001 # cosine https://arxiv.org/pdf/1812.01187.pdf 128 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) 129 | scheduler.last_epoch = start_epoch 130 | 131 | # Initialize distributed training 132 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 133 | dist.init_process_group(backend='nccl', # 'distributed backend' 134 | init_method='tcp://127.0.0.1:5000', # distributed training init method 135 | world_size=1, # number of nodes for distributed training 136 | rank=0) # distributed training node rank 137 | model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 138 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 139 | 140 | # Dataloader 141 | #batch_size = min(batch_size, len(dataset)) 142 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 143 | 144 | # Testloader 145 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 146 | hyp=hyp, 147 | rect=False, 148 | cache_images=opt.cache_images, 149 | single_cls=opt.single_cls) 150 | testloader = torch.utils.data.DataLoader(testset, 151 | batch_size=batch_size, 152 | num_workers=0, 153 | pin_memory=True, 154 | collate_fn=testset.collate_fn) 155 | 156 | test.test(batch_size=batch_size, 157 | img_size=img_size_test, 158 | model=model, 159 | dataloader=testloader) # make forward 160 | bops, bita, bitw, dsps, brams = model.fetch_arch_info() 161 | print('model with bops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M, bram: {:.3f}K'.format(bops, bita, bitw, dsps, brams)) 162 | 163 | # Dataset 164 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 165 | augment=True, 166 | hyp=hyp, # augmentation hyperparameters 167 | rect=opt.rect, # rectangular training 168 | cache_images=opt.cache_images, 169 | single_cls=opt.single_cls) 170 | 171 | dataloader = torch.utils.data.DataLoader(dataset, 172 | batch_size=batch_size, 173 | num_workers=nw, 174 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 175 | pin_memory=True, 176 | collate_fn=dataset.collate_fn) 177 | 178 | # Start training 179 | nb = len(dataloader) 180 | prebias = start_epoch == 0 181 | model.nc = nc # attach number of classes to model 182 | model.arc = opt.arc # attach yolo architecture 183 | model.hyp = hyp # attach hyperparameters to model 184 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 185 | # torch.autograd.set_detect_anomaly(True) 186 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 187 | t0 = time.time() 188 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 189 | print('Using %g dataloader workers' % nw) 190 | print('Starting training for %g epochs...' 
% epochs) 191 | 192 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 193 | model.train() 194 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 195 | 196 | # Prebias 197 | if prebias: 198 | ne = max(round(30 / nb), 3) # number of prebias epochs 199 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 200 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 201 | if epoch == ne: 202 | # print_model_biases(model) 203 | prebias = False 204 | 205 | # Bias optimizer settings 206 | optimizer.param_groups[2]['lr'] = ps[0] 207 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 208 | optimizer.param_groups[2]['momentum'] = ps[1] 209 | 210 | mloss = torch.zeros(4).to(device) # mean losses 211 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'iouloss', 'objloss', 'triou', 'mloss', 'targets', 'img_size')) 212 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 213 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 214 | ni = i + nb * epoch # number integrated batches (since train start) 215 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 216 | targets = targets.to(device) 217 | 218 | # Run model 219 | pred = model(imgs) 220 | 221 | # Compute loss 222 | loss, loss_items = compute_loss(pred, targets, model) 223 | if not torch.isfinite(loss): 224 | print('WARNING: non-finite loss, ending training ', loss_items) 225 | return results 226 | 227 | # Scale loss by nominal batch_size of 64 228 | loss *= batch_size / 64 229 | 230 | loss.backward() 231 | 232 | # Optimize accumulated gradient 233 | if ni % accumulate == 0: 234 | optimizer.step() 235 | optimizer.zero_grad() 236 | # Print batch results 237 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 238 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 239 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 240 | pbar.set_description(s) 241 | 242 | # end batch ------------------------------------------------------------------------------------------------ 243 | 244 | # Update scheduler 245 | scheduler.step() 246 | 247 | train_iou = mloss[2] 248 | 249 | # Process epoch results 250 | final_epoch = epoch + 1 == epochs 251 | if not opt.notest or final_epoch: # Calculate mAP 252 | results = test.test(batch_size=batch_size, 253 | img_size=img_size_test, 254 | model=model, 255 | dataloader=testloader) 256 | 257 | # Write epoch results 258 | with open(results_file, 'a') as f: 259 | f.write(s + '%10.3g' * len(results) % results + '\n') # test_losses=(iou, loss_sum, lobj, lcls) 260 | 261 | # Update best mAP 262 | results = torch.tensor(results, device = 'cpu') 263 | 264 | test_iou = results[0] 265 | if test_iou > test_best_iou: 266 | test_best_iou = test_iou 267 | 268 | # Save training results 269 | save = (not opt.nosave) or (final_epoch) 270 | if save: 271 | with open(results_file, 'r') as f: 272 | # Create checkpoint 273 | chkpt = {'epoch': epoch, 274 | 'training_results': f.read(), 275 | 'model': model.module.state_dict() if type( 276 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 277 | 'optimizer': None if final_epoch else optimizer.state_dict(), 278 | 'model_params':model.model_params, # arch param 279 | 'extra': 
{'time': time.ctime(), 'name': opt.name}} 280 | 281 | # Save last checkpoint 282 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 283 | 284 | if test_iou == test_best_iou: 285 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 286 | 287 | # Delete checkpoint 288 | del chkpt 289 | 290 | # end epoch ---------------------------------------------------------------------------------------------------- 291 | 292 | # end training 293 | n = opt.name 294 | 295 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 296 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 297 | torch.cuda.empty_cache() 298 | 299 | with open('results.csv', 'a') as f: 300 | print("fixed,%s,%d/%d, , ,%s,%s,%.1f,%.1f, , , ,%d, ,%.3f, "% 301 | (opt.name,epochs-1,epochs,opt.bitw,opt.bita,train_iou*100,(test_iou+test_best_iou)*50, 302 | int(round(bops)), dsps), file=f) 303 | 304 | return results 305 | 306 | 307 | if __name__ == '__main__': 308 | parser = argparse.ArgumentParser() 309 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 310 | parser.add_argument('--epochs', type=int, default=200) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 311 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 312 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 313 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 314 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 315 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 316 | parser.add_argument('--rect', action='store_true', help='rectangular training') 317 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 318 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 319 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 320 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 321 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 322 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 323 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 324 | parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1 or cpu)') 325 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 326 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 327 | parser.add_argument('--mixm', type=str) 328 | parser.add_argument('--bitw', type=str, default='') 329 | parser.add_argument('--bita', type=str, default='') 330 | parser.add_argument('--var', type=float, help='debug variable') 331 | parser.add_argument('--model', type=str, default='', help='use specific model') 332 | 333 | opt = parser.parse_args() 334 | 335 | if opt.mixm is not None: 336 | wmix = torch.load('weights/%s.pt'%opt.mixm) 337 | opt.bitw = wmix['extra']['bestw'] 338 | opt.bita = wmix['extra']['besta'] 339 | del wmix 340 | last = wdir + 'last_%s.pt'%opt.name 341 | opt.weights = last if opt.resume else opt.weights 342 | print(opt) 343 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 344 | 345 | train() # train normally 346 | -------------------------------------------------------------------------------- /dacsdc/pareto_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | cds = { 5 | 'cd':['3e-5', '6e-5', '1e-4', '2e-4', '3e-4'], 6 | 'cdt':['1e-5', '2e-5', '3e-5', '6e-5', '1e-4'], 7 | } 8 | 9 | def search_train(): 10 | for cd in cds[opt.arg]: 11 | name = '%d_%s_'%(opt.it, opt.arg)+cd.replace('-','').replace('.','') 12 | os.system('python search_train.py --name %s --cd %s'%('f'+name, cd)) 13 | 14 | def main_train(): 15 | for cd in cds[opt.arg]: 16 | name = '%d_%s_'%(opt.it, opt.arg)+cd.replace('-','').replace('.','') 17 | os.system('python main_train.py --name %s --mixm %s'%('x'+name, 'f'+name+'_last')) 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--search', action='store_true') 21 | parser.add_argument('--main', action='store_true') 22 | parser.add_argument('--it', type=int) 23 | parser.add_argument('--arg', type=str) 24 | opt = parser.parse_args() 25 | 26 | if opt.search: 27 | search_train() 28 | 29 | if opt.main: 30 | main_train() 31 | -------------------------------------------------------------------------------- /dacsdc/quant_dorefa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | # Modified the quantization of the weight data 7 | def uniform_quantize(k): 8 | class qfn(torch.autograd.Function): 9 | 10 | @staticmethod 11 | def forward(ctx, input): 12 | if k == 32: 13 | out = input 14 | # elif k == 1: 15 | # out = torch.sign(input) 16 | else: 17 | # cang 18 | n = float(2 ** k) 19 | out = torch.round(input * n).clamp_(-n, n-1) / n 20 | # normal 21 | # n = float(2 ** k - 1) 22 | # out = torch.round(input * n) / n 23 | 24 | return out 25 | 26 | @staticmethod 27 | def backward(ctx, grad_output): 28 | grad_input = grad_output.clone() 29 | return grad_input 30 | 31 | return qfn().apply 32 | 33 |
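# --- Editor's sketch (not part of the original quant_dorefa.py): numeric check
# of the signed ("cang") branch above for k=3. Inputs in [-1, 1) land on
# multiples of 1/8 in [-1, 7/8]; weight_quantize_fn below calls this with
# k = w_bit - 1 because the sign takes one bit:
def _demo_uniform_quantize_signed():
    import torch
    q = uniform_quantize(k=3)
    x = torch.tensor([-1.0, -0.3, 0.0, 0.29, 0.99])
    y = q(x)
    assert torch.allclose(y, torch.tensor([-1.0, -0.25, 0.0, 0.25, 0.875]))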
34 | class weight_quantize_fn(nn.Module): 35 | def __init__(self, w_bit): 36 | super(weight_quantize_fn, self).__init__() 37 | assert w_bit <= 8 or w_bit == 32 38 | self.w_bit = w_bit 39 | # the sign bit takes one bit 40 | self.uniform_q = uniform_quantize(k=w_bit - 1) 41 | 42 | def forward(self, x): 43 | # print('===================') 44 | if self.w_bit == 32: 45 | # weight_q = x 46 | weight = torch.tanh(x) 47 | # weight = weight / 2 / torch.max(torch.abs(weight)) + 0.5 48 | # weight_q = 2 * self.uniform_q(weight) - 1 49 | weight_q = weight / torch.max(torch.abs(weight)) 50 | elif self.w_bit == 1: 51 | E = torch.mean(torch.abs(x)).detach() 52 | weight_q = (self.uniform_q(x / E) + 1) / 2 * E 53 | else: 54 | weight = torch.tanh(x) 55 | # weight = weight / 2 / torch.max(torch.abs(weight)) + 0.5 56 | # weight_q = 2 * self.uniform_q(weight) - 1 57 | weight = weight / torch.max(torch.abs(weight)) 58 | # quantize to signed k-bit values 59 | weight_q = self.uniform_q(weight) 60 | return weight_q 61 | 62 | 63 | class activation_quantize_fn(nn.Module): 64 | def __init__(self, a_bit): 65 | super(activation_quantize_fn, self).__init__() 66 | assert a_bit <= 8 or a_bit == 32 67 | self.a_bit = a_bit 68 | self.uniform_q = uniform_quantize(k=a_bit) 69 | 70 | def forward(self, x): 71 | if self.a_bit == 32: 72 | activation_q = torch.clamp(x, 0, 6) 73 | else: 74 | activation_q = self.uniform_q(torch.clamp(x, 0, 1)) 75 | # print(np.unique(activation_q.detach().numpy())) 76 | return activation_q 77 | 78 | class ActQuant_PACT(nn.Module): 79 | def __init__(self, act_bit=4, scale_coef=1.0): 80 | super(ActQuant_PACT, self).__init__() 81 | self.act_bit=act_bit 82 | self.scale_coef = nn.Parameter(torch.ones(1)*scale_coef) 83 | 84 | self.uniform_q = uniform_quantize(k=act_bit) 85 | 86 | # self.uniform_q = uniform_quantize(k=act_bit) 87 | 88 | def forward(self, x): 89 | if self.act_bit==32: 90 | out=0.5*(x.abs() - (x-self.scale_coef.abs()).abs()+self.scale_coef.abs())/self.scale_coef.abs() 91 | else: 92 | out = 0.5*(x.abs() - (x-self.scale_coef.abs()).abs()+self.scale_coef.abs()) 93 | activation_q = self.uniform_q(out / self.scale_coef) 94 | # print(self.scale_coef) 95 | 96 | # out = torch.round(out * (2**self.act_bit - 1) / self.scale_coef) / (2**self.act_bit - 1) 97 | return activation_q 98 | 99 | 100 | def conv2d_Q_fn(w_bit): 101 | class Conv2d_Q(nn.Conv2d): 102 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 103 | padding=0, dilation=1, groups=1, bias=True): 104 | super(Conv2d_Q, self).__init__(in_channels, out_channels, kernel_size, stride, 105 | padding, dilation, groups, bias) 106 | self.w_bit = w_bit 107 | self.quantize_fn = weight_quantize_fn(w_bit=w_bit) 108 | 109 | def forward(self, input, order=None): 110 | weight_q = self.quantize_fn(self.weight) 111 | # print(np.unique(weight_q.detach().numpy())) 112 | return F.conv2d(input, weight_q, self.bias, self.stride, 113 | self.padding, self.dilation, self.groups) 114 | 115 | return Conv2d_Q 116 | 117 | class activation_quantize_fn_test(nn.Module): 118 | def __init__(self, a_bit): 119 | super(activation_quantize_fn_test, self).__init__() 120 | assert a_bit <= 8 or a_bit == 32 121 | self.a_bit = a_bit 122 | self.uniform_q = uniform_quantize(k=a_bit) 123 | 124 | def forward(self, x): 125 | if self.a_bit == 32: 126 | activation_q = torch.clamp(x, 0, 6) 127 | else: 128 | activation_q = self.uniform_q(torch.clamp(x, 0, 6)/6)*6 129 | return activation_q 130 | 131 | class weight_quantize_fn_test(nn.Module): 132 | def __init__(self, w_bit): 133 | super(weight_quantize_fn_test, self).__init__() 134 | assert w_bit <= 8 or w_bit == 32 135 | self.w_bit = w_bit 136 | # the sign bit takes one bit 137 | self.uniform_q = uniform_quantize(k=w_bit - 1) 138 | 139 | def forward(self, x): 140 | # print('===================') 141 | assert(1 -------------------------------------------------------------------------------- /dacsdc/search_train.py: -------------------------------------------------------------------------------- 144 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 145 | dist.init_process_group(backend='nccl', # 'distributed backend' 146 | init_method='tcp://127.0.0.1:5000', # distributed training init method 147 | world_size=1, # number of nodes for distributed training 148 | rank=0) # distributed training node rank 149
| model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 150 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 151 | 152 | # Dataloader 153 | #batch_size = min(batch_size, len(dataset)) 154 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 155 | 156 | # Testloader 157 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 158 | hyp=hyp, 159 | rect=False, 160 | cache_images=opt.cache_images, 161 | single_cls=opt.single_cls) 162 | testloader = torch.utils.data.DataLoader(testset, 163 | batch_size=batch_size, 164 | num_workers=0, 165 | pin_memory=True, 166 | collate_fn=testset.collate_fn) 167 | 168 | # Dataset 169 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 170 | augment=True, 171 | hyp=hyp, # augmentation hyperparameters 172 | rect=opt.rect, # rectangular training 173 | cache_images=opt.cache_images, 174 | single_cls=opt.single_cls) 175 | 176 | dataloader = torch.utils.data.DataLoader(dataset, 177 | batch_size=batch_size, 178 | num_workers=nw, 179 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 180 | pin_memory=True, 181 | collate_fn=dataset.collate_fn) 182 | 183 | # Start training 184 | nb = len(dataloader) 185 | prebias = start_epoch == 0 186 | model.nc = nc # attach number of classes to model 187 | model.arc = opt.arc # attach yolo architecture 188 | model.hyp = hyp # attach hyperparameters to model 189 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 190 | maps = np.zeros(nc) # mAP per class 191 | # torch.autograd.set_detect_anomaly(True) 192 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 193 | t0 = time.time() 194 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 195 | print('Using %g dataloader workers' % nw) 196 | print('Starting training for %g epochs...' 
% epochs) 197 | 198 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 199 | model.train() 200 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 201 | 202 | # Prebias 203 | if prebias: 204 | ne = max(round(30 / nb), 3) # number of prebias epochs 205 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 206 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 207 | if epoch == ne: 208 | # print_model_biases(model) 209 | prebias = False 210 | 211 | # Bias optimizer settings 212 | optimizer.param_groups[2]['lr'] = ps[0] 213 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 214 | optimizer.param_groups[2]['momentum'] = ps[1] 215 | 216 | curr_lr = optimizer.param_groups[0]['lr'] 217 | curr_lra = arch_optimizer.param_groups[0]['lr'] 218 | print(f'lr:{curr_lr}, lra:{curr_lra}') 219 | 220 | mloss = torch.zeros(4).to(device) # mean losses 221 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'iouloss', 'objloss', 'triou', 'mloss', 'targets', 'img_size')) 222 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 223 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 224 | ni = i + nb * epoch # number integrated batches (since train start) 225 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 226 | targets = targets.to(device) 227 | 228 | # Run model 229 | pred = model(imgs) 230 | 231 | # Compute loss 232 | loss, loss_items = compute_loss(pred, targets, model) 233 | if not torch.isfinite(loss): 234 | print('WARNING: non-finite loss, ending training ', loss_items) 235 | return results 236 | 237 | # Scale loss by nominal batch_size of 64 238 | loss *= batch_size / 64 239 | 240 | # complexity penalty 241 | if opt.complexity_decay != 0: 242 | loss_complexity = opt.complexity_decay * model.complexity_loss() 243 | loss += loss_complexity * 4.0 244 | 245 | if opt.complexity_decay_trivial != 0: 246 | loss_complexity_trivial = opt.complexity_decay_trivial * model.complexity_loss_trivial() 247 | loss += loss_complexity_trivial * 4.0 248 | 249 | if opt.bram_decay != 0: 250 | # use the underlying module when wrapped by DistributedDataParallel 251 | loss_bram = opt.bram_decay * (model.module.bram_loss() if hasattr(model, 'module') else model.bram_loss()) 252 | loss += loss_bram * 4.0 253 | 254 | loss.backward() 255 | 256 | # Optimize accumulated gradient 257 | if ni % accumulate == 0: 258 | optimizer.step() 259 | arch_optimizer.step() 260 | optimizer.zero_grad() 261 | arch_optimizer.zero_grad() 262 | 263 | # Print batch results 264 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 265 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 266 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 267 | pbar.set_description(s) 268 | 269 | # end batch ------------------------------------------------------------------------------------------------ 270 | 271 | print('========= architecture =========') 272 | if hasattr(model, 'module'): 273 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = model.module.fetch_best_arch() 274 | else: 275 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = model.fetch_best_arch() 276 | print('best model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: 
{:.3f}M'.format( 277 | bitops, bita, bitw, dsps)) 278 | print('expected model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M, bram_wa:({:.3f},{:.3f})K'.format( 279 | mixbitops, mixbita, mixbitw, mixdsps, mixbram_weight, mixbram_cache)) 280 | 281 | bestw_str = "".join([str(x+2) for x in best_arch["best_weight"]]) 282 | besta_str = "".join([str(x+2) for x in best_arch["best_activ"]]) 283 | print(f'best_weight: {best_arch["best_weight"]}') 284 | print(f'best_activ: {best_arch["best_activ"]}') 285 | 286 | # Update scheduler 287 | scheduler.step() 288 | arch_scheduler.step() 289 | 290 | train_iou = mloss[2] 291 | 292 | # Process epoch results 293 | final_epoch = epoch + 1 == epochs 294 | if not opt.notest or final_epoch: # Calculate mAP 295 | results = test.test(batch_size=batch_size, 296 | img_size=img_size_test, 297 | model=model, 298 | dataloader=testloader) 299 | 300 | # Write epoch results 301 | with open(results_file, 'a') as f: 302 | f.write(s + '%10.3g' * len(results) % results + '\n') # test_losses=(iou, loss_sum, lobj, lcls) 303 | 304 | # Update best mAP 305 | results = torch.tensor(results, device = 'cpu') 306 | 307 | test_iou = results[0] 308 | if test_iou > test_best_iou: 309 | test_best_iou = test_iou 310 | 311 | # Save training results 312 | save = (not opt.nosave) or (final_epoch) 313 | if save: 314 | with open(results_file, 'r') as f: 315 | # Create checkpoint 316 | chkpt = {'epoch': epoch, 317 | 'training_results': f.read(), 318 | 'model': model.module.state_dict() if type( 319 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 320 | 'optimizer': None if final_epoch else optimizer.state_dict(), 321 | 'extra': {'time': time.ctime(), 'name': opt.name, 'bestw': bestw_str, 'besta': besta_str}} 322 | 323 | # Save last checkpoint 324 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 325 | 326 | if test_iou == test_best_iou: 327 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 328 | 329 | # Delete checkpoint 330 | del chkpt 331 | 332 | # end epoch ---------------------------------------------------------------------------------------------------- 333 | 334 | # end training 335 | n = opt.name 336 | 337 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 338 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 339 | torch.cuda.empty_cache() 340 | 341 | with open('results.csv', 'a') as f: 342 | print("mixed,%s,%d/%d, , , , ,%.1f,%.1f, ,%s,%s,%d,%d,%.3f,%.3f"% 343 | (opt.name,epochs-1,epochs,train_iou*100,(test_iou+test_best_iou)*50, 344 | bestw_str,besta_str, 345 | int(round(bitops)), int(round(mixbitops)), dsps, mixdsps), file=f) 346 | 347 | return results 348 | 349 | 350 | if __name__ == '__main__': 351 | parser = argparse.ArgumentParser() 352 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 353 | parser.add_argument('--epochs', type=int, default=35) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 354 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 355 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 356 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 357 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 358 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 359 | 
parser.add_argument('--rect', action='store_true', help='rectangular training') 360 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 361 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 362 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 363 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 364 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 365 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 366 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 367 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 368 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 369 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 370 | parser.add_argument('--var', type=float, help='debug variable') 371 | parser.add_argument('--complexity-decay', '--cd', default=0, type=float, metavar='W', help='complexity decay (default: 0)') 372 | parser.add_argument('--complexity-decay-trivial', '--cdt', default=0, type=float, metavar='W', help='trivial complexity decay (default: 0)') 373 | parser.add_argument('--bram-decay', '--bd', default=0, type=float, metavar='W', help='bram decay (default: 0)') 374 | parser.add_argument('--lra', '--learning-rate-alpha', default=0.01, type=float, metavar='LR', help='initial alpha learning rate') 375 | parser.add_argument('--no-share', action='store_true', help='no share weight quantization') 376 | parser.add_argument('--model', type=str, default='', help='use specific model') 377 | 378 | opt = parser.parse_args() 379 | last = wdir + 'last_%s.pt'%opt.name 380 | opt.weights = last if opt.resume else opt.weights 381 | print(opt) 382 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 383 | 384 | train() # train normally 385 | -------------------------------------------------------------------------------- /dacsdc/simulate_hw.py: 1 | import argparse 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from export_hls import ConvParam 7 | from mymodel import YOLOLayer 8 | from test import get_prebox, hyp, bbox_iou, select_weight_file 9 | from torch.utils.data import DataLoader 10 | 11 | from datasets import LoadImagesAndLabels 12 | 13 | class QConvLayer: 14 | def __init__(self, conv_param): 15 | self.conv = conv_param 16 | self.w = torch.tensor(self.conv.w, dtype = torch.int64) 17 | 18 | def __call__(self, x): 19 | if self.conv.icol < x.shape[-1]: # maxpool 20 | assert (self.conv.irow*2, self.conv.icol*2) == tuple(x.shape[2:]) 21 | x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 22 | # print('convi', self.conv.n, x[0,0,:,0]) 23 | 24 | groups = self.conv.groups if hasattr(self.conv, 'groups') else 1 25 | x = F.conv2d(x, self.w, bias=None, stride=self.conv.s, padding=self.conv.p, groups=groups) # [N, OCH, OROW, OCOL] 26 | # print('convo', self.conv.n, x[0,0,:,0]) 27 | och = x.shape[1] 28 | if True: 29 | if self.conv.inc is not None: 30 | inc_ch = self.conv.inc.reshape((1, och, 1, 1)) 31 | x *= inc_ch 32 | if hasattr(self.conv, 'bias'): 33 | bias_ch = self.conv.bias.reshape((1, och, 1, 1)) 34 | x += bias_ch 35 | 36 |
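# The `lshift` branch below performs the exported model's fixed-point requantization:
# after the integer conv result has been rescaled by `inc` and offset by `bias`,
# adding 1 << (lshift_T - 1) before the arithmetic right shift by lshift_T rounds
# the result to the nearest value instead of truncating it.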
# print('biaso', self.conv.n, x[0,0,:,:]/2**self.conv.lshift_T) 37 | if hasattr(self.conv, 'lshift'): 38 | x += 1 << (self.conv.lshift_T - 1) 39 | x >>= self.conv.lshift_T 40 | 41 | else: ## no inc/bias quantization 42 | if self.conv.inc is not None: 43 | inc_ch = self.conv.inc_raw.reshape((1, och, 1, 1)) 44 | x *= inc_ch 45 | if hasattr(self.conv, 'bias'): 46 | bias_ch = self.conv.bias_raw.reshape((1, och, 1, 1)) 47 | x += bias_ch 48 | # if hasattr(self.conv, 'max_pool'): # maxpool 49 | # x = F.max_pool2d(x, kernel_size = 2, stride = 2) 50 | # print('biaso', self.conv.n, x[0,0,:,0]) 51 | x = torch.round(x).to(dtype = torch.int64) 52 | 53 | if hasattr(self.conv, 'obit'): 54 | x.clip_(0, 2**(self.conv.obit)-1) 55 | 56 | return x 57 | 58 | def reorg(x): 59 | stride = 2 60 | B = x.data.size(0) 61 | C = x.data.size(1) 62 | H = x.data.size(2) 63 | W = x.data.size(3) 64 | ws = stride 65 | hs = stride 66 | x = x.view([B, C, H//hs, hs, W//ws, ws]).transpose(3, 4).contiguous() 67 | x = x.view([B, C, H//hs*W//ws, hs*ws]).transpose(2, 3).contiguous() 68 | x = x.view([B, C, hs*ws, H//hs, W//ws]).transpose(1, 2).contiguous() 69 | x = x.view([B, hs*ws*C, H//hs, W//ws]) 70 | return x 71 | 72 | class HWModel: 73 | def __init__(self, model_param): 74 | self.layers = [QConvLayer(conv_param) for conv_param in model_param] 75 | self.yololayer = YOLOLayer([[20,20], [20,20], [20,20], [20,20], [20,20], [20,20]]) 76 | self.yololayer.eval() 77 | 78 | def __call__(self, x): 79 | assert len(x.shape) == 4 and x.dtype == torch.int64 80 | img_size = x.shape[-2:] 81 | 82 | if self.layers[0].conv.abit<8: # ImageInputQ 83 | x=x>>(8-self.layers[0].conv.abit) 84 | 85 | if not opt.bypass: 86 | for i, layer in enumerate(self.layers): 87 | x = layer(x) 88 | else: 89 | for i in [0,1,2,3]: 90 | x = self.layers[i](x) 91 | p4_in = torch.round(reorg(x) * 92 | self.layers[4].conv.astep / self.layers[7].conv.astep).to(dtype=torch.int64) 93 | for i in [4,5,6]: 94 | x = self.layers[i](x) 95 | x = torch.cat([p4_in, x], 1) 96 | for i in [7,8]: 97 | x = self.layers[i](x) 98 | 99 | x = x.float() / self.layers[-1].conv.div 100 | 101 | io, p = self.yololayer(x, img_size) 102 | return io 103 | 104 | def testdataset(hwmodel): 105 | img_size = 320 106 | dataset = LoadImagesAndLabels(opt.datapath, img_size, opt.batch_size, rect=False, cache_labels=True, hyp=hyp, augment=False) 107 | dataloader = DataLoader(dataset, 108 | batch_size=opt.batch_size, 109 | #num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 110 | pin_memory=True, 111 | collate_fn=dataset.collate_fn) 112 | 113 | iou_sum = 0.0 114 | test_n = 0 115 | for batch_i, (imgs, targets, paths, shapes) in enumerate(dataloader): 116 | if batch_i == opt.num_batch: break 117 | bn, _, height, width = imgs.shape # batch size, channels, height, width 118 | test_n += bn 119 | 120 | imgs = imgs.to(dtype = torch.int64) 121 | inf_out = hwmodel(imgs) 122 | pre_box = get_prebox(inf_out) 123 | 124 | tbox = targets[..., 2:6] * torch.Tensor([width, height, width, height]) 125 | ious = bbox_iou(pre_box, tbox) 126 | iou_sum += ious.sum() 127 | 128 | np.set_printoptions(precision = 2) 129 | for p in range(len(imgs)): 130 | print('pbox_xywh', pre_box[p].numpy(), 'tbox_xywh', tbox[p].numpy(), 'iou %.4f'%ious[p].item()) 131 | 132 | meaniou = iou_sum / test_n 133 | 134 | print('iou', meaniou) 135 | 136 | if __name__=='__main__': 137 | parser = argparse.ArgumentParser() 138 | parser.add_argument('-w', '--weight', help='weight folder name in ./hls/, which contains model_param.pkl') 139 |
parser.add_argument('-bp', '--bypass', action='store_true', help='use bypass model') 140 | parser.add_argument('--datapath', default='', help = 'test dataset path') 141 | parser.add_argument('-bs', '--batch-size', type=int, default=1, help = 'batch-size') 142 | parser.add_argument('-nb', '--num-batch', type=int, default=1, help = 'number of batches to run, -1 for full dataset') 143 | opt = parser.parse_args() 144 | 145 | if opt.datapath == '': 146 | try: 147 | import localconfig 148 | opt.datapath = localconfig.test_path 149 | except Exception: 150 | pass 151 | print(opt) 152 | if opt.weight is None: opt.weight = select_weight_file() 153 | 154 | x = torch.zeros([1,3,320,160], dtype=torch.int64) 155 | hwmodel = HWModel(torch.load('hls/'+opt.weight+'/model_param.pkl')) 156 | 157 | testdataset(hwmodel) 158 | -------------------------------------------------------------------------------- /dacsdc/test.py: 1 | import argparse 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | import sys 6 | sys.path.append('..') 7 | from datasets import * 8 | from yolo_utils import * 9 | 10 | import mymodel 11 | from mymodel import * 12 | from utils.view_pt import select_weight_file 13 | import cv2 14 | 15 | opt=None 16 | 17 | hyp = {'giou': 3.54, # giou loss gain 18 | 'cls': 37.4, # cls loss gain 19 | 'cls_pw': 1.0, # cls BCELoss positive_weight 20 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 21 | 'obj_pw': 1.0, # obj BCELoss positive_weight 22 | 'iou_t': 0.225, # iou training threshold 23 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 24 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 25 | 'momentum': 0.937, # SGD momentum 26 | 'weight_decay': 0.000484, # optimizer weight decay 27 | 'fl_gamma': 0.5, # focal loss gamma 28 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 29 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 30 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 31 | 'degrees': 1.98, # image rotation (+/- deg) 32 | 'translate': 0.05, # image translation (+/- fraction) 33 | 'scale': 0.05, # image scale (+/- gain) 34 | 'shear': 0.641} # image shear (+/- deg) 35 | 36 | 37 | def save_test_pic(filename, img, pbox, tbox): 38 | img=img.numpy().transpose((1,2,0))*255 39 | img=np.ascontiguousarray(img) 40 | 41 | pp1, pp2 = (int(pbox[0]-pbox[2]/2), int(pbox[1]-pbox[3]/2)), (int(pbox[0]+pbox[2]/2), int(pbox[1]+pbox[3]/2)) 42 | tp1, tp2 = (int(tbox[0]-tbox[2]/2), int(tbox[1]-tbox[3]/2)), (int(tbox[0]+tbox[2]/2), int(tbox[1]+tbox[3]/2)) 43 | 44 | cv2.rectangle(img, pp1, pp2, color=(0,0,255), thickness=1) # red pbox 45 | cv2.rectangle(img, tp1, tp2, color=(0,255,0), thickness=1) # green tbox 46 | cv2.putText(img, text=str((pp1,pp2))+str((tp1, tp2)), 47 | org = (0, 10), 48 | fontFace=cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, 49 | fontScale=0.35, 50 | color = (255,255,255)) 51 | 52 | cv2.imwrite('test_result/'+filename+'.jpg', img) 53 | 54 | def bbox_iou(box1, box2): 55 | """ 56 | Returns the IoU of two bounding boxes 57 | """ 58 | 59 | # Transform from center and width to exact coordinates 60 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 61 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 62 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 63 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 64 | 65 | # get the coordinates of the intersection rectangle 66 |
inter_rect_x1 = torch.max(b1_x1, b2_x1) 67 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 68 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 69 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 70 | # Intersection area 71 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, min=0) * torch.clamp( 72 | inter_rect_y2 - inter_rect_y1, min=0 73 | ) 74 | # Union Area 75 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 76 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 77 | 78 | iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) 79 | 80 | return iou 81 | 82 | def select_boxes(pred_boxes, pred_conf): 83 | n = pred_boxes.size(0) 84 | # pred_boxes = pred_boxes.view(n, -1, 4) 85 | # pred_conf = pred_conf.view(n, -1, 1) 86 | FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor 87 | p_boxes = FloatTensor(n, 4) 88 | # print(pred_boxes.shape, pred_conf.shape) 89 | 90 | for i in range(n): 91 | _, index = pred_conf[i].max(0) 92 | p_boxes[i] = pred_boxes[i][index] 93 | 94 | return p_boxes 95 | 96 | def get_prebox(inf_out): 97 | inf_out = inf_out.view(inf_out.shape[0], 6, -1) # bs, anchors, nw*nh*6 98 | inf_out_t = torch.zeros_like(inf_out[:, 0, :]) 99 | for i in range(inf_out.shape[1]): 100 | inf_out_t += inf_out[:, i, :] 101 | inf_out_t = inf_out_t.view(inf_out_t.shape[0], -1, 6) / 6 # average anchors: box, conf 102 | 103 | pre_box = select_boxes(inf_out_t[..., :4], inf_out_t[..., 4]) # get pbox by max conf 104 | return pre_box 105 | 106 | def test(weights=None, 107 | batch_size=16, 108 | img_size=416, 109 | model=None, 110 | dataloader=None, 111 | num_batch=-1): 112 | # torch.set_default_tensor_type(torch.DoubleTensor) 113 | # Initialize/load model and set device 114 | if model is None or type(model)==str: 115 | device = torch_utils.select_device(opt.device, batch_size=batch_size) 116 | 117 | # Remove previous 118 | for f in glob.glob('test_batch*.jpg'): 119 | os.remove(f) 120 | 121 | ptfile: Dict = torch.load('weights/' + weights+'.pt', map_location=device) 122 | model_params = ptfile.setdefault('model_params') 123 | print('model_params', model_params) 124 | model = getattr(mymodel, model)(**model_params).to(device) 125 | 126 | model.hyp = hyp 127 | model.nc = 1 128 | model.arc = 'default' 129 | 130 | # Load weights 131 | model.load_state_dict(ptfile['model']) 132 | 133 | if torch.cuda.device_count() > 1: 134 | model = nn.DataParallel(model) 135 | else: # called by train.py 136 | device = next(model.parameters()).device # get model device 137 | 138 | # Dataloader 139 | if dataloader is None: 140 | dataset = LoadImagesAndLabels(opt.datapath, img_size, batch_size, rect=False, cache_labels=True, hyp=hyp, augment=False) 141 | batch_size = min(batch_size, len(dataset)) 142 | dataloader = DataLoader(dataset, 143 | batch_size=batch_size, 144 | #num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 145 | pin_memory=True, 146 | collate_fn=dataset.collate_fn) 147 | 148 | model.eval() 149 | loss = torch.zeros(2) 150 | iou_sum = 0 151 | test_n = 0 152 | 153 | # model.layers[0].weight.data = torch.tensor(model.layers[0].weight.data.numpy()[:,::-1].copy()) # swap RGB<->BGR 154 | 155 | print(('\n' + '%10s' * 5) % ('IOU', 'l', 'Giou-l', 'obj-l', 'targets')) 156 | pbar = tqdm(enumerate(dataloader), total=len(dataloader)) 157 | for batch_i, (imgs, targets, paths, shapes) in pbar: 158 | if batch_i == num_batch: break 159 | 160 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 161 | targets = targets.to(device) 162 | bn, _, height, width = imgs.shape # batch
size, channels, height, width 163 | test_n += bn 164 | 165 | with torch.no_grad(): 166 | # Run model 167 | inf_out, train_out = model(imgs) # inference and training outputs, inf_out = bs*anchors*nw*nh*6 168 | # Compute loss 169 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 170 | loss += compute_loss(train_out, targets, model)[1][:2].cpu() # GIoU, obj 171 | 172 | pre_box = get_prebox(inf_out) # anchor average, select max 173 | 174 | tbox = targets[..., 2:6] * torch.Tensor([width, height, width, height]).to(device) 175 | 176 | ious = bbox_iou(pre_box, tbox) 177 | iou_sum += ious.sum() 178 | loss_o = loss / (batch_i + 1) 179 | 180 | iou = iou_sum / test_n 181 | s = (('%10.4f')*4+'%10d') % (iou, loss_o.sum(), loss_o[0], loss_o[1], len(targets)) 182 | 183 | 184 | if opt and opt.verbose: 185 | np.set_printoptions(precision = 2) 186 | for p in range(len(imgs)): 187 | print(paths[p], 'pbox_xywh', pre_box[p].numpy(), 'tbox_xywh', tbox[p].numpy()) 188 | 189 | if opt and opt.save_pic: 190 | for p in range(len(imgs)): 191 | save_test_pic(str(p+test_n-batch_size), imgs[p], pre_box[p], tbox[p]) 192 | 193 | pbar.set_description(s) 194 | 195 | return iou, loss_o.sum(), loss_o[0], loss_o[1] # iou, loss_sum, lgiou, lobj 196 | 197 | 198 | if __name__ == '__main__': 199 | parser = argparse.ArgumentParser(prog='test.py') 200 | parser.add_argument('-m', '--model', type=str, default='UltraNet_FixQ', help='model name') 201 | parser.add_argument('-w', '--weight', default=None, help='weights path') 202 | parser.add_argument('-bs', '--batch-size', type=int, default=16, help='size of each image batch') 203 | parser.add_argument('--img-size', type=int, default=320, help='inference size (pixels)') 204 | parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1) or cpu') 205 | parser.add_argument('--datapath', default='', help = 'test dataset path') 206 | parser.add_argument('-v', '--verbose', action='store_true', help = 'show predicted value results') 207 | parser.add_argument('--save-pic', action='store_true', help = 'save predicted output pictures') 208 | parser.add_argument('-nb', '--num-batch', type=int, default=-1, help='number of batches to run, -1 for full dataset') 209 | opt = parser.parse_args() 210 | print(opt) 211 | if opt.weight is None: opt.weight = select_weight_file() 212 | 213 | if opt.datapath == '': 214 | try: 215 | import localconfig 216 | opt.datapath = localconfig.test_path 217 | except Exception: 218 | pass 219 | 220 | # Test 221 | res = test( 222 | opt.weight, 223 | opt.batch_size, 224 | opt.img_size, 225 | opt.model, 226 | num_batch = opt.num_batch) 227 | 228 | print(('%s %s.pt\niou %.5f, lsum %.4f, lgiou %.4f, lobj %.4f')%(opt.model, opt.weight, *res)) 229 | -------------------------------------------------------------------------------- /dacsdc/train_old.py: 1 | import argparse 2 | 3 | import torch.distributed as dist 4 | import torch.optim as optim 5 | import torch.optim.lr_scheduler as lr_scheduler 6 | 7 | import sys 8 | sys.path.append('..') 9 | import localconfig 10 | import test # import test.py to get mAP after each epoch 11 | from datasets import * 12 | from yolo_utils import * 13 | 14 | from mymodel import * 15 | import mymodel 16 | 17 | wdir = 'weights' + os.sep # weights dir 18 | 19 | # Hyperparameters (results68: 59.9 mAP@0.5 yolov3-spp-416) https://github.com/ultralytics/yolov3/issues/310 20 | 21 | hyp = {'giou': 3.54, # giou loss gain 22 | 'cls': 37.4, # cls loss gain 23 | 'cls_pw': 1.0, # cls BCELoss positive_weight 24 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 25 | 'obj_pw': 1.0, # obj BCELoss positive_weight 26 | 'iou_t': 0.225, # iou training threshold 27 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 28 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 29 | 'momentum': 0.937, # SGD momentum 30 | 'weight_decay': 0.000484, # optimizer weight decay 31 | 'fl_gamma': 0.5, # focal loss gamma 32 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 33 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 34 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 35 | 'degrees': 1.98, # image rotation (+/- deg) 36 | 'translate': 0.05, # image translation (+/- fraction) 37 | 'scale': 0.05, # image scale (+/- gain) 38 | 'shear': 0.641} # image shear (+/- deg) 39 | 40 | # Overwrite hyp with hyp*.txt (optional) 41 | f = glob.glob('hyp*.txt') 42 | if f: 43 | print('Using %s' % f[0]) 44 | for k, v in zip(hyp.keys(), np.loadtxt(f[0])): 45 | hyp[k] = v 46 | 47 | 48 | def train(): 49 | cfg = opt.cfg 50 | data = opt.data 51 | img_size, img_size_test = opt.img_size if len(opt.img_size) == 2 else opt.img_size * 2 # train, test sizes 52 | epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs 53 | batch_size = opt.batch_size 54 | accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 55 | weights = opt.weights # initial training weights 56 | 57 | # Initialize 58 | init_seeds() 59 | if opt.multi_scale: 60 | img_sz_min = round(img_size / 32 / 1.5) 61 | img_sz_max = round(img_size / 32 * 1.5) 62 | img_size = img_sz_max * 32 # initiate with maximum multi_scale size 63 | print('Using multi-scale %g - %g' % (img_sz_min * 32, 
img_size)) 64 | 65 | # Configure run 66 | # data_dict = parse_data_cfg(data) 67 | train_path = localconfig.train_path 68 | test_path = localconfig.test_path 69 | nc = 1 70 | 71 | results_file = 'results/%s.txt'%opt.name 72 | # Remove previous results 73 | for f in glob.glob('*_batch*.png') + glob.glob(results_file): 74 | os.remove(f) 75 | 76 | # Initialize model 77 | model = getattr(mymodel, opt.model)().to(device) 78 | 79 | # Optimizer 80 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 81 | for k, v in dict(model.named_parameters()).items(): 82 | if '.bias' in k: 83 | pg2 += [v] # biases 84 | elif 'Conv2d.weight' in k: 85 | pg1 += [v] # apply weight_decay 86 | else: 87 | pg0 += [v] # all else 88 | 89 | if opt.adam: 90 | # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) 91 | optimizer = optim.Adam(pg0, lr=hyp['lr0']) 92 | # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) 93 | else: 94 | optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) 95 | optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay 96 | optimizer.add_param_group({'params': pg2}) # add pg2 (biases) 97 | optimizer.param_groups[2]['lr'] *= 2.0 # bias lr 98 | del pg0, pg1, pg2 99 | 100 | start_epoch = 0 101 | best_fitness = 0.0 102 | test_best_iou = 0.0 103 | 104 | # attempt_download(weights) 105 | # load weights 106 | if weights.endswith('.pt'): # pytorch format 107 | # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 108 | chkpt = torch.load(weights, map_location=device) 109 | 110 | # load model 111 | try: 112 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} 113 | model.load_state_dict(chkpt['model'], strict=False) 114 | except KeyError as e: 115 | s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " % (opt.weights, opt.cfg, opt.weights) 116 | raise KeyError(s) from e 117 | 118 | if opt.resume: 119 | # load optimizer 120 | if chkpt['optimizer'] is not None: 121 | optimizer.load_state_dict(chkpt['optimizer']) 122 | best_fitness = chkpt['best_fitness'] 123 | 124 | # load results 125 | if chkpt.get('training_results') is not None: 126 | with open(results_file, 'w') as file: 127 | file.write(chkpt['training_results']) # write results.txt 128 | 129 | start_epoch = chkpt['epoch'] + 1 130 | 131 | del chkpt 132 | 133 | elif len(weights) > 0: # darknet format 134 | # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
135 | load_darknet_weights(model, weights) 136 | 137 | # Scheduler https://github.com/ultralytics/yolov3/issues/238 138 | # lf = lambda x: 1 - x / epochs # linear ramp to zero 139 | # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp 140 | # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp 141 | lf = lambda x: (1 + math.cos(x * math.pi / epochs)) / 2 * 0.99 + 0.01 # cosine https://arxiv.org/pdf/1812.01187.pdf 142 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) 143 | # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(epochs * x) for x in [0.8, 0.9]], gamma=0.1) 144 | scheduler.last_epoch = start_epoch 145 | 146 | # # Plot lr schedule 147 | # y = [] 148 | # for _ in range(epochs): 149 | # scheduler.step() 150 | # y.append(optimizer.param_groups[0]['lr']) 151 | # plt.plot(y, '.-', label='LambdaLR') 152 | # plt.xlabel('epoch') 153 | # plt.ylabel('LR') 154 | # plt.tight_layout() 155 | # plt.savefig('LR.png', dpi=300) 156 | # Initialize distributed training 157 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 158 | dist.init_process_group(backend='nccl', # 'distributed backend' 159 | init_method='tcp://127.0.0.1:5000', # distributed training init method 160 | world_size=1, # number of nodes for distributed training 161 | rank=0) # distributed training node rank 162 | model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 163 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 164 | 165 | # Dataloader 166 | #batch_size = min(batch_size, len(dataset)) 167 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 168 | 169 | # Testloader 170 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 171 | hyp=hyp, 172 | rect=False, 173 | cache_images=opt.cache_images, 174 | single_cls=opt.single_cls) 175 | testloader = torch.utils.data.DataLoader(testset, 176 | batch_size=batch_size, 177 | num_workers=0, 178 | pin_memory=True, 179 | collate_fn=testset.collate_fn) 180 | 181 | # Dataset 182 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 183 | augment=True, 184 | hyp=hyp, # augmentation hyperparameters 185 | rect=opt.rect, # rectangular training 186 | cache_images=opt.cache_images, 187 | single_cls=opt.single_cls) 188 | 189 | dataloader = torch.utils.data.DataLoader(dataset, 190 | batch_size=batch_size, 191 | num_workers=nw, 192 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 193 | pin_memory=True, 194 | collate_fn=dataset.collate_fn) 195 | 196 | # Start training 197 | nb = len(dataloader) 198 | prebias = start_epoch == 0 199 | model.nc = nc # attach number of classes to model 200 | model.arc = opt.arc # attach yolo architecture 201 | model.hyp = hyp # attach hyperparameters to model 202 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 203 | maps = np.zeros(nc) # mAP per class 204 | # torch.autograd.set_detect_anomaly(True) 205 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 206 | t0 = time.time() 207 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 208 | print('Using %g dataloader workers' % nw) 209 | print('Starting training for %g epochs...' 
% epochs) 210 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 211 | model.train() 212 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 213 | 214 | # Prebias 215 | if prebias: 216 | ne = max(round(30 / nb), 3) # number of prebias epochs 217 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 218 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 219 | if epoch == ne: 220 | # print_model_biases(model) 221 | prebias = False 222 | 223 | # Bias optimizer settings 224 | optimizer.param_groups[2]['lr'] = ps[0] 225 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 226 | optimizer.param_groups[2]['momentum'] = ps[1] 227 | 228 | mloss = torch.zeros(4).to(device) # mean losses 229 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) 230 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 231 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 232 | ni = i + nb * epoch # number integrated batches (since train start) 233 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 234 | targets = targets.to(device) 235 | 236 | # Hyperparameter burn-in 237 | # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches 238 | # if ni <= n_burn: 239 | # for m in model.named_modules(): 240 | # if m[0].endswith('BatchNorm2d'): 241 | # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 242 | # g = (i / n_burn) ** 4 # gain rises from 0 - 1 243 | # for x in optimizer.param_groups: 244 | # x['lr'] = hyp['lr0'] * g 245 | # x['weight_decay'] = hyp['weight_decay'] * g 246 | 247 | # Plot images with bounding boxes 248 | if ni < 1: 249 | f = 'train_batch%g.png' % i # filename 250 | # plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) 251 | if tb_writer: 252 | tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') 253 | 254 | # Multi-Scale training 255 | if opt.multi_scale: 256 | if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch 257 | img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 258 | sf = img_size / max(imgs.shape[2:]) # scale factor 259 | if sf != 1: 260 | ns = [math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) 261 | imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) 262 | 263 | # Run model 264 | pred = model(imgs) 265 | 266 | # Compute loss 267 | loss, loss_items = compute_loss(pred, targets, model) 268 | if not torch.isfinite(loss): 269 | print('WARNING: non-finite loss, ending training ', loss_items) 270 | return results 271 | 272 | # Scale loss by nominal batch_size of 64 273 | loss *= batch_size / 64 274 | 275 | 276 | loss.backward() 277 | 278 | # Optimize accumulated gradient 279 | if ni % accumulate == 0: 280 | optimizer.step() 281 | optimizer.zero_grad() 282 | 283 | # Print batch results 284 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 285 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 286 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 287 | pbar.set_description(s) 288 | 289 | # end batch ------------------------------------------------------------------------------------------------ 290 | 291 | # Update scheduler 292 | scheduler.step() 293 | 294 | # Process epoch results 295 | final_epoch = epoch + 1 == epochs 296 | if not opt.notest or final_epoch: # Calculate mAP 297 | results = test.test(batch_size=batch_size, 298 | img_size=img_size_test, 299 | model=model, 300 | dataloader=testloader) 301 | 302 | # Write epoch results 303 | with open(results_file, 'a') as f: 304 | f.write(s + '%10.3g' * len(results) % results + '\n') # test results: (iou, loss_sum, lgiou, lobj) 305 | if len(opt.name) and opt.bucket: 306 | os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) 307 | 308 | # Write Tensorboard results 309 | if tb_writer: 310 | x = list(mloss) + list(results) 311 | titles = ['GIoU', 'Objectness', 'Classification', 'Train loss', 312 | 'iou', 'loss', 'Giou loss', 'obj loss'] 313 | for xi, title in zip(x, titles): 314 | tb_writer.add_scalar(title, xi, epoch) 315 | 316 | # Update best mAP 317 | results = torch.tensor(results, device = 'cpu') 318 | fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] 319 | if fi > best_fitness: 320 | best_fitness = fi 321 | 322 | test_iou = results[0] 323 | if test_iou > test_best_iou: 324 | test_best_iou = test_iou 325 | 326 | # Save training results 327 | save = (not opt.nosave) or (final_epoch and not opt.evolve) 328 | if save: 329 | with open(results_file, 'r') as f: 330 | # Create checkpoint 331 | chkpt = {'epoch': epoch, 332 | 'best_fitness': best_fitness, 333 | 'training_results': f.read(), 334 | 'model': model.module.state_dict() if type( 335 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 336 | 'optimizer': None if final_epoch else optimizer.state_dict()} 337 | 338 | # Save last checkpoint 339 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 340 | 341 | if test_iou == test_best_iou: 342 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 343 | 344 | # Save backup every 10 epochs (optional) 345 | # if epoch > 0 and epoch % 10 == 0: 346 | # torch.save(chkpt, wdir + 'backup%g.pt' % epoch) 347 | 348 | # Delete checkpoint 349 | del chkpt 350 | 351 | # end epoch ---------------------------------------------------------------------------------------------------- 352 | 353 | # end training 354 | n = opt.name 355 | if len(n) and False: 356 | n = '_' + n if not n.isnumeric() else n 357 | fresults, flast, fbest = 'results%s.txt' % n, 
'last%s.pt' % n, 'best%s.pt' % n 358 | os.rename('results.txt', fresults) 359 | os.rename(wdir + 'last.pt', wdir + flast) if os.path.exists(wdir + 'last.pt') else None 360 | os.rename(wdir + 'best.pt', wdir + fbest) if os.path.exists(wdir + 'best.pt') else None 361 | if opt.bucket: # save to cloud 362 | os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket)) 363 | os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket)) 364 | # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket)) 365 | 366 | #if not opt.evolve: 367 | # plot_results() # save as results.png 368 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 369 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 370 | torch.cuda.empty_cache() 371 | 372 | return results 373 | 374 | 375 | if __name__ == '__main__': 376 | parser = argparse.ArgumentParser() 377 | parser.add_argument('--epochs', type=int, default=200) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 378 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 379 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 380 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 381 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 382 | parser.add_argument('--multi-scale', action='store_true', help='adjust (67%% - 150%%) img_size every 10 batches') 383 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 384 | parser.add_argument('--rect', action='store_true', help='rectangular training') 385 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 386 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 387 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 388 | parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') 389 | parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') 390 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 391 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 392 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 393 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 394 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 395 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 396 | parser.add_argument('--model', type=str, default='UltraNetFloat', help='model used') 397 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 398 | parser.add_argument('--var', type=float, help='debug variable') 399 | opt = parser.parse_args() 400 | last = wdir + 'last_%s.pt'%opt.name 401 | opt.weights = last if opt.resume else opt.weights 402 | print(opt) 403 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 404 | # scale hyp['obj'] by img_size (evolved at 320) 405 | # hyp['obj'] *= opt.img_size[0] / 320. 
406 | 407 | tb_writer = None 408 | if not opt.evolve: # Train normally 409 | try: 410 | # Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/ 411 | from torch.utils.tensorboard import SummaryWriter 412 | 413 | tb_writer = SummaryWriter() 414 | except: 415 | pass 416 | 417 | train() # train normally 418 | 419 | else: # Evolve hyperparameters (optional) 420 | opt.notest, opt.nosave = True, True # only test/save final epoch 421 | if opt.bucket: 422 | os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists 423 | 424 | for _ in range(1): # generations to evolve 425 | if os.path.exists('evolve.txt'): # if evolve.txt exists: select best hyps and mutate 426 | # Select parent(s) 427 | parent = 'single' # parent selection method: 'single' or 'weighted' 428 | x = np.loadtxt('evolve.txt', ndmin=2) 429 | n = min(5, len(x)) # number of previous results to consider 430 | x = x[np.argsort(-fitness(x))][:n] # top n mutations 431 | w = fitness(x) - fitness(x).min() # weights 432 | if parent == 'single' or len(x) == 1: 433 | # x = x[random.randint(0, n - 1)] # random selection 434 | x = x[random.choices(range(n), weights=w)[0]] # weighted selection 435 | elif parent == 'weighted': 436 | x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination 437 | 438 | # Mutate 439 | method, mp, s = 3, 0.9, 0.2 # method, mutation probability, sigma 440 | npr = np.random 441 | npr.seed(int(time.time())) 442 | g = np.array([1, 1, 1, 1, 1, 1, 1, 0, .1, 1, 0, 1, 1, 1, 1, 1, 1, 1]) # gains 443 | ng = len(g) 444 | if method == 1: 445 | v = (npr.randn(ng) * npr.random() * g * s + 1) ** 2.0 446 | elif method == 2: 447 | v = (npr.randn(ng) * npr.random(ng) * g * s + 1) ** 2.0 448 | elif method == 3: 449 | v = np.ones(ng) 450 | while all(v == 1): # mutate until a change occurs (prevent duplicates) 451 | # v = (g * (npr.random(ng) < mp) * npr.randn(ng) * s + 1) ** 2.0 452 | v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) 453 | for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) 454 | hyp[k] = x[i + 7] * v[i] # mutate 455 | 456 | # Clip to limits 457 | keys = ['lr0', 'iou_t', 'momentum', 'weight_decay', 'hsv_s', 'hsv_v', 'translate', 'scale', 'fl_gamma'] 458 | limits = [(1e-5, 1e-2), (0.00, 0.70), (0.60, 0.98), (0, 0.001), (0, .9), (0, .9), (0, .9), (0, .9), (0, 3)] 459 | for k, v in zip(keys, limits): 460 | hyp[k] = np.clip(hyp[k], v[0], v[1]) 461 | 462 | # Train mutation 463 | results = train() 464 | 465 | # Write mutation results 466 | print_mutation(hyp, results, opt.bucket) 467 | 468 | # Plot results 469 | # plot_evolution_results(hyp) 470 | -------------------------------------------------------------------------------- /readme.md: 1 | # DeepBurning-MixQ 2 | 3 | This is part of the [DeepBurning project](https://github.com/groupsada/DeepBurning) developed for agile neural network accelerator design at the Institute of Computing Technology, Chinese Academy of Sciences. It focuses on the software/hardware co-optimization of FPGA-based accelerators for low bit-width mixed-precision neural network models. In terms of hardware, we mainly explore packing methods for various low bit-width convolution operators, so that each primitive DSP in FPGAs can accommodate as many low bit-width operations as possible, thereby improving DSP utilization.
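As a toy illustration of the packing idea (illustrative code only, not from this repo; the 16-bit guard field is chosen arbitrarily for the example), two small unsigned multiplications that share one activation can travel through a single wide multiply, which is what allows one DSP to absorb several low bit-width operations:

```python
# Minimal sketch of DSP packing: two unsigned low bit-width weights w1, w2
# share one activation a. Placing w1 in a high field keeps the two partial
# products from overlapping, so a single wide multiply yields both results.
def packed_mul(w1, w2, a, guard=16):
    packed = (w1 << guard) + w2     # pack both weights into one operand
    prod = packed * a               # one hardware multiply
    p2 = prod & ((1 << guard) - 1)  # low field  -> w2 * a
    p1 = prod >> guard              # high field -> w1 * a
    return p1, p2

assert packed_mul(7, 13, 9) == (7 * 9, 13 * 9)
```

The factor tables in `anypacking/dsp_packing.py` record how many such operations one DSP can absorb for each weight/activation bit-width pair and kernel size, and account for details (such as signed operands) that this unsigned sketch omits.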
In terms of the model, we mainly utilize the differentiable NAS (neural architecture search) technique to perform mixed-precision quantization on the given model, while also considering the hardware implementation efficiency of the quantized model, in order to efficiently deploy the target convolutional neural network model onto FPGAs under given resource constraints. 4 | 5 | This work was published at ICCAD'23; please refer to the paper for more details. (DOI:[10.1109/ICCAD57390.2023.10323831](https://doi.org/10.1109/ICCAD57390.2023.10323831)) 6 | 7 | Erjing Luo#, Haitong Huang#, Cheng Liu*, Guoyu Li, Bing Yang, Ying Wang, Huawei Li, Xiaowei Li, "DeepBurning-MixQ: An Open Source Mixed-Precision Neural Network Accelerator Design Framework for FPGAs", ICCAD, 2023. (# equal contribution) 8 | 9 | ## Status 10 | This project mainly explores automatic HW/SW co-optimization of FPGA-based neural network accelerators for mixed-precision neural network models. Currently, the mixed-precision neural network models are fully pipelined across the FPGA, so the framework mainly targets smaller neural network models with a limited number of layers. A hybrid multi-core neural network accelerator that can accommodate generic mixed-precision neural network models will come soon. 11 | 12 | This repo includes the training, quantization, and weight export of hardware-aware mixed-precision neural network models. For efficient FPGA HLS operators and optimization code, please refer to [https://github.com/MatthewLuo7/MixQ_Gen_Accel](https://github.com/MatthewLuo7/MixQ_Gen_Accel). 13 | 14 | ## Classification Model 15 | 16 | ### Usage 17 | ```bash 18 | cd cifar/ 19 | 20 | # 1. Hardware-aware Mixed Precision NAS 21 | python search_train.py --cd 3e-5 --name mix_vggtiny_cifar_cd3e5 22 | # Params: 23 | # --cd Stands for complexity decay 24 | # --name Stands for checkpoint .pt and .log filename 25 | # --model Mixed precision supernet model, default is `VGGtiny_MixQ` 26 | # Then, the optimal bit width of each layer will converge after dozens of epochs, for example bitw={8,2,2,2,2,2}, bita = {8,3,3,3,6,3} 27 | 28 | 29 | # 2. Main train 30 | python main_train.py --bitw 822222 --bita 833363 --name vggtiny_cifar_cd3e5 31 | # Trained weights are under weights/vggtiny_cifar_cd3e5.pt 32 | 33 | 34 | # 3. Test model 35 | python test_acc.py 36 | # You can choose vggtiny_cifar_cd3e5.pt for the test if nothing went wrong 37 | 38 | # 4. HLS code generation: 39 | # You can now directly export the HLS configuration header and weight file from the .pt weight file. 40 | # First, adjust the `simd, pe` parallelization factors of each layer. 41 | vim hls/config_simd_pe.txt 42 | # Export `config.h` and `weights.hpp` to /hls/vggtiny_cifar_cd3e5/ 43 | python export_hls.py 44 | 45 | 46 | # 5. Model-Level Hardware Simulation 47 | # simulate_hw.py requires the /hls/vggtiny_cifar_cd3e5/model_param.pkl file generated by export_hls.py 48 | python simulate_hw.py 49 | # This output should be consistent with the hardware output or the HLS C-level simulation 50 | 51 | ``` 52 | 53 | ## DAC-SDC Object Detection Model 54 | 55 | The DAC System Design Contest focused on low-power object detection on an embedded FPGA system: https://www.dac.com/Conference/System-Design-Contest. 56 | 57 | The target of this contest is to optimize the performance of designs in terms of accuracy and power on an Ultra96-V2 FPGA board. The contest was held five times, from 2018 to 2022, and the performance of the winning designs increased from 30 fps to thousands of fps. 
58 | 59 | Base models for anypacking bitwidth search: 60 | 61 | - UltraNet: https://github.com/heheda365/ultra_net by the BJUT_runner team, 1st place of the 2020 DAC-SDC contest. UltraNet is a VGGNet-like model with far fewer parameters. UltraNet_iSmart, the 2nd place design of the 2021 DAC-SDC contest by the UIUC iSmart team, achieves much better throughput through fixed packing optimization. 62 | - UltraNet_Bypass: https://github.com/heymesut/SJTU_microe by SJTU (2021), 3rd place of the 2021 DAC-SDC contest. A variant of UltraNet with bypass connections. Bypass connections increase model accuracy, but make the design of a pipeline-based NN accelerator more difficult. 63 | - SkyNet: https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of the 2021 DAC-SDC contest. SkyNet is a MobileNet-like lightweight model. 64 | - SkyNetk5: SkyNet with a 5x5 depthwise convolution kernel. Since dwconv uses far fewer calculations than pwconv, a larger kernel brings higher accuracy at slight cost. 65 | 66 | Dataset: See https://byuccl.github.io/dac_sdc_2022/info/. 67 | 68 | **Usage**: First `cd dacsdc/`, then follow the next steps. 69 | 70 | ### 1) Hardware-aware Mixed Precision NAS for bit width 71 | 72 | ```bash 73 | # For UltraNet with mixed precision: 74 | python search_train.py --cd 1e-5 --name mix_ultranet_cd1e5 75 | 76 | # UltraNet with Bypass: 77 | python search_train.py --cd 1e-5 --name mix_ultranet_bypass_cd1e5 --model UltraNetBypass_MixQ 78 | 79 | # SkyNet/SkyNetk5 80 | python search_train.py --cd 1e-5 --name mix_skynet_cd1e5 --model [SkyNet_MixQ | SkyNetk5_MixQ] 81 | ``` 82 | 83 | ### 2) Main Train 84 | 85 | For UltraNet: 86 | ```bash 87 | # UltraNet_BJTU uses full 4-bit weight quantization 88 | python main_train.py --bitw 444444444 --bita 844444444 --name ultranet_BJTU 89 | 90 | # UltraNet_iSmart uses 4/8-bit mixed quantization for weights 91 | python main_train.py --bitw 844444448 --bita 844444444 --name ultranet_iSmart 92 | 93 | # Or use the searched bitw, bita from search_train.py 94 | python main_train.py --bitw <bitw> --bita <bita> --name ultranet_anypacking 95 | ``` 96 | For UltraNet_Bypass/SkyNet/SkyNetk5: 97 | ```bash 98 | python main_train.py --bitw <bitw> --bita <bita> --name <name> --model [UltraNet_Bypass | SkyNet | SkyNetk5] 99 | ``` 100 | 101 | ### 3) Test model 102 | 103 | ```bash 104 | python test.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]] 105 | ``` 106 | 107 | ### 4) HLS export 108 | ```bash 109 | # For UltraNet or UltraNet_Bypass 110 | python export_hls.py [--model UltraNet_Bypass_FixQ] 111 | # For SkyNet or SkyNetk5 112 | python export_hls.py [--model SkyNetk5_FixQ] 113 | ``` 114 | 115 | ### 5) Model-Level Hardware Simulation 116 | ```bash 117 | python simulate_hw.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]] 118 | ``` 119 | 120 | ## Reference 121 | - https://github.com/zhaoweicai/EdMIPS EdMIPS: Rethinking Differentiable Search for Mixed-Precision Neural Networks 122 | - https://github.com/kuangliu/pytorch-cifar Smaller models for the CIFAR dataset 123 | - https://github.com/ultralytics/yolov3.git YOLOv3 training framework 124 | - https://github.com/jiangwx/SkyNet SkyNet by SHTECH, winner of the 2019 DAC-SDC contest 125 | - https://github.com/jgoeders/dac_sdc_2020_designs Winning designs of the 2020 DAC-SDC contest 126 | - https://github.com/heheda365/ultra_net BJUT_runner team, 1st place of the 2020 DAC-SDC contest, UltraNet 127 | - https://github.com/jgoeders/dac_sdc_2021_designs Winning designs of the 2021 DAC-SDC contest 128 | - https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of the 2021 DAC-SDC contest, SkyNet 129 | - 
https://github.com/xliu0709/DACSDC2021 iSmart team, 2nd place of the 2021 DAC-SDC contest, UltraNet with an optimized packing method 130 | - https://github.com/heymesut/SJTU_microe 3rd place of the 2021 DAC-SDC contest by SJTU, a variant of UltraNet with bypass 131 | - https://github.com/jgoeders/dac_sdc_2022_designs Winning designs of the 2022 DAC-SDC contest 132 | - https://github.com/MatthewLuo7/InvolutionNet 3rd place of the 2022 DAC-SDC contest (ours), without the anypacking design 133 | 134 | ## License 135 | 136 | ![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg) ![License: AGPL](https://img.shields.io/badge/License-AGPL-red.svg) 137 | 138 | NOTE that directories in this repo have different licenses. 139 | 140 | The main code `anypacking/` and the example `cifar/` use the MIT license. However, because the DAC-SDC object detection model uses some code from YOLOv3, which is AGPL-licensed, the `dacsdc/` example is also AGPL-licensed. 141 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fffasttime/AnyPackingNet/1d740bf0071bec024a745adc3bcd31426b29f601/utils/__init__.py -------------------------------------------------------------------------------- /utils/torch_utils.py: 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def init_seeds(seed=0): 7 | torch.manual_seed(seed) 8 | 9 | # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html 10 | if seed == 0: 11 | torch.backends.cudnn.deterministic = True 12 | torch.backends.cudnn.benchmark = False 13 | 14 | 15 | def select_device(device='', apex=False, batch_size=None): 16 | # device = 'cpu' or '0' or '0,1,2,3' 17 | cpu_request = device.lower() == 'cpu' 18 | if device and not cpu_request: # if device requested other than 'cpu' 19 | os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable 20 | assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availability 21 | 22 | cuda = False if cpu_request else torch.cuda.is_available() 23 | if cuda: 24 | c = 1024 ** 2 # bytes to MB 25 | ng = torch.cuda.device_count() 26 | # if ng > 1 and batch_size: # check that batch_size is compatible with device_count 27 | # assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng) 28 | x = [torch.cuda.get_device_properties(i) for i in range(ng)] 29 | s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex 30 | for i in range(0, ng): 31 | if i == 1: 32 | s = ' ' * len(s) 33 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 34 | (s, i, x[i].name, x[i].total_memory / c)) 35 | else: 36 | print('Using CPU') 37 | 38 | return torch.device('cuda:0' if cuda else 'cpu') 39 | 40 | 41 | def fuse_conv_and_bn(conv, bn): 42 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ ; folds BN into the conv: W_fused = diag(bn.weight / sqrt(bn.running_var + eps)) @ W_conv 43 | with torch.no_grad(): 44 | # init 45 | fusedconv = torch.nn.Conv2d(conv.in_channels, 46 | conv.out_channels, 47 | kernel_size=conv.kernel_size, 48 | stride=conv.stride, 49 | padding=conv.padding, 50 | bias=True) 51 | 52 | # prepare filters 53 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 54 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 55 | fusedconv.weight.copy_(torch.mm(w_bn, 
w_conv).view(fusedconv.weight.size())) 56 | 57 | # prepare spatial bias 58 | if conv.bias is not None: 59 | b_conv = conv.bias 60 | else: 61 | b_conv = torch.zeros(conv.weight.size(0)) 62 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 63 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) 64 | 65 | return fusedconv 66 | 67 | 68 | def model_info(model, report='summary'): 69 | # Plots a line-by-line description of a PyTorch model 70 | n_p = sum(x.numel() for x in model.parameters()) # number parameters 71 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients 72 | if report == 'full': 73 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) 74 | for i, (name, p) in enumerate(model.named_parameters()): 75 | name = name.replace('module_list.', '') 76 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' % 77 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) 78 | print('Model Summary: %g layers, %g parameters, %g gradients' % (len(list(model.parameters())), n_p, n_g)) 79 | 80 | 81 | def load_classifier(name='resnet101', n=2): 82 | # Loads a pretrained model reshaped to n-class output 83 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision 84 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet') 85 | 86 | # Display model properties 87 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']: 88 | print(x + ' =', eval(x)) 89 | 90 | # Reshape output to n classes 91 | filters = model.last_linear.weight.shape[1] 92 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n)) 93 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters)) 94 | model.last_linear.out_features = n 95 | return model 96 | 97 | log_layerid = 0 98 | def loglayer(x): 99 | global log_layerid 100 | import numpy as np 101 | x=x.numpy() 102 | assert np.issubdtype(x.dtype, np.integer) 103 | with open('_logs/test%d.txt'%log_layerid, 'w') as f: 104 | for i in range(x.shape[0]): 105 | print('C', i, file=f) 106 | for j in range(x.shape[1]): 107 | for k in range(x.shape[2]): 108 | print('%3d'%x[i,j,k], end=',', file=f) 109 | print(file=f) 110 | 111 | log_layerid+=1 112 | -------------------------------------------------------------------------------- /utils/view_pt.py: 1 | import os, sys 2 | from typing import Dict 3 | import argparse 4 | import torch 5 | import glob 6 | 7 | 8 | def select_weight_file(): 9 | files = glob.glob('weights/*.pt') 10 | if len(files) == 0: 11 | print('[Error] No pt file found in current folder') 12 | exit(1) 13 | for i, s in enumerate(files): 14 | print('', i, s) 15 | sel = int(input('Select one .pt file (0-%d): '%(len(files)-1))) 16 | return os.path.split(files[sel])[-1][:-3] 17 | 18 | if __name__=='__main__': 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-w', '--weight', type=str, default=None, help='weights path') 21 | opt = parser.parse_args() 22 | if opt.weight is None: opt.weight = select_weight_file() 23 | 24 | model: Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 25 | res = model['training_results'] 26 | print(res) 27 | 28 | if 'model_params' in model: 29 | print(model['model_params']) 30 | 31 | if 'extra' in model: 32 | print(model['extra']) 33 | 
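# Example usage (hypothetical checkpoint name): run this next to a weights/
# folder to print the stored training results plus, when present, the saved
# 'model_params' and 'extra' entries (e.g. the searched bitw/bita strings):
#   python view_pt.py -w mix_ultranet_cd1e5_last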
--------------------------------------------------------------------------------