├── .gitignore
├── LICENSE
├── __init__.py
├── anypacking
│   ├── dsp_packing.py
│   └── quant_module.py
├── cifar
│   ├── export_hls.py
│   ├── hls
│   │   └── config_simd_pe.txt
│   ├── main_train.py
│   ├── models.py
│   ├── search_train.py
│   ├── simulate_hw.py
│   ├── test_acc.py
│   └── train_normal.py
├── dacsdc
│   ├── datasets.py
│   ├── export_hls.py
│   ├── export_hls_skynet.py
│   ├── hls
│   │   └── config_simd_pe.txt
│   ├── main_train.py
│   ├── mymodel.py
│   ├── pareto_train.py
│   ├── quant_dorefa.py
│   ├── search_train.py
│   ├── simulate_hw.py
│   ├── test.py
│   ├── train_old.py
│   └── yolo_utils.py
├── readme.md
└── utils
    ├── __init__.py
    ├── torch_utils.py
    └── view_pt.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.vscode
2 | __pycache__
3 | *.pt
4 | test_result
5 | results
6 | /train_log
7 | /weights/*
8 | */hls/*
9 | !*/hls/config_simd_pe.txt
10 | localconfig.py
11 | /*.txt
12 | _logs
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Haitong Huang, Erjing Luo, Cheng Liu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fffasttime/AnyPackingNet/1d740bf0071bec024a745adc3bcd31426b29f601/__init__.py
--------------------------------------------------------------------------------
/anypacking/dsp_packing.py:
--------------------------------------------------------------------------------
1 | factors_k11=[
2 | [12,8,8,6,6,4,4],
3 | [10,8,6,6,4,4,4],
4 | [8,6,6,4,4,4,3],
5 | [6,6,4,4,4,4,2],
6 | [6,4,4,4,2,2,2],
7 | [4,4,4,4,2,2,2],
8 | [4,4,3,2,2,2,2],
9 | ]
10 |
11 | factors_k33=[
12 | [18,15,12,7.5,7.5,6,6],
13 | [15,12,7.5,6,6,6,3],
14 | [12,7.5,6,6,6,6,3],
15 | [9,6,6,6,6,3,3],
16 | [7.5,6,6,4.5,3,3,3],
17 | [6,6,4.5,3,3,3,2.25],
18 | [6,3,3,3,3,3,2],
19 | ]
20 |
21 | factors_k55=[
22 | [20,15,10,7.5,7.5,5,5],
23 | [12.5,10,6.67,5,5,5,3.33],
24 | [10,7.5,5,5,5,5,3.33],
25 | [7.5,6.67,5,5,5,3.33,3.33],
26 | [6.67,5,5,5,3.33,2.5,2.5],
27 | [5,5,5,3.33,2.5,2.5,2.5],
28 | [5,3.33,3.33,3.33,2.5,2.5,2],
29 | ]
--------------------------------------------------------------------------------
/anypacking/quant_module.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.nn.parameter import Parameter
7 |
8 | # load dsp packing factors
9 | from .dsp_packing import *
10 |
11 | gaussian_steps = {1: 1.596, 2: 0.996, 3: 0.586, 4: 0.336, 5: 0.190, 6: 0.106, 7: 0.059, 8: 0.032}
12 | hwgq_steps = {1: 0.799, 2: 0.538, 3: 0.3217, 4: 0.185, 5: 0.104, 6: 0.058, 7: 0.033, 8: 0.019}
13 |
14 | class _gauss_quantize_sym(torch.autograd.Function):
15 |
16 | @staticmethod
17 | def forward(ctx, x, step, bit):
18 | lvls = 2 ** bit / 2
19 | alpha = x.std().item()
20 | step *= alpha
21 | y = (torch.round(x/step+0.5)-0.5) * step
22 | thr = (lvls-0.5)*step
23 | y = y.clamp(min=-thr, max=thr)
24 | return y
25 |
26 | @staticmethod
27 | def backward(ctx, grad_output):
28 | return grad_output, None, None
29 |
30 |
31 | class _gauss_quantize_resclaed_step_sym(torch.autograd.Function):
32 |
33 | @staticmethod
34 | def forward(ctx, x, step, bit):
35 | lvls = 2 ** bit / 2
36 | y = (torch.round(x/step+0.5)-0.5) * step
37 | thr = (lvls-0.5)*step
38 | y = y.clamp(min=-thr, max=thr)
39 | return y
40 |
41 | @staticmethod
42 | def backward(ctx, grad_output):
43 | return grad_output, None, None
44 |
45 |
46 | class _gauss_quantize(torch.autograd.Function):
47 |
48 | @staticmethod
49 | def forward(ctx, x, step, bit):
50 | lvls = 2 ** bit / 2
51 | alpha = x.std().item()
52 | step *= alpha
53 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1) * step
54 | return y
55 |
56 | @staticmethod
57 | def backward(ctx, grad_output):
58 | return grad_output, None, None
59 |
60 | def _gauss_quantize_export(x, step, bit):
61 | lvls = 2 ** bit / 2
62 | alpha = x.std().item()
63 | step *= alpha
64 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1)
65 | return y.cpu().detach().int().numpy(), step
66 |
67 | class _gauss_quantize_resclaed_step(torch.autograd.Function):
68 |
69 | @staticmethod
70 | def forward(ctx, x, step, bit):
71 | lvls = 2 ** bit / 2
72 | y = torch.clamp(torch.round(x/step), -lvls, lvls-1) * step
73 | return y
74 |
75 | @staticmethod
76 | def backward(ctx, grad_output):
77 | return grad_output, None, None
78 |
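# --------------------------------------------------------------------------
# Added note (illustration, not part of the original file): the autograd
# Functions above are straight-through-estimator (STE) quantizers. forward()
# snaps a tensor onto a uniform grid whose step is the tabulated Gaussian
# step scaled by the tensor's std; backward() passes gradients through
# unchanged. A minimal sketch of what _gauss_quantize computes for bit=3:
#
#   import torch
#   w = torch.randn(64, 64)
#   bit = 3
#   step = gaussian_steps[bit] * w.std().item()   # rescaled per-tensor step
#   lvls = 2 ** bit / 2                           # 4 -> integer levels -4..3
#   q = torch.clamp(torch.round(w / step), -lvls, lvls - 1) * step
#
# so q equals _gauss_quantize.apply(w, gaussian_steps[bit], bit).
# --------------------------------------------------------------------------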
79 | class _hwgq(torch.autograd.Function):
80 |
81 | @staticmethod
82 | def forward(ctx, x, step):
83 | y = torch.round(x / step) * step
84 | return y
85 |
86 | @staticmethod
87 | def backward(ctx, grad_output):
88 | return grad_output, None
89 |
90 |
91 | class HWGQ(nn.Module):
92 | def __init__(self, bit=2):
93 | super(HWGQ, self).__init__()
94 | self.bit = bit
95 | if bit < 32:
96 | self.step = hwgq_steps[bit]
97 | else:
98 | self.step = None
99 |
100 | def forward(self, x):
101 | if self.bit >= 32:
102 | return x.clamp(min=0.0)
103 | lvls = float(2 ** self.bit - 1)
104 | clip_thr = self.step * lvls
105 | y = x.clamp(min=0.0, max=clip_thr)
106 | out = _hwgq.apply(y, self.step)
107 | return out
108 |
109 | class ImageInputQ(nn.Module):
110 | '''
111 | Assume image inputs are discrete values [0/256, 1/256, 2/256, ..., 255/256]
112 | '''
113 | def __init__(self, bit = 8):
114 | super(ImageInputQ, self).__init__()
115 | self.bit = bit
116 | self.step = 1/2**bit
117 |
118 | def forward(self, x):
119 | if self.bit >= 32: # full-precision input, bypass quantization
120 | return x
121 | out = torch.floor(x/self.step) * self.step # [!] There will be no gradient on x
122 | return out
123 |
124 | class QuantConv2d(nn.Conv2d):
125 |
126 | def __init__(self, *kargs, **kwargs):
127 | self.bit = kwargs.pop('bit', 1)
128 | super(QuantConv2d, self).__init__(*kargs, **kwargs)
129 | assert self.bit > 0
130 | self.step = None if self.bit==32 else gaussian_steps[self.bit]
131 |
132 | def forward(self, input):
133 | # quantized conv, otherwise regular
134 | if self.bit < 32:
135 | quant_weight = _gauss_quantize.apply(self.weight, self.step, self.bit)
136 | out = F.conv2d(
137 | input, quant_weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
138 | else:
139 | out = F.conv2d(
140 | input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
141 | return out
142 |
143 | def export_quant(self):
144 | return _gauss_quantize_export(self.weight, self.step, self.bit)
145 |
146 | class QuantLinear(nn.Linear):
147 |
148 | def __init__(self, *kargs, **kwargs):
149 | self.bit = kwargs.pop('bit', 1)
150 | super(QuantLinear, self).__init__(*kargs, **kwargs)
151 | assert self.bit > 0
152 | self.step = gaussian_steps[self.bit]
153 |
154 | def forward(self, input):
155 | # quantized linear, otherwise regular
156 | if self.bit < 32:
157 | # assert self.bias is None
158 | quant_weight = _gauss_quantize.apply(self.weight, self.step, self.bit)
159 | out = F.linear(input, quant_weight, self.bias)
160 | else:
161 | out = F.linear(input, self.weight, self.bias)
162 | return out
163 |
164 | def export_quant(self):
165 | return _gauss_quantize_export(self.weight, self.step, self.bit)
166 |
167 | class QuantActivConv2d(nn.Module):
168 |
169 | def __init__(self, inplane, outplane, wbit=1, abit=2, ActQ = HWGQ, **kwargs):
170 | super(QuantActivConv2d, self).__init__()
171 | self.abit = abit
172 | self.wbit = wbit
173 | self.activ = ActQ(abit)
174 | self.conv = QuantConv2d(inplane, outplane, bit=wbit, **kwargs)
175 | # complexities
176 | stride = kwargs['stride'] if 'stride' in kwargs else 1
177 | if isinstance(kwargs['kernel_size'], tuple):
178 | kernel_size = kwargs['kernel_size'][0] * kwargs['kernel_size'][1]
179 | else:
180 | kernel_size = kwargs['kernel_size'] * kwargs['kernel_size']
181 | self.kernel_size = kwargs['kernel_size']
182 | if 'groups' in kwargs: groups = kwargs['groups']
183 | else: groups = 1
184 | self.inplane = inplane
185 | self.outplane = outplane
186 | self.groups = groups
187 | self.param_size = inplane * outplane *
kernel_size * 1e-6 / groups 188 | self.filter_size = self.param_size / float(stride ** 2.0) 189 | self.register_buffer('size_product', torch.tensor(0, dtype=torch.float)) 190 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 191 | self.register_buffer('in_width', torch.tensor(0, dtype=torch.float)) 192 | 193 | def forward(self, input): 194 | in_shape = input.shape 195 | tmp = torch.tensor(in_shape[1] * in_shape[2] * in_shape[3] * 1e-3, dtype=torch.float) 196 | self.memory_size.copy_(tmp) 197 | tmp = torch.tensor(self.filter_size * in_shape[-1] * in_shape[-2], dtype=torch.float) 198 | self.size_product.copy_(tmp) 199 | out = self.activ(input) 200 | tmp = torch.tensor(input.shape[3], dtype=torch.float) 201 | self.in_width.copy_(tmp) 202 | ## print('ii',input[0,0,:,0]/self.activ.step) 203 | ## print('convi', torch.round(out[0,0,:,0]/self.activ.step).int()) 204 | ## wstd = self.conv.weight.std() 205 | out = self.conv(out) 206 | ## print('convo', torch.round(out[0,0,:,0]/(self.activ.step*self.conv.step*wstd)).int()) 207 | return out 208 | 209 | 210 | class QuantActivLinear(nn.Module): 211 | 212 | def __init__(self, inplane, outplane, wbit=1, abit=2, **kwargs): 213 | super(QuantActivLinear, self).__init__() 214 | self.abit = abit 215 | self.wbit = wbit 216 | self.activ = HWGQ(abit) 217 | self.linear = QuantLinear(inplane, outplane, bit=wbit, **kwargs) 218 | # complexities 219 | self.param_size = inplane * outplane * 1e-6 220 | self.register_buffer('size_product', torch.tensor(self.param_size, dtype=torch.float)) 221 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 222 | 223 | def forward(self, input): 224 | tmp = torch.tensor(input.shape[1] * 1e-3, dtype=torch.float) 225 | self.memory_size.copy_(tmp) 226 | out = self.activ(input) 227 | ## print('ii',input[0,0,:,0]/self.activ.step) 228 | ## print('lineari', torch.round(out[0,:]/self.activ.step).int()) 229 | ## wstd = self.linear.weight.std() 230 | out = self.linear(out) 231 | ## print('linearo', torch.round(out[0,:]/(self.activ.step*self.linear.step*wstd)).int()) 232 | return out 233 | 234 | 235 | class MixQuantActiv(nn.Module): 236 | 237 | def __init__(self, bits, ActQ = HWGQ): 238 | super(MixQuantActiv, self).__init__() 239 | self.bits = bits 240 | self.alpha_activ = Parameter(torch.Tensor(len(self.bits))) 241 | self.alpha_activ.data.fill_(0.01) 242 | self.mix_activ = nn.ModuleList() 243 | for bit in self.bits: 244 | self.mix_activ.append(ActQ(bit=bit)) 245 | 246 | def forward(self, input): 247 | outs = [] 248 | sw = F.softmax(self.alpha_activ, dim=0) 249 | for i, branch in enumerate(self.mix_activ): 250 | outs.append(branch(input) * sw[i]) 251 | activ = sum(outs) 252 | return activ 253 | 254 | 255 | class MixQuantConv2d(nn.Module): 256 | 257 | def __init__(self, inplane, outplane, bits, **kwargs): 258 | super(MixQuantConv2d, self).__init__() 259 | assert not kwargs['bias'] 260 | self.bits = bits 261 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 262 | self.alpha_weight.data.fill_(0.01) 263 | self.conv_list = nn.ModuleList() 264 | self.steps = [] 265 | for bit in self.bits: 266 | assert 0 < bit < 32 267 | self.conv_list.append(nn.Conv2d(inplane, outplane, **kwargs)) 268 | self.steps.append(gaussian_steps[bit]) 269 | 270 | def forward(self, input): 271 | mix_quant_weight = [] 272 | sw = F.softmax(self.alpha_weight, dim=0) 273 | for i, bit in enumerate(self.bits): 274 | weight = self.conv_list[i].weight 275 | weight_std = weight.std().item() 276 | step = self.steps[i] * weight_std 
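# (added note) Each branch i quantizes its own weight copy with the step for
# bit-width bits[i], and the branches are blended by the softmax weights `sw`:
# the mixed weight is a differentiable expectation over bit-width choices (a
# DARTS-style relaxation). SharedMixQuantConv2d below is the variant where all
# branches share one float weight tensor; fetch_best_arch keeps the argmax.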
277 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 278 | scaled_quant_weight = quant_weight * sw[i] 279 | mix_quant_weight.append(scaled_quant_weight) 280 | mix_quant_weight = sum(mix_quant_weight) 281 | conv = self.conv_list[0] 282 | out = F.conv2d( 283 | input, mix_quant_weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups) 284 | return out 285 | 286 | 287 | class SharedMixQuantConv2d(nn.Module): 288 | 289 | def __init__(self, inplane, outplane, bits, **kwargs): 290 | super(SharedMixQuantConv2d, self).__init__() 291 | # assert not kwargs['bias'] 292 | self.bits = bits 293 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 294 | self.alpha_weight.data.fill_(0.01) 295 | self.conv = nn.Conv2d(inplane, outplane, **kwargs) 296 | self.steps = [] 297 | for bit in self.bits: 298 | assert 0 < bit < 32 299 | self.steps.append(gaussian_steps[bit]) 300 | 301 | def forward(self, input): 302 | mix_quant_weight = [] 303 | sw = F.softmax(self.alpha_weight, dim=0) 304 | conv = self.conv 305 | weight = conv.weight 306 | # save repeated std computation for shared weights 307 | weight_std = weight.std().item() 308 | for i, bit in enumerate(self.bits): 309 | step = self.steps[i] * weight_std 310 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 311 | scaled_quant_weight = quant_weight * sw[i] 312 | mix_quant_weight.append(scaled_quant_weight) 313 | mix_quant_weight = sum(mix_quant_weight) 314 | out = F.conv2d( 315 | input, mix_quant_weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups) 316 | return out 317 | 318 | 319 | class MixActivConv2d(nn.Module): 320 | 321 | def __init__(self, inplane, outplane, wbits=None, abits=None, share_weight=False, ActQ = HWGQ, **kwargs): 322 | super(MixActivConv2d, self).__init__() 323 | if wbits is None: 324 | self.wbits = [1, 2] 325 | else: 326 | self.wbits = wbits 327 | if abits is None: 328 | self.abits = [1, 2] 329 | else: 330 | self.abits = abits 331 | # build mix-precision branches 332 | self.mix_activ = MixQuantActiv(self.abits, ActQ = ActQ) 333 | self.share_weight = share_weight 334 | if share_weight: 335 | self.mix_weight = SharedMixQuantConv2d(inplane, outplane, self.wbits, **kwargs) 336 | else: 337 | self.mix_weight = MixQuantConv2d(inplane, outplane, self.wbits, **kwargs) 338 | # complexities 339 | stride = kwargs['stride'] if 'stride' in kwargs else 1 340 | if isinstance(kwargs['kernel_size'], tuple): 341 | kernel_size = kwargs['kernel_size'][0] * kwargs['kernel_size'][1] 342 | else: 343 | kernel_size = kwargs['kernel_size'] * kwargs['kernel_size'] 344 | self.kernel_size = kwargs['kernel_size'] 345 | 346 | if 'groups' in kwargs: groups = kwargs['groups'] 347 | else: groups = 1 348 | self.inplane = inplane 349 | self.outplane = outplane 350 | self.groups = groups 351 | self.param_size = inplane * outplane * kernel_size * 1e-6 / groups 352 | self.filter_size = self.param_size / float(stride ** 2.0) 353 | self.register_buffer('size_product', torch.tensor(0, dtype=torch.float)) 354 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 355 | self.register_buffer('in_width', torch.tensor(0, dtype=torch.float)) 356 | 357 | def forward(self, input): 358 | in_shape = input.shape 359 | tmp = torch.tensor(in_shape[1] * in_shape[2] * in_shape[3] * 1e-3, dtype=torch.float) 360 | self.memory_size.copy_(tmp) 361 | tmp = torch.tensor(self.filter_size * in_shape[-1] * in_shape[-2], dtype=torch.float) 362 | self.size_product.copy_(tmp) 363 | tmp = 
torch.tensor(input.shape[3], dtype=torch.float) 364 | self.in_width.copy_(tmp) 365 | out = self.mix_activ(input) 366 | out = self.mix_weight(out) 367 | return out 368 | 369 | def complexity_loss_trivial(self): 370 | sw = F.softmax(self.mix_activ.alpha_activ, dim=0) 371 | mix_abit = 0 372 | abits = self.mix_activ.bits 373 | for i in range(len(abits)): 374 | mix_abit += sw[i] * abits[i] 375 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 376 | mix_wbit = 0 377 | wbits = self.mix_weight.bits 378 | for i in range(len(wbits)): 379 | mix_wbit += sw[i] * wbits[i] 380 | complexity = self.size_product.item() * mix_abit * mix_wbit 381 | return complexity 382 | 383 | def complexity_loss(self): 384 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 385 | abits = self.mix_activ.bits 386 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 387 | mix_scale = 0 388 | wbits = self.mix_weight.bits 389 | 390 | if self.kernel_size == 1: 391 | factors = factors_k11 392 | elif self.kernel_size == 3: 393 | factors = factors_k33 394 | elif self.kernel_size == 5: 395 | factors = factors_k55 396 | else: 397 | raise NotImplementedError 398 | for i in range(len(wbits)): 399 | for j in range(len(abits)): 400 | mix_scale += sw[i] * sa[j] / factors[wbits[i]-2][abits[j]-2] 401 | complexity = self.size_product.item() * 64 * mix_scale 402 | return complexity 403 | 404 | def bram_loss(self): 405 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 406 | abits = self.mix_activ.bits 407 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 408 | wbits = self.mix_weight.bits 409 | 410 | if self.kernel_size == 1: 411 | bram_sw = 2 * self.in_width.item() * self.inplane 412 | else: # sliding window size 413 | bram_sw = (self.kernel_size+1)*self.in_width.item()*self.inplane 414 | bram_sw *= 1e-3 415 | 416 | mix_wbit, mix_abit = 0, 0 417 | for i in range(len(wbits)): 418 | mix_wbit += sw[i] * wbits[i] 419 | for i in range(len(abits)): 420 | mix_abit += sa[i] * abits[i] 421 | 422 | bram_weight = self.param_size * 1e3 * mix_wbit # kbit 423 | bram_cache = bram_sw * mix_abit # kbit 424 | 425 | bram = (bram_weight + bram_cache) * 64 426 | return bram 427 | 428 | def fetch_best_arch(self, layer_idx): 429 | size_product = float(self.size_product.cpu().numpy()) 430 | memory_size = float(self.memory_size.cpu().numpy()) 431 | prob_activ = F.softmax(self.mix_activ.alpha_activ, dim=0) 432 | prob_activ = prob_activ.detach().cpu().numpy() 433 | best_activ = prob_activ.argmax() 434 | mix_abit = 0 435 | abits = self.mix_activ.bits 436 | for i in range(len(abits)): 437 | mix_abit += prob_activ[i] * abits[i] 438 | prob_weight = F.softmax(self.mix_weight.alpha_weight, dim=0) 439 | prob_weight = prob_weight.detach().cpu().numpy() 440 | best_weight = prob_weight.argmax() 441 | mix_wbit = 0 442 | wbits = self.mix_weight.bits 443 | for i in range(len(wbits)): 444 | mix_wbit += prob_weight[i] * wbits[i] 445 | if self.share_weight: 446 | weight_shape = list(self.mix_weight.conv.weight.shape) 447 | else: 448 | weight_shape = list(self.mix_weight.conv_list[0].weight.shape) 449 | 450 | if self.kernel_size == 1: 451 | bram_sw = 2 * self.in_width.item() * self.inplane 452 | else: 453 | bram_sw = (self.kernel_size+1)*self.in_width.item()*self.inplane*self.outplane/self.groups 454 | bram_sw *= 1e-3 455 | 456 | print('idx {} with shape {}, activ alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 457 | 'memory: {:.3f}K * {:.3f}, cache: {:.3f}K'.format(layer_idx, weight_shape, prob_activ, size_product, 458 | mix_abit, mix_wbit, memory_size, mix_abit, bram_sw)) 459 
| print('idx {} with shape {}, weight alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 460 | 'param: {:.3f}M * {:.3f}'.format(layer_idx, weight_shape, prob_weight, size_product, 461 | mix_abit, mix_wbit, self.param_size, mix_wbit)) 462 | best_arch = {'best_activ': [best_activ], 'best_weight': [best_weight]} 463 | bitops = size_product * abits[best_activ] * wbits[best_weight] 464 | bita = memory_size * abits[best_activ] 465 | bitw = self.param_size * wbits[best_weight] 466 | 467 | if self.kernel_size == 1: 468 | factors = factors_k11 469 | elif self.kernel_size == 3: 470 | factors = factors_k33 471 | elif self.kernel_size == 5: 472 | factors = factors_k55 473 | else: 474 | raise NotImplementedError 475 | dsps = size_product / factors[wbits[best_weight]-2][abits[best_activ]-2] 476 | mixbitops = size_product * mix_abit * mix_wbit 477 | mixbita = memory_size * mix_abit 478 | mixbitw = self.param_size * mix_wbit 479 | mixdsps = 0 480 | for i in range(len(wbits)): 481 | for j in range(len(abits)): 482 | mixdsps += prob_weight[i] * prob_activ[j] / factors[wbits[i]-2][abits[j]-2] 483 | mixdsps *= size_product 484 | mixbram_weight = self.param_size * 1e3 * mix_wbit # kbit 485 | mixbram_cache = bram_sw * mix_abit # kbit 486 | 487 | return best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache 488 | 489 | 490 | class SharedMixQuantLinear(nn.Module): 491 | 492 | def __init__(self, inplane, outplane, bits, **kwargs): 493 | super(SharedMixQuantLinear, self).__init__() 494 | # assert not kwargs['bias'] 495 | self.bits = bits 496 | self.alpha_weight = Parameter(torch.Tensor(len(self.bits))) 497 | self.alpha_weight.data.fill_(0.01) 498 | self.linear = nn.Linear(inplane, outplane, **kwargs) 499 | self.steps = [] 500 | for bit in self.bits: 501 | assert 0 < bit < 32 502 | self.steps.append(gaussian_steps[bit]) 503 | 504 | def forward(self, input): 505 | mix_quant_weight = [] 506 | sw = F.softmax(self.alpha_weight, dim=0) 507 | linear = self.linear 508 | weight = linear.weight 509 | # save repeated std computation for shared weights 510 | weight_std = weight.std().item() 511 | for i, bit in enumerate(self.bits): 512 | step = self.steps[i] * weight_std 513 | quant_weight = _gauss_quantize_resclaed_step.apply(weight, step, bit) 514 | scaled_quant_weight = quant_weight * sw[i] 515 | mix_quant_weight.append(scaled_quant_weight) 516 | mix_quant_weight = sum(mix_quant_weight) 517 | out = F.linear(input, mix_quant_weight, linear.bias) 518 | return out 519 | 520 | class MixActivLinear(nn.Module): 521 | def __init__(self, inplane, outplane, wbits=None, abits=None, share_weight=True, **kwargs): 522 | super(MixActivLinear, self).__init__() 523 | if wbits is None: 524 | self.wbits = [1, 2] 525 | else: 526 | self.wbits = wbits 527 | if abits is None: 528 | self.abits = [1, 2] 529 | else: 530 | self.abits = abits 531 | # build mix-precision branches 532 | self.mix_activ = MixQuantActiv(self.abits) 533 | assert share_weight 534 | self.share_weight = share_weight 535 | self.mix_weight = SharedMixQuantLinear(inplane, outplane, self.wbits, **kwargs) 536 | # complexities 537 | self.param_size = inplane * outplane * 1e-6 538 | self.register_buffer('size_product', torch.tensor(self.param_size, dtype=torch.float)) 539 | self.register_buffer('memory_size', torch.tensor(0, dtype=torch.float)) 540 | 541 | def forward(self, input): 542 | tmp = torch.tensor(input.shape[1] * 1e-3, dtype=torch.float) 543 | self.memory_size.copy_(tmp) 544 | out = self.mix_activ(input) 545 | out = 
self.mix_weight(out) 546 | return out 547 | 548 | def complexity_loss_old(self): 549 | sw = F.softmax(self.mix_activ.alpha_activ, dim=0) 550 | mix_abit = 0 551 | abits = self.mix_activ.bits 552 | for i in range(len(abits)): 553 | mix_abit += sw[i] * abits[i] 554 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 555 | mix_wbit = 0 556 | wbits = self.mix_weight.bits 557 | for i in range(len(wbits)): 558 | mix_wbit += sw[i] * wbits[i] 559 | complexity = self.size_product.item() * mix_abit * mix_wbit 560 | return complexity 561 | 562 | def complexity_loss(self): 563 | sa = F.softmax(self.mix_activ.alpha_activ, dim=0) 564 | abits = self.mix_activ.bits 565 | sw = F.softmax(self.mix_weight.alpha_weight, dim=0) 566 | mix_scale = 0 567 | wbits = self.mix_weight.bits 568 | for i in range(len(wbits)): 569 | for j in range(len(abits)): 570 | mix_scale += sw[i] * sa[j] / factors_k11[wbits[i]-2][abits[j]-2] 571 | complexity = self.size_product.item() * 64 * mix_scale 572 | return complexity 573 | 574 | def fetch_best_arch(self, layer_idx): 575 | size_product = float(self.size_product.cpu().numpy()) 576 | memory_size = float(self.memory_size.cpu().numpy()) 577 | prob_activ = F.softmax(self.mix_activ.alpha_activ, dim=0) 578 | prob_activ = prob_activ.detach().cpu().numpy() 579 | best_activ = prob_activ.argmax() 580 | mix_abit = 0 581 | abits = self.mix_activ.bits 582 | for i in range(len(abits)): 583 | mix_abit += prob_activ[i] * abits[i] 584 | prob_weight = F.softmax(self.mix_weight.alpha_weight, dim=0) 585 | prob_weight = prob_weight.detach().cpu().numpy() 586 | best_weight = prob_weight.argmax() 587 | mix_wbit = 0 588 | wbits = self.mix_weight.bits 589 | for i in range(len(wbits)): 590 | mix_wbit += prob_weight[i] * wbits[i] 591 | weight_shape = list(self.mix_weight.linear.weight.shape) 592 | print('idx {} with shape {}, activ alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 593 | 'memory: {:.3f}K * {:.3f}'.format(layer_idx, weight_shape, prob_activ, size_product, 594 | mix_abit, mix_wbit, memory_size, mix_abit)) 595 | print('idx {} with shape {}, weight alpha: {}, comp: {:.3f}M * {:.3f} * {:.3f}, ' 596 | 'param: {:.3f}M * {:.3f}'.format(layer_idx, weight_shape, prob_weight, size_product, 597 | mix_abit, mix_wbit, self.param_size, mix_wbit)) 598 | best_arch = {'best_activ': [best_activ], 'best_weight': [best_weight]} 599 | bitops = size_product * abits[best_activ] * wbits[best_weight] 600 | bita = memory_size * abits[best_activ] 601 | bitw = self.param_size * wbits[best_weight] 602 | dsps = size_product / factors_k11[wbits[best_weight]-2][abits[best_activ]-2] 603 | mixbitops = size_product * mix_abit * mix_wbit 604 | mixbita = memory_size * mix_abit 605 | mixbitw = self.param_size * mix_wbit 606 | mixdsps = 0 607 | for i in range(len(wbits)): 608 | for j in range(len(abits)): 609 | mixdsps += prob_weight[i] * prob_activ[j] / factors_k11[wbits[i]-2][abits[j]-2] 610 | mixdsps *= size_product 611 | return best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps 612 | -------------------------------------------------------------------------------- /cifar/export_hls.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from typing import Dict, List 4 | import torch 5 | import numpy as np 6 | import sys 7 | import os 8 | 9 | import sys 10 | sys.path.append('..') 11 | import models 12 | from utils.view_pt import select_weight_file 13 | from anypacking.quant_module import HWGQ, QuantConv2d, ImageInputQ, QuantLinear 14 | 15 
| class ConvParam: ... 16 | 17 | def write_hls_config(model_param, path): 18 | name_mapping = { 19 | 'k': 'K', 20 | #'s': 'S', 21 | #'p': 'P', 22 | 'ich': 'IFM_CH', 23 | 'irow': 'IFM_ROW', 24 | 'icol': 'IFM_COL', 25 | 'och': 'OFM_CH', 26 | 'orow': 'OFM_ROW', 27 | 'ocol': 'OFM_COL', 28 | 'abit': 'IN_BIT', 29 | 'wbit': 'W_BIT', 30 | 'incbit': 'INC_BIT', 31 | 'biasbit': 'BIAS_BIT', 32 | 'simd': 'SIMD', 33 | 'pe': 'PE', 34 | 'lshift': 'L_SHIFT' 35 | } 36 | content = f'''/******************************************************************************** 37 | * Filename: config.h 38 | * Date: {time.ctime()} 39 | * Description: This file is generated by {parser.prog} 40 | * ptfilename: {opt.weight} 41 | ********************************************************************************/ 42 | 43 | #ifndef _CONFIG_H_ 44 | #define _CONFIG_H_ 45 | 46 | ''' 47 | for n, conv_param in enumerate(model_param): 48 | content += f'// {conv_param.type}_{n}\n' 49 | for k, v in name_mapping.items(): 50 | if hasattr(conv_param, k): # e.g. conv_last has no incbit 51 | content += f'#define {conv_param.type.upper()}_{n}_{v} {getattr(conv_param, k)}\n' 52 | content += '\n' 53 | content += '#endif' 54 | 55 | with open(path + 'config.h', 'w') as f: 56 | print(content, file=f) 57 | 58 | def extract_model(in_shape): 59 | model_param: List[ConvParam] = [] 60 | feature_map_shape = in_shape 61 | conv_cnt = 0 62 | conv_cur = None 63 | for sub_module in model.modules(): 64 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 65 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 66 | print(' Detected ActQ Layer', end='') 67 | if conv_cur is None: conv_cur = ConvParam() 68 | 69 | conv_cur.abit = sub_module.bit 70 | conv_cur.astep = sub_module.step 71 | 72 | conv_cur.actq_class = type(sub_module).__name__ 73 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 74 | 75 | if conv_cnt: # previous.obit = cur.abit 76 | model_param[conv_cnt-1].obit = conv_cur.abit 77 | model_param[conv_cnt-1].ostep = conv_cur.astep 78 | 79 | elif isinstance(sub_module, torch.nn.Conv2d): 80 | if conv_cur is None: conv_cur = ConvParam() 81 | conv_cur.n = conv_cnt 82 | print('Extract conv_%d'%conv_cnt, end='') 83 | 84 | conv_cur.k = sub_module.kernel_size[0] 85 | conv_cur.s = sub_module.stride[0] 86 | conv_cur.p = sub_module.padding[0] 87 | conv_cur.ich = sub_module.in_channels 88 | conv_cur.och = sub_module.out_channels 89 | conv_cur.irow = feature_map_shape[1] 90 | conv_cur.icol = feature_map_shape[2] 91 | 92 | feature_map_shape[0] = sub_module.out_channels 93 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 94 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 95 | conv_cur.orow = feature_map_shape[1] 96 | conv_cur.ocol = feature_map_shape[2] 97 | 98 | assert sub_module.bias is None, 'inner conv has no bias in this model' 99 | if isinstance(sub_module, QuantConv2d): # New quant 100 | conv_cur.wbit = sub_module.bit 101 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 102 | else: 103 | raise NotImplementedError(sub_module) 104 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur))) 105 | 106 | conv_cur.type = 'conv' 107 | model_param.append(conv_cur) 108 | conv_cur = None 109 | conv_cnt 
+= 1
110 |
111 | elif isinstance(sub_module, torch.nn.Linear):
112 | if conv_cur is None: conv_cur = ConvParam() # TODO: independent type for linear layer
113 | conv_cur.n = conv_cnt
114 | print('Extract layer %d (linear layer)'%conv_cnt, end='')
115 |
116 | conv_cur.ich = sub_module.in_features
117 | conv_cur.och = sub_module.out_features
118 | conv_cur.irow = feature_map_shape[1]
119 | conv_cur.icol = feature_map_shape[2]
120 |
121 | if sub_module.bias is not None:
122 | conv_cur.convbias = sub_module.bias.detach().numpy()
123 | print(', +bias', end='')
124 |
125 | if isinstance(sub_module, QuantLinear): # New quant
126 | conv_cur.wbit = sub_module.bit
127 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantLinear.step because of alpha
128 |
129 | print(', ich {ich}, och {och}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur)))
130 |
131 | conv_cur.type = 'linear'
132 | model_param.append(conv_cur)
133 | conv_cur = None
134 | conv_cnt += 1
135 |
136 | elif isinstance(sub_module, torch.nn.BatchNorm2d):
137 | print(' Detected BatchNorm2d')
138 | gamma = sub_module.weight
139 | beta = sub_module.bias
140 | mean = sub_module.running_mean
141 | var = sub_module.running_var
142 | eps = sub_module.eps
143 |
144 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy()
145 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy()
146 |
147 | elif isinstance(sub_module, torch.nn.MaxPool2d):
148 | print(' Detected MaxPool2d')
149 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size
150 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size
151 |
152 | assert hasattr(model_param[0], 'abit')
153 |
154 | return model_param
155 |
156 | def process_batchnorm(model_param):
157 | '''process_batchnorm(model_param)
158 | Merge wstep, astep, ostep scale into batchnorm, then quantize.
159 |
160 | Method:
161 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b,
162 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep.
163 |
164 | outq = (MAC*BN_w + BN_b) / ostep
165 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep
166 | = MACq * inc_raw + bias_raw
167 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq))
168 |
169 | Quantization of inc_raw & bias_raw:
170 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T
171 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor
172 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!] the 2**(T-1) bias is done by hls code
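    Example (illustrative numbers, not taken from a real run):
        wbit=4, abit=4, lshift=16  =>  T = 16+4+4-1 = 23
        inc_raw = 0.003   =>  inc  = round(0.003*2**23)  = 25166      (incbit = 16)
        bias_raw = -1.25  =>  bias = round(-1.25*2**23)  = -10485760  (biasbit = 25)
        outq ~= (MACq*25166 - 10485760 + 2**22) >> 23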
173 |
174 | Params:
175 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal
176 | MBIT = wbit+abit+ceil(log2(sum_number))
177 | incbit = len(bit(inc)); biasbit = len(bit(bias))
178 | larger lshift is better, but MBIT+incbit<48
179 | '''
180 | lshift = 16
181 |
182 | for conv in model_param[:-1]:
183 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ')
184 |
185 | # Merge step to BN
186 | conv.lshift = lshift
187 | MACstep = conv.wstep * conv.astep
188 | ostep = conv.ostep
189 | inc_raw = conv.bn_w * MACstep / ostep
190 | bias_raw = conv.bn_b / ostep
191 | conv.inc_raw = inc_raw
192 | conv.bias_raw = bias_raw
193 |
194 | # Quantization
195 | T = lshift+conv.wbit+conv.abit-1
196 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64)
197 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64)
198 | conv.lshift_T = T
199 | # Get bitlength
200 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length()
201 | conv.incbit = bitlength(conv.inc)
202 | conv.biasbit = bitlength(conv.bias)
203 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}')
204 |
205 | conv_last = model_param[-1] # process lastbias
206 | conv_last.inc = None
207 | conv_last.div = 1/(conv_last.wstep * conv_last.astep)
208 | conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64)
209 | conv_last.biasbit = bitlength(conv_last.bias)
210 | print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}')
211 |
212 | def reorder_weight(model_param, layers_simd, layers_pe):
213 | '''reorder_weight(model_param)
214 | Reorder arrays for the HLS code.
215 | '''
216 |
217 | for conv in model_param:
218 | if conv.type == 'linear': #new reorder
219 | pe_l = 1
220 | simd_l = 1
221 | in_pe_l = 8
222 | w = conv.w.reshape(10, -1, 4, 4)
223 | w = w.reshape(10 // (2 * pe_l), pe_l, 2, 256 // in_pe_l, in_pe_l // simd_l, simd_l, 4, 4) #[OUT_CH/2PE, PE, 2, IN_CH/IN_PE, IN_PE/SIMD, SIMD, H, W]
224 | w = w.transpose(1, 6, 3, 7, 0, 4, 5, 2) #[PE, H, IN_CH/IN_PE, W, OUT_CH/2PE, IN_PE/SIMD, SIMD, 2]
225 | w = w.reshape(w.shape[0], w.shape[1], w.shape[2], w.shape[3], w.shape[4], w.shape[5], -1) #[PE, H, IN_CH/IN_PE, W, OUT_CH/2PE, IN_PE/SIMD, SIMD * 2]
226 | print(w.shape)
227 | conv.w = w
228 | continue
229 |
230 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='')
231 | conv.simd = layers_simd[conv.n]
232 | conv.pe = layers_pe[conv.n]
233 |
234 | # process batchnorm
235 | if conv.inc is not None:
236 | conv.inc = conv.inc.reshape(conv.och//conv.pe, conv.pe).T
237 | if conv.bias is not None:
238 | conv.bias = conv.bias.reshape(conv.och//conv.pe, conv.pe).T
239 |
240 | # process conv weight
241 | w = conv.w # [och, ich, kr, kc]
242 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}"
243 | assert conv.k*conv.ich%conv.simd == 0, f"conv_{conv.n}, ich {conv.ich}, k {conv.k}, simd {conv.simd}"
244 |
245 | # if conv.n==0: # first layer is different
246 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich]
247 | # else:
248 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich]
249 |
250 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*conv.ich//conv.simd, conv.simd)
251 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd]
252 | w = w.reshape(conv.pe, conv.k, -1, conv.simd) # hls format [pe, k, och/pe*k*ich/simd, simd]
253 |
254 | if conv.k == 1: # kernel size=1
255 | w = w.reshape(conv.pe, -1, conv.simd)
256 | print(' ->', w.shape)
257 |
258 | conv.w = w
259 |
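# (added worked example, shapes assumed) reorder_weight turns a conv weight
# [och, ich, kr, kc] into the HLS layout [pe, k, och/pe * k*ich/simd, simd].
# With och=64, ich=64, k=3, pe=8, simd=8:
#   [64, 64, 3, 3] --transpose(0,3,2,1)--> [64, 3, 3, 64]
#   --reshape--> [8, 8, 3, 24, 8] --transpose(1,2,0,3,4)--> [8, 3, 8, 24, 8]
#   --reshape--> [8, 3, 192, 8]
# i.e. each PE gets its own stream of SIMD-wide words, ordered kernel-column
# first, so write_hls_weights below can pack each size-8 row into one ap_uint.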
260 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0):
261 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop:
262 | print(str_func(arr), file=file, end='')
263 | return
264 | ends = '' if (len(arr.shape)==stop+1) else '\n'
265 | print('{', file=file, end='')
266 | for i, item in enumerate(arr):
267 | print_ndarray_recursion(item, str_func, file, stop)
268 | if i!=len(arr)-1: print(',', file=file, end=ends)
269 | print(ends+'}', file=file, end='')
270 |
271 | def write_hls_linearlayer(layer, f):
272 | n = layer.n
273 | print(f"// layer: {n}, wbit: {layer.wbit}", file=f)
274 | hex_str = lambda x: '"' + hex(x) + '"'
275 | print(f"const ap_int<{layer.wbit}> linear_{n}_w[{layer.och}][{layer.ich}]=", file=f)
276 | print_ndarray_recursion(layer.w, hex_str, f)
277 | print(';', file=f)
278 |
279 | if layer.bias is not None:
280 | print(f"const ap_int<{layer.biasbit}> linear_{n}_bias[{layer.och}]=", file=f)
281 | print_ndarray_recursion(layer.bias, hex_str, f)
282 | print(';', file=f)
283 |
284 | def write_hls_linearlayer_reorder(layer, d0, d1, d2, d3, d4, d5, d6, f):
285 | n = layer.n
286 | print(f"// layer: {n}, wbit: {layer.wbit}", file=f)
287 | hex_str = lambda x: '"' + hex(x) + '"'
288 | def pack1d_str(arr): # arr: 1d-array
289 | x = 0
290 | # print(arr.shape)
291 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation
292 | v = int(v) # use python bignumber, not np.int
293 | assert -1<<(layer.wbit-1) <= v < 1<<(layer.wbit-1)
294 | x = x * 2**layer.wbit + (v & (2**layer.wbit - 1))
295 | return '"' + hex(x) + '"'
296 | print(f"const ap_uint<{layer.wbit * d6}> linear_{n}_w[{d0}][{d1}][{d2}][{d3}][{d4}][{d5}]=", file=f)
297 | print_ndarray_recursion(layer.w, pack1d_str, f, stop=1)
298 | print(';', file=f)
299 |
300 | if layer.bias is not None:
301 | print(f"const ap_int<{layer.biasbit}> linear_{n}_bias[{layer.och}]=", file=f)
302 | print_ndarray_recursion(layer.bias, hex_str, f)
303 | print(';', file=f)
304 |
305 | def write_hls_weights(model_param, path):
306 | '''write_hls_weights(model_param, path)
307 | Write hls weights+inc+bias array code according to numpy shape.
308 | '''
309 | f = open(path + 'weights.hpp', 'w')
310 |
311 | print(f'''/********************************************************************************
312 | * Filename: weights.hpp
313 | * Date: {time.ctime()}
314 | * Description: This file is generated by {parser.prog}
315 | * ptfilename: {opt.weight}
316 | ********************************************************************************/
317 |
318 | #ifndef _WEIGHTS_HPP_
319 | #define _WEIGHTS_HPP_
320 | #include <ap_int.h>
321 | ''', file=f)
322 |
323 | for conv in model_param:
324 | if conv.type == 'linear':
325 | pe_pr = conv.w.shape[0]
326 | h_pr = conv.w.shape[1]
327 | inch_inpe_pr = conv.w.shape[2]
328 | w_pr = conv.w.shape[3]
329 | outch_2pe_pr = conv.w.shape[4]
330 | inpe_simd_pr = conv.w.shape[5]
331 | simd2_pr = conv.w.shape[6]
332 | write_hls_linearlayer_reorder(conv, pe_pr, h_pr, inch_inpe_pr, w_pr, outch_2pe_pr, inpe_simd_pr, simd2_pr, f)
333 | continue
334 |
335 | n = conv.n
336 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, wbit {conv.wbit}")
337 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, wbit: {conv.wbit}", file=f)
338 |
339 | # print conv weight, merge [SIMD] value into one ap_uint
340 | if conv.k>1:
341 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f)
342 | else:
343 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f)
344 | hex_str = lambda x: '"' + hex(x) + '"'
345 | def pack1d_str(arr): # arr: 1d-array
346 | x = 0
347 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation
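# (added example, values assumed) packing SIMD=3 values with wbit=4 and
# arr=[3, -2, 5]: the reversed iteration order is 5, -2, 3, giving
#   x = ((5 & 0xF)*16 + (-2 & 0xF))*16 + (3 & 0xF) = 0x5e3
# so the literal written to weights.hpp is "0x5e3" (an ap_uint<12>).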
348 | v = int(v) # use python bignumber, not np.int
349 | assert -1<<(conv.wbit-1) <= v < 1<<(conv.wbit-1)
350 | x = x * 2**conv.wbit + (v & (2**conv.wbit - 1))
351 | return '"' + hex(x) + '"'
352 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1)
353 | print(';', file=f)
354 |
355 | # print inc, bias
356 | if conv.inc is not None:
357 | print(f"const ap_int<{conv.incbit}> conv_{n}_inc[{conv.pe}][{conv.och//conv.pe}]=", file=f)
358 | print_ndarray_recursion(conv.inc, hex_str, f)
359 | print(';', file=f)
360 | if conv.bias is not None:
361 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.pe}][{conv.och//conv.pe}]=", file=f)
362 | print_ndarray_recursion(conv.bias, hex_str, f)
363 | print(';', file=f)
364 |
365 | print('#endif', file=f)
366 | f.close()
367 |
368 | def adjust_weight(model_param):
369 | # special_wa_bit = ((5,6), (7,3)) # These packings can't quantize to -2**(wbit-1)
370 | special_wa_bit = ((4, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (7, 2), (7, 3)) # These packings can't quantize to -2**(wbit-1)
371 | for conv in model_param:
372 | if (conv.wbit, conv.abit) in special_wa_bit:
373 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}')
374 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1)
375 |
376 | if __name__=='__main__':
377 | parser = argparse.ArgumentParser()
378 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/')
379 | parser.add_argument('-m', '--model', default='VGG_tiny_FixQ', help = 'model class name in models.py')
380 | parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe', help = '.txt file in ./hls/')
381 | opt = parser.parse_args()
382 | if opt.weight is None: opt.weight = select_weight_file()
383 |
384 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1)
385 | dir_output = 'hls/' + opt.weight + '/'
386 | if not os.path.exists(dir_output): os.makedirs(dir_output)
387 |
388 | # load model and state_dict
389 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu')
390 | model = getattr(models, opt.model)(**ptfile.setdefault('model_params', {}))
391 | model.load_state_dict(ptfile['model'], strict = False)
392 |
393 | # process
394 | model_param = extract_model([1, 32, 32])
395 | adjust_weight(model_param)
396 | process_batchnorm(model_param) # get bn param before write hls config
397 | torch.save(model_param, dir_output + 'model_param.pkl')
398 |
399 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1]) # get pe, simd param before write hls config
400 | write_hls_config(model_param, dir_output)
401 | write_hls_weights(model_param, dir_output)
402 |
--------------------------------------------------------------------------------
/cifar/hls/config_simd_pe.txt:
--------------------------------------------------------------------------------
1 | simd pe
2 | 3 4
3 | 8 8
4 | 8 8
5 | 8 8
6 | 8 8
7 | 8 8
8 |
--------------------------------------------------------------------------------
/cifar/main_train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import torch
5 | import torch.nn as nn
6 | import torchvision
7 | import torchvision.transforms as transforms
8 | import torch.optim as optim
9 | import numpy as np
10 | from tqdm import tqdm
11 |
12 | import sys
13 | sys.path.append('..')
14 |
15 | from localconfig import data_path
16 | import models
17 | from test_acc import test
18 | from utils import torch_utils
19 |
20 | transform_train = transforms.Compose([
21 | transforms.RandomCrop(32, padding=4),
22 | transforms.RandomHorizontalFlip(),
23 | transforms.ToTensor(),
24 | models.InputFactor(),
25 | ])
26 |
27 | trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,
28 | download=False,
transform=transform_train) 29 | classes = ('plane', 'car', 'bird', 'cat', 30 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 31 | 32 | 33 | def train(): 34 | torch_utils.init_seeds() 35 | 36 | model = models.VGG_tiny_FixQ(bitw = opt.bitw, bita = opt.bita) 37 | model.to(device) 38 | if opt.weights is not None: 39 | weights_file = 'weights/' + opt.weights + '.pt' 40 | chkpt = torch.load(weights_file, map_location=device) 41 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if 42 | model.state_dict()[k].numel() == v.numel()} 43 | model.load_state_dict(chkpt['model'], strict=False) 44 | 45 | results_file = 'results/%s.txt'%opt.name 46 | 47 | criterion = nn.CrossEntropyLoss() 48 | optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4) 49 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 50 | optimizer, T_max=opt.epochs, eta_min=opt.lr*0.01) 51 | 52 | model.train() 53 | 54 | start_epoch, epochs = 0, opt.epochs 55 | train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, shuffle=True, num_workers=2) 56 | test_best_acc = 0.0 57 | 58 | test(model, device) 59 | bops, bita, bitw, dsps = model.fetch_arch_info() 60 | print('model with bops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format(bops, bita, bitw, dsps)) 61 | 62 | for epoch in range(start_epoch, epochs): 63 | model.train() 64 | mloss = macc = 0. 65 | pbar = tqdm(enumerate(train_loader), total=len(train_loader)) 66 | for i, (inputs, labels) in pbar: 67 | inputs, labels = inputs.to(device), labels.to(device) 68 | 69 | optimizer.zero_grad() 70 | 71 | outputs = model(inputs) 72 | _, predicted = torch.max(outputs.data, 1) 73 | correct = (predicted == labels).sum().item() 74 | loss = criterion(outputs, labels) 75 | loss.backward() 76 | optimizer.step() 77 | 78 | mloss = (mloss*i + loss.item()) / (i+1) 79 | macc = (macc*i + correct/opt.batch_size) / (i+1) 80 | s = '%10s%10.2f%10.3g'%('%d/%d'%(epoch,epochs-1), macc*100, mloss) 81 | pbar.set_description(s) 82 | 83 | scheduler.step() 84 | results = test(model, device) 85 | with open(results_file, 'a') as f: 86 | f.write(s + '%10.2f%10.3g'% results + '\n') 87 | test_acc = results[0] 88 | test_best_acc = max(test_best_acc, test_acc) 89 | 90 | final_epoch = epoch == epochs-1 91 | if True or final_epoch: 92 | with open(results_file, 'r') as f: 93 | chkpt = {'epoch': epoch, 94 | 'training_results': f.read(), 95 | 'model': model.module.state_dict() if type( 96 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 97 | 'optimizer': None if final_epoch else optimizer.state_dict(), 98 | 'model_params': model.model_params, # arch param 99 | 'extra': {'time': time.ctime(), 'name': opt.name}} 100 | # Save last checkpoint 101 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 102 | 103 | if test_acc == test_best_acc: 104 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 105 | 106 | print('Finished Training') 107 | 108 | with open('results.csv', 'a') as f: 109 | print("fixed,%s,%d/%d, , ,%s,%s,%.1f,%.1f, , , ,%d, ,%.3f, "% 110 | (opt.name,epochs-1,epochs,opt.bitw,opt.bita,macc*100,(test_acc+test_best_acc)/2, 111 | int(round(bops)), dsps), file=f) 112 | 113 | # torch.save(net.state_dict(), 'lenet_cifar10.pth') 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('-n', '--name', default='VGG_tiny_FixQ', help='result and weight file name') 118 | parser.add_argument('-w', '--weights', default=None, help='weights path') 119 | parser.add_argument('-e', '--epochs', 
type=int, default=200) 120 | parser.add_argument('--batch-size', type=int, default=128) 121 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 122 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 123 | parser.add_argument('--lr', type=float, default=0.03) 124 | parser.add_argument('--mixm', type=str) 125 | parser.add_argument('--bitw', type=str, default='') 126 | parser.add_argument('--bita', type=str, default='') 127 | 128 | opt = parser.parse_args() 129 | 130 | if opt.mixm is not None: 131 | wmix = torch.load('weights/%s.pt'%opt.mixm) 132 | opt.bitw = wmix['extra']['bestw'] 133 | opt.bita = wmix['extra']['besta'] 134 | del wmix 135 | 136 | print(opt) 137 | 138 | wdir = 'weights' + os.sep # weights dir 139 | last = wdir + '%s_last.pt'%opt.name 140 | 141 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 142 | 143 | train() 144 | -------------------------------------------------------------------------------- /cifar/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from anypacking import quant_module as qm 5 | 6 | class InputFactor: 7 | def __call__(self, pic): 8 | return pic * 255.0 / 256.0 9 | 10 | class LeNet(nn.Module): 11 | def __init__(self): 12 | super(LeNet,self).__init__() 13 | conv=nn.Conv2d 14 | self.conv1 = conv(3,6,5) 15 | self.conv2 = conv(6,16,5) 16 | self.fc1 = nn.Linear(16*5*5,120) 17 | self.fc2 = nn.Linear(120,84) 18 | self.fc3 = nn.Linear(84,10) 19 | 20 | def forward(self,x): 21 | x = F.max_pool2d(F.relu(self.conv1(x)),(2,2)) 22 | x = F.max_pool2d(F.relu(self.conv2(x)),2) 23 | x = x.view(x.size()[0],-1) 24 | x = F.relu(self.fc1(x)) 25 | x = F.relu(self.fc2(x)) 26 | x = self.fc3(x) 27 | return x 28 | 29 | class VGG_small(nn.Module): 30 | def __init__(self, num_classes=10): 31 | super(VGG_small, self).__init__() 32 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 33 | self.nonlinear = nn.ReLU(inplace=True) 34 | 35 | self.layers = nn.Sequential( 36 | nn.Conv2d(3, 128, kernel_size=3, padding=1, bias=False), # 0 37 | nn.BatchNorm2d(128), 38 | self.nonlinear, 39 | 40 | nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False), # 1 41 | self.pooling, 42 | nn.BatchNorm2d(128), 43 | self.nonlinear, 44 | 45 | nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False), # 2 46 | nn.BatchNorm2d(256), 47 | self.nonlinear, 48 | 49 | nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False), # 3 50 | self.pooling, 51 | nn.BatchNorm2d(256), 52 | self.nonlinear, 53 | 54 | nn.Conv2d(256, 512, kernel_size=3, padding=1, bias=False), # 4 55 | nn.BatchNorm2d(512), 56 | self.nonlinear, 57 | 58 | nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False), # 5 59 | self.pooling, 60 | nn.BatchNorm2d(512), 61 | self.nonlinear, 62 | 63 | nn.Flatten(), 64 | nn.Linear(512*4*4, num_classes) 65 | ) 66 | 67 | 68 | def forward(self, x): 69 | return self.layers(x) 70 | 71 | class VGG_tiny(nn.Module): 72 | def __init__(self, num_classes=10): 73 | super(VGG_tiny, self).__init__() 74 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 75 | self.nonlinear = nn.ReLU(inplace=True) 76 | 77 | self.layers = nn.Sequential( 78 | nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False), # 0 79 | nn.BatchNorm2d(64), 80 | self.nonlinear, 81 | 82 | nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False), # 1 83 | self.pooling, 84 | nn.BatchNorm2d(64), 85 | self.nonlinear, 86 | 87 | nn.Conv2d(64, 128, 
kernel_size=3, padding=1, bias=False), # 2 88 | nn.BatchNorm2d(128), 89 | self.nonlinear, 90 | 91 | nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False), # 3 92 | self.pooling, 93 | nn.BatchNorm2d(128), 94 | self.nonlinear, 95 | 96 | nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False), # 4 97 | nn.BatchNorm2d(256), 98 | self.nonlinear, 99 | 100 | nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False), # 5 101 | self.pooling, 102 | nn.BatchNorm2d(256), 103 | self.nonlinear, 104 | 105 | nn.Flatten(), 106 | nn.Linear(256*4*4, num_classes) 107 | ) 108 | 109 | def forward(self, x): 110 | return self.layers(x) 111 | 112 | 113 | class VGG_tiny_MixQ(nn.Module): 114 | def __init__(self, num_classes=10, share_weight = True): 115 | super(VGG_tiny_MixQ, self).__init__() 116 | self.pooling = nn.MaxPool2d(kernel_size=2, stride=2) 117 | self.conv_func = qm.MixActivConv2d 118 | conv_func = self.conv_func 119 | 120 | conv_kwargs = {'kernel_size':3, 'stride':1, 'padding':1, 'bias':False} 121 | qspace = {'wbits':[2,3,4,5,6,7,8], 'abits':[2,3,4,5,6,7,8], 'share_weight': share_weight} 122 | 123 | self.layers = nn.Sequential( 124 | conv_func(3, 64, ActQ = qm.ImageInputQ, **conv_kwargs, **qspace), # 0 125 | nn.BatchNorm2d(64), 126 | 127 | conv_func(64, 64, **conv_kwargs, **qspace), # 1 128 | nn.BatchNorm2d(64), 129 | self.pooling, 130 | 131 | conv_func(64, 128, **conv_kwargs, **qspace), # 2 132 | nn.BatchNorm2d(128), 133 | 134 | conv_func(128, 128, **conv_kwargs, **qspace), # 3 135 | nn.BatchNorm2d(128), 136 | self.pooling, 137 | 138 | conv_func(128, 256, **conv_kwargs, **qspace), # 4 139 | nn.BatchNorm2d(256), 140 | 141 | conv_func(256, 256, **conv_kwargs, **qspace), # 5 142 | nn.BatchNorm2d(256), 143 | self.pooling, 144 | 145 | nn.Flatten(), 146 | qm.QuantActivLinear(256*4*4, num_classes, bias=True, wbit=8, abit=8) 147 | ) 148 | 149 | def forward(self, x): 150 | return self.layers(x) 151 | 152 | def fetch_best_arch(self): 153 | sum_bitops, sum_bita, sum_bitw, sum_dsps = 0, 0, 0, 0 154 | sum_mixbitops, sum_mixbita, sum_mixbitw, sum_mixdsps = 0, 0, 0, 0 155 | layer_idx = 0 156 | best_arch = None 157 | for m in self.modules(): 158 | if isinstance(m, self.conv_func): 159 | layer_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = m.fetch_best_arch(layer_idx) 160 | if best_arch is None: 161 | best_arch = layer_arch 162 | else: 163 | for key in layer_arch.keys(): 164 | if key not in best_arch: 165 | best_arch[key] = layer_arch[key] 166 | else: 167 | best_arch[key].append(layer_arch[key][0]) 168 | sum_bitops += bitops 169 | sum_bita += bita 170 | sum_bitw += bitw 171 | sum_mixbitops += mixbitops 172 | sum_mixbita += mixbita 173 | sum_mixbitw += mixbitw 174 | sum_dsps += dsps 175 | sum_mixdsps += mixdsps 176 | layer_idx += 1 177 | return best_arch, sum_bitops, sum_bita, sum_bitw, sum_mixbitops, sum_mixbita, sum_mixbitw, sum_dsps, sum_mixdsps 178 | 179 | def complexity_loss(self): 180 | size_product = [] 181 | loss = 0 182 | for m in self.modules(): 183 | if isinstance(m, self.conv_func): 184 | loss += m.complexity_loss() 185 | size_product += [m.size_product] 186 | normalizer = size_product[0].item() 187 | loss /= normalizer 188 | return loss 189 | 190 | def complexity_loss_trivial(self): 191 | size_product = [] 192 | loss = 0 193 | for m in self.modules(): 194 | if isinstance(m, self.conv_func): 195 | loss += m.complexity_loss_trivial() 196 | size_product += [m.size_product] 197 | normalizer = size_product[0].item() 198 | loss /= normalizer 199 | return 
loss
200 |
201 | class VGG_tiny_FixQ(nn.Module):
202 | def __init__(self, num_classes=10, bitw = '444444', bita = '844444'):
203 | super(VGG_tiny_FixQ, self).__init__()
204 | self.conv_func = qm.QuantActivConv2d
205 | conv_func = self.conv_func
206 |
207 | assert(len(bitw)==0 or len(bitw)==6)
208 | assert(len(bita)==0 or len(bita)==6)
209 | if isinstance(bitw, str):
210 | bitw=list(map(int, bitw))
211 | if isinstance(bita, str):
212 | bita=list(map(int, bita))
213 |
214 | self.bitw = bitw
215 | self.bita = bita
216 | self.model_params = {'bitw': bitw, 'bita': bita}
217 |
218 | conv_kwargs = {'kernel_size':3, 'stride':1, 'padding':1, 'bias':False}
219 |
220 | self.layers = nn.Sequential(
221 | conv_func(3, 64, ActQ = qm.ImageInputQ, **conv_kwargs, wbit=bitw[0], abit=bita[0]), # 0
222 | nn.BatchNorm2d(64),
223 |
224 | conv_func(64, 64, **conv_kwargs, wbit=bitw[1], abit=bita[1]), # 1
225 | nn.BatchNorm2d(64),
226 | nn.MaxPool2d(kernel_size=2, stride=2),
227 |
228 | conv_func(64, 128, **conv_kwargs, wbit=bitw[2], abit=bita[2]), # 2
229 | nn.BatchNorm2d(128),
230 |
231 | conv_func(128, 128, **conv_kwargs, wbit=bitw[3], abit=bita[3]), # 3
232 | nn.BatchNorm2d(128),
233 | nn.MaxPool2d(kernel_size=2, stride=2),
234 |
235 | conv_func(128, 256, **conv_kwargs, wbit=bitw[4], abit=bita[4]), # 4
236 | nn.BatchNorm2d(256),
237 |
238 | conv_func(256, 256, **conv_kwargs, wbit=bitw[5], abit=bita[5]), # 5
239 | nn.BatchNorm2d(256),
240 | nn.MaxPool2d(kernel_size=2, stride=2),
241 |
242 | nn.Flatten(),
243 | qm.QuantActivLinear(256*4*4, num_classes, bias=True, wbit=8, abit=8)
244 | )
245 |
246 | def forward(self, x):
247 | return self.layers(x)
248 |
249 | def fetch_arch_info(self):
250 | sum_bitops, sum_bita, sum_bitw, sum_dsps = 0, 0, 0, 0
251 | layer_idx = 0
252 | for m in self.modules():
253 | if isinstance(m, self.conv_func):
254 | size_product = m.size_product.item()
255 | memory_size = m.memory_size.item()
256 | bitops = size_product * m.abit * m.wbit
257 | bita = m.memory_size.item() * m.abit
258 | bitw = m.param_size * m.wbit
259 | dsps = size_product / qm.factors_k33[m.wbit-2][m.abit-2]
260 | weight_shape = list(m.conv.weight.shape)
261 | print('idx {} with shape {}, bitops: {:.3f}M * {} * {}, memory: {:.3f}K * {}, '
262 | 'param: {:.3f}M * {}, dsps: {:.3f}M'.format(layer_idx, weight_shape, size_product, m.abit,
263 | m.wbit, memory_size, m.abit, m.param_size, m.wbit, dsps))
264 | sum_bitops += bitops
265 | sum_bita += bita
266 | sum_bitw += bitw
267 | sum_dsps += dsps
268 | layer_idx += 1
269 | return sum_bitops, sum_bita, sum_bitw, sum_dsps
270 |
--------------------------------------------------------------------------------
/cifar/search_train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import torch
5 | import torch.nn as nn
6 | import torchvision
7 | import torchvision.transforms as transforms
8 | import torch.optim as optim
9 | import numpy as np
10 | from tqdm import tqdm
11 |
12 | import sys
13 | sys.path.append('..')
14 |
15 | from localconfig import data_path
16 | import models
17 | from utils import torch_utils
18 | from test_acc import test
19 |
20 | transform_train = transforms.Compose([
21 | transforms.RandomCrop(32, padding=4),
22 | transforms.RandomHorizontalFlip(),
23 | transforms.ToTensor(),
24 | models.InputFactor(),
25 | ])
26 |
27 | trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,
28 | download=False, transform=transform_train)
29 | classes = ('plane', 'car',
'bird', 'cat', 30 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 31 | 32 | 33 | def train(): 34 | torch_utils.init_seeds() 35 | 36 | model = models.VGG_tiny_MixQ(10, not opt.noshare) 37 | model.to(device) 38 | 39 | results_file = 'results/%s.txt'%opt.name 40 | 41 | criterion = nn.CrossEntropyLoss() 42 | 43 | params, alpha_params = [], [] 44 | for name, param in model.named_parameters(): 45 | if 'alpha' in name: 46 | alpha_params += [param] 47 | else: 48 | params += [param] 49 | optimizer = optim.SGD(params, lr=opt.lr, momentum=0.9, weight_decay=5e-4) 50 | arch_optimizer = torch.optim.SGD(alpha_params, opt.lra, momentum=0.9, weight_decay=5e-4) 51 | 52 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 53 | optimizer, T_max=opt.epochs, eta_min=opt.lr*0.01) 54 | arch_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 55 | arch_optimizer, T_max=opt.epochs, eta_min=opt.lr*0.3) 56 | 57 | model.train() 58 | 59 | start_epoch, epochs = 0, opt.epochs 60 | train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, shuffle=True, num_workers=2) 61 | test_best_acc = 0.0 62 | 63 | for epoch in range(start_epoch, epochs): 64 | model.train() 65 | mloss = macc = 0. 66 | pbar = tqdm(enumerate(train_loader), total=len(train_loader)) 67 | for i, (inputs, labels) in pbar: 68 | inputs, labels = inputs.to(device), labels.to(device) 69 | 70 | optimizer.zero_grad() 71 | arch_optimizer.zero_grad() 72 | 73 | outputs = model(inputs) 74 | _, predicted = torch.max(outputs.data, 1) 75 | correct = (predicted == labels).sum().item() 76 | loss = criterion(outputs, labels) 77 | 78 | if opt.complexity_decay != 0 or opt.complexity_decay_trivial!=0: 79 | loss_complexity = opt.complexity_decay * model.complexity_loss() + \ 80 | opt.complexity_decay_trivial * model.complexity_loss_trivial() 81 | loss += loss_complexity 82 | 83 | loss.backward() 84 | optimizer.step() 85 | arch_optimizer.step() 86 | 87 | mloss = (mloss*i + loss.item()) / (i+1) 88 | macc = (macc*i + correct/opt.batch_size) / (i+1) 89 | s = '%10s%10.2f%10.3g'%('%d/%d'%(epoch,epochs-1), macc*100, mloss) 90 | pbar.set_description(s) 91 | 92 | print('========= architecture =========') 93 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps = model.fetch_best_arch() 94 | print('best model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format( 95 | bitops, bita, bitw, dsps)) 96 | print('expected model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M'.format( 97 | mixbitops, mixbita, mixbitw, mixdsps)) 98 | bestw_str = "".join([str(x+2) for x in best_arch["best_weight"]]) 99 | besta_str = "".join([str(x+2) for x in best_arch["best_activ"]]) 100 | print(f'best_weight: {best_arch["best_weight"]}') 101 | print(f'best_activ: {best_arch["best_activ"]}') 102 | 103 | scheduler.step() 104 | arch_scheduler.step() 105 | 106 | results = test(model, device) 107 | with open(results_file, 'a') as f: 108 | f.write(s + '%10.2f%10.3g'% results + '\n') 109 | test_acc = results[0] 110 | test_best_acc = max(test_best_acc, test_acc) 111 | 112 | final_epoch = epoch == epochs-1 113 | if True or final_epoch: 114 | with open(results_file, 'r') as f: 115 | chkpt = {'epoch': epoch, 116 | 'training_results': f.read(), 117 | 'model': model.module.state_dict() if type( 118 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 119 | 'optimizer': None if final_epoch else optimizer.state_dict(), 120 | 'arch_optimizer': None if final_epoch else arch_optimizer.state_dict(), 121 | 'extra': 
{'time': time.ctime(), 'name': opt.name, 'bestw': bestw_str, 'besta': besta_str}} 122 | # Save last checkpoint 123 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 124 | 125 | if test_acc == test_best_acc: 126 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 127 | 128 | print('Finished Training') 129 | 130 | with open('results.csv', 'a') as f: 131 | print("mixed,%s,%d/%d, , , , ,%.1f,%.1f, ,%s,%s,%d,%d,%.3f,%.3f"% 132 | (opt.name,epochs-1,epochs,macc*100,(test_acc+test_best_acc)/2, 133 | bestw_str,besta_str, 134 | int(round(bitops)), int(round(mixbitops)), dsps, mixdsps), file=f) 135 | 136 | # torch.save(net.state_dict(), 'lenet_cifar10.pth') 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--epochs', type=int, default=40) 141 | parser.add_argument('--batch-size', type=int, default=128) 142 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 143 | parser.add_argument('--lr', type=float, default=0.1) 144 | parser.add_argument('--name', default='', help='result and weight file name') 145 | parser.add_argument('--noshare', action='store_true', help='no share weight') 146 | parser.add_argument('--complexity-decay', '--cd', default=0, type=float, metavar='W', help='complexity decay (default: 0)') 147 | parser.add_argument('--complexity-decay-trivial', '--cdt', default=0, type=float, metavar='W', help='complexity decay w/o hardware-aware') 148 | parser.add_argument('--lra', '--learning-rate-alpha', default=0.1, type=float, metavar='LR', help='initial alpha learning rate') 149 | 150 | opt = parser.parse_args() 151 | print(opt) 152 | wdir = 'weights' + os.sep # weights dir 153 | last = wdir + '%s_last.pt'%opt.name 154 | 155 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 156 | 157 | train() 158 | -------------------------------------------------------------------------------- /cifar/simulate_hw.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from export_hls import ConvParam 8 | from test_acc import testset 9 | from utils.view_pt import select_weight_file 10 | 11 | class QConvLayer: 12 | def __init__(self, conv_param): 13 | self.conv = conv_param 14 | self.w = torch.tensor(self.conv.w, dtype = torch.int64) 15 | 16 | def __call__(self, x: torch.Tensor, downsampling): 17 | if self.conv.icol < x.shape[-1]: # Maxpool. 
Note: Order of Maxpool and BN is IMPORTANT when BN.inc can be negative 18 | assert (self.conv.irow*2, self.conv.icol*2) == tuple(x.shape[2:]) 19 | x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 20 | 21 | if self.conv.type == 'linear': 22 | x = x.flatten(1) 23 | x = F.linear(x, self.w) 24 | x += self.conv.bias 25 | return x 26 | # print('convi', self.conv.n, x[0,0,:,:]) 27 | 28 | x = F.conv2d(x, self.w, bias=None, stride=self.conv.s, padding=self.conv.p) # [N, OCH, OROW, OCOL] 29 | # print('convo', self.conv.n, x[0,0,:,:]) 30 | #if downsampling: # Maxpool 31 | # x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 32 | och = x.shape[1] 33 | if True: 34 | if self.conv.inc is not None: 35 | inc_ch = self.conv.inc.reshape((1, och, 1, 1)) 36 | x *= inc_ch 37 | if hasattr(self.conv, 'bias'): 38 | bias_ch = self.conv.bias.reshape((1, och, 1, 1)) 39 | x += bias_ch 40 | 41 | # print('biaso', self.conv.n, x[0,0,:,:]/2**self.conv.lshift_T) 42 | if hasattr(self.conv, 'lshift'): 43 | x += 1 << self.conv.lshift_T-1 44 | x >>= self.conv.lshift_T 45 | 46 | else: ## no inc/bias quantization 47 | if self.conv.inc is not None: 48 | inc_ch = self.conv.inc_raw.reshape((1, och, 1, 1)) 49 | x *= inc_ch 50 | if hasattr(self.conv, 'bias'): 51 | bias_ch = self.conv.bias_raw.reshape((1, och, 1, 1)) 52 | x += bias_ch 53 | # print('biaso', self.conv.n, x[0,0,:,:]) 54 | x = torch.round(x).to(dtype = torch.int64) 55 | 56 | if hasattr(self.conv, 'obit'): 57 | x.clip_(0, 2**(self.conv.obit)-1) 58 | return x 59 | 60 | class HWModel: 61 | def __init__(self, model_param): 62 | self.layers = [QConvLayer(conv_param) for conv_param in model_param] 63 | 64 | def __call__(self, x): 65 | assert len(x.shape) == 4 and x.dtype == torch.int64 66 | img_size = x.shape[-2:] 67 | 68 | if self.layers[0].conv.abit<8: # ImageInputQ 69 | x=x>>(8-self.layers[0].conv.abit) 70 | 71 | for i, layer in enumerate(self.layers): 72 | x = layer(x, self.layers[i+1].conv.icol
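# --- Editor's sketch (not part of the original simulate_hw.py): the note in
# QConvLayer.__call__ above says the MaxPool/BN order matters. The reason: max()
# commutes with a positive per-channel scale, but not with a negative folded-BN
# scale (conv.inc can be negative), so the integer model must pool at the same
# point as the hardware. Minimal numeric check:
def _demo_maxpool_scale_order():
    import torch
    import torch.nn.functional as F
    x = torch.tensor([[[[1., 4.], [2., 3.]]]])  # 1x1x2x2 feature map
    inc = -1.0                                   # a negative per-channel BN scale
    pool_then_scale = F.max_pool2d(x, 2) * inc   # -> -4
    scale_then_pool = F.max_pool2d(x * inc, 2)   # -> -1
    assert not torch.equal(pool_then_scale, scale_then_pool)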
-------------------------------------------------------------------------------- /dacsdc/export_hls.py: -------------------------------------------------------------------------------- 65 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 66 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ) or isinstance(sub_module, activation_quantize_fn): 67 | print(' Detected ActQ Layer', end='') 68 | if conv_cur is None: conv_cur = ConvParam() 69 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 70 | conv_cur.abit = sub_module.bit 71 | conv_cur.astep = sub_module.step 72 | else: 73 | conv_cur.abit = sub_module.a_bit 74 | conv_cur.astep = 1/2**conv_cur.abit 75 | 76 | conv_cur.actq_class = type(sub_module).__name__ 77 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 78 | 79 | if conv_cnt: # previous.obit = cur.abit 80 | model_param[conv_cnt-1].obit = conv_cur.abit 81 | model_param[conv_cnt-1].ostep = conv_cur.astep 82 | 83 | elif isinstance(sub_module, torch.nn.Conv2d): 84 | if conv_cur is None: conv_cur = ConvParam() 85 | conv_cur.n = conv_cnt 86 | print('Extract conv_%d'%conv_cnt, end='') 87 | 88 | conv_cur.k = sub_module.kernel_size[0] 89 | conv_cur.s = sub_module.stride[0] 90 | conv_cur.p = sub_module.padding[0] 91 | conv_cur.ich = sub_module.in_channels 92 | conv_cur.och = sub_module.out_channels 93 | conv_cur.irow = feature_map_shape[1] 94 | conv_cur.icol = feature_map_shape[2] 95 | 96 | feature_map_shape[0] = sub_module.out_channels 97 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 98 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 99 | conv_cur.orow = feature_map_shape[1] 100 | conv_cur.ocol = feature_map_shape[2] 101 | 102 | if sub_module.bias is not None: 103 | conv_cur.convbias = sub_module.bias.detach().numpy() 104 | print(', +bias', end='') 105 | 106 | if isinstance(sub_module, QuantConv2d): # New quant 107 | conv_cur.wbit = sub_module.bit 108 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 109 | 110 | elif type(sub_module).__name__ == 'Conv2d_Q': # Old dorefa quant 111 | conv_cur.wbit = sub_module.w_bit 112 | conv_cur.wstep = 1/2**(conv_cur.wbit-1) 113 | weight = np.tanh(sub_module.weight.detach().numpy()) 114 | weight = weight / np.max(np.abs(weight)) 115 | n = 2**(conv_cur.wbit-1) 116 | weight_q = weight * n 117 | weight_q = np.clip(np.round(weight_q),-n, n-1) 118 | weight_q = weight_q.astype(np.int32) 119 | conv_cur.w = weight_q 120 | else: 121 | raise NotImplementedError(sub_module) 122 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur))) 123 | 124 | model_param.append(conv_cur) 125 | conv_cur = None 126 | conv_cnt += 1 127 | 128 | elif isinstance(sub_module, torch.nn.BatchNorm2d): 129 | print(' Detected BatchNorm2d') 130 | gamma = sub_module.weight 131 | beta = sub_module.bias 132 | mean = sub_module.running_mean 133 | var = sub_module.running_var 134 | eps = sub_module.eps 135 | 136 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy() 137 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy() 138 | 139 | elif isinstance(sub_module, torch.nn.MaxPool2d): 140 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size 141 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size 142 | model_param[-1].max_pool = True 143 | 144 | if not hasattr(model_param[0], 'abit'): # train code rescaled [0,255] to [0,1) by /256 default 145 | model_param[0].abit = 8 146 | if not hasattr(model_param[0], 'astep'): 147 | model_param[0].astep = 1/256 148 | 149 | return model_param 150 | 151 | def process_batchnorm(model_param): 152 | '''process_batchnorm(model_param) 153 | Merge wstep, astep, ostep scale into batchnorm, then quantize. 154 | 155 | Method: 156 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b, 157 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep. 158 | 159 | outq = (MAC*BN_w + BN_b) / ostep 160 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep 161 | = MACq * inc_raw + bias_raw 162 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq)) 163 | 164 | Quantization of inc_raw & bias_raw: 165 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T 166 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor 167 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!] the 2**(T-1) bias is done by hls code 168 | 169 | Params: 170 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal 171 | MBIT = wbit+abit+ceil(log2(sum_number)) 172 | incbit = len(bit(inc)); biasbit = len(bit(bias)) 173 | larger lshift is better, but MBIT+incbit<48 174 | ''' 175 | lshift = 16 176 | 177 | for conv in model_param[:-1]: 178 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ') 179 | 180 | # Merge step to BN 181 | conv.lshift = lshift 182 | MACstep = conv.wstep * conv.astep 183 | ostep = conv.ostep 184 | inc_raw = conv.bn_w * MACstep / ostep 185 | bias_raw = conv.bn_b / ostep 186 | conv.inc_raw = inc_raw 187 | conv.bias_raw = bias_raw 188 | 189 | # Quantization 190 | T = lshift+conv.wbit+conv.abit-1 191 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64) 192 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64) 193 | conv.lshift_T = T 194 | # Get bitlength 195 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length() 196 | conv.incbit = bitlength(conv.inc) 197 | conv.biasbit = bitlength(conv.bias) 198 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}') 199 | 200 | conv_last = model_param[-1] # process last-layer bias 201 | conv_last.inc = None 202 | conv_last.div = 1/(conv_last.wstep * conv_last.astep) 203 | conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64) 204 | conv_last.bias_raw = conv_last.convbias * conv_last.div 205 | conv_last.biasbit = bitlength(conv_last.bias) 206 | print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}') 207 |
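# --- Editor's sketch (not part of the original export_hls.py): numeric check of
# the rounding identity in the process_batchnorm docstring above, i.e.
# round(MACq*inc_raw + bias_raw) computed as an integer add-then-shift. The
# constants are arbitrary example values:
def _demo_inc_bias_rounding():
    import numpy as np
    T = 16                                    # example shift amount (lshift_T)
    inc_raw, bias_raw, MACq = 0.37, -1.25, 1234
    inc = int(np.round(inc_raw * 2**T))
    bias = int(np.round(bias_raw * 2**T))
    hw = (MACq * inc + bias + (1 << (T - 1))) >> T              # hardware form
    ref = int(np.floor(MACq * inc / 2**T + bias / 2**T + 0.5))  # round-half-up
    assert hw == ref == 455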
208 | def reorder_weight(model_param, layers_simd, layers_pe): 209 | '''reorder_weight(model_param) 210 | Reorder arrays for the HLS code. 211 | ''' 212 | 213 | for conv, simd, pe in zip(model_param, layers_simd, layers_pe): 214 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='') 215 | conv.simd = simd 216 | conv.pe = pe 217 | 218 | # process batchnorm 219 | if conv.inc is not None: 220 | conv.inc = conv.inc.reshape(conv.och//conv.pe, conv.pe).T 221 | if conv.bias is not None: 222 | conv.bias = conv.bias.reshape(conv.och//conv.pe, conv.pe).T 223 | 224 | # process conv weight 225 | w = conv.w # [och, ich, kr, kc] 226 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}" 227 | assert conv.k*conv.ich%simd == 0, f"conv_{conv.n}, ich {conv.ich}, k {conv.k}, simd {conv.simd}" 228 | 229 | # if conv.n==0: # first layer is different 230 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich] 231 | # else: 232 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich] 233 | 234 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*conv.ich//simd, simd) 235 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd] 236 | w = w.reshape(conv.pe, conv.k, -1, simd) # hls format [pe, k, och/pe*k*ich/simd, simd] 237 | 238 | if conv.k == 1: # kernel size=1 239 | w = w.reshape(conv.pe, -1, simd) 240 | print(' ->', w.shape) 241 | 242 | conv.w = w 243 |
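# --- Editor's sketch (not part of the original export_hls.py): shape-only
# walkthrough of the reorder above for an example k=3 layer with och=64, ich=64,
# simd=8, pe=4 (sizes are illustrative):
def _demo_reorder_shapes():
    import numpy as np
    och, ich, k, simd, pe = 64, 64, 3, 8, 4
    w = np.zeros((och, ich, k, k), dtype=np.int8)  # [och, ich, kr, kc]
    w = w.transpose(0, 3, 2, 1)                    # [och, kc, kr, ich]
    w = w.reshape(och//pe, pe, k, k*ich//simd, simd)
    w = w.transpose(1, 2, 0, 3, 4)                 # [pe, k, och/pe, k*ich/simd, simd]
    w = w.reshape(pe, k, -1, simd)                 # [pe, k, och/pe*k*ich/simd, simd]
    assert w.shape == (4, 3, 384, 8)               # 384 = (64/4)*(3*64/8)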
244 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0): 245 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop: 246 | print(str_func(arr), file=file, end='') 247 | return 248 | ends = '' if (len(arr.shape)==stop+1) else '\n' 249 | print('{', file=file, end='') 250 | for i, item in enumerate(arr): 251 | print_ndarray_recursion(item, str_func, file, stop) 252 | if i!=len(arr)-1: print(',', file=file, end=ends) 253 | print(ends+'}', file=file, end='') 254 | 255 | def write_hls_weights(model_param, path): 256 | '''write_hls_weights(model_param, path) 257 | Write hls weights+inc+bias array code according to numpy shape. 258 | ''' 259 | f = open(path + 'weights.hpp', 'w') 260 | 261 | print(f'''/******************************************************************************** 262 | * Filename: weights.hpp 263 | * Date: {time.ctime()} 264 | * Description: This file is generated by {parser.prog} 265 | * ptfilename: {opt.weight} 266 | ********************************************************************************/ 267 | 268 | #ifndef _WEIGHTS_HPP_ 269 | #define _WEIGHTS_HPP_ 270 | #include <ap_int.h> 271 | ''', file=f) 272 | 273 | for conv in model_param: 274 | n = conv.n 275 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, wbit {conv.wbit}") 276 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, wbit: {conv.wbit}", file=f) 277 | 278 | # print conv weight, merge [SIMD] value into one ap_uint 279 | if conv.k>1: 280 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f) 281 | else: 282 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f) 283 | hex_str = lambda x: '"' + hex(x) + '"' 284 | def pack1d_str(arr): # arr: 1d-array 285 | x = 0 286 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation 287 | v = int(v) # use python bignumber, not np.int 288 | assert -1< conv_{n}_inc[{conv.pe}][{conv.och//conv.pe}]=", file=f) 297 | print_ndarray_recursion(conv.inc, hex_str, f) 298 | print(';', file=f) 299 | if conv.bias is not None: 300 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.pe}][{conv.och//conv.pe}]=", file=f) 301 | print_ndarray_recursion(conv.bias, hex_str, f) 302 | print(';', file=f) 303 | 304 | print('#endif', file=f) 305 | f.close() 306 | 307 | def adjust_weight(model_param): 308 | special_wa_bit = ((4,2),(5,3),(5,4),(5,5),(5,6),(5,7),(5,8),(7,2),(7,3)) 309 | # These packings can't quantize to -2**(wbit-1) 310 | for conv in model_param: 311 | if (conv.wbit, conv.abit) in special_wa_bit: 312 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}') 313 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1) 314 | 315 | if __name__=='__main__': 316 | parser = argparse.ArgumentParser() 317 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/') 318 | parser.add_argument('-m', '--model', default='UltraNet_FixQ', help = 'model class name in mymodel.py') 319 | parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe', help = '.txt file in ./hls/') 320 | opt = parser.parse_args() 321 | if opt.weight is None: opt.weight = select_weight_file() 322 | 323 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1) 324 | dir_output = 'hls/' + opt.weight + '/' 325 | if not os.path.exists(dir_output): os.makedirs(dir_output) 326 | 327 | # load model and state_dict 328 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 329 | model = getattr(mymodel, opt.model)(**ptfile.setdefault('model_params', {})) 330 | model.load_state_dict(ptfile['model']) 331 | 332 | # process 333 | model_param = extract_model([1, 160, 320]) 334 | adjust_weight(model_param) 335 | process_batchnorm(model_param) # get bn param before write hls config 336 | torch.save(model_param, dir_output + 'model_param.pkl') 337 | 338 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1]) # get pe, simd param before write hls config 339 | write_hls_config(model_param, dir_output) 340 | write_hls_weights(model_param, dir_output) 341 |
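# --- Editor's sketch (not part of the original repo): a stand-alone version of
# the reversed SIMD packing that pack1d_str above writes into weights.hpp.
# arr[0] ends up in the least-significant wbit bits of the packed literal (two's
# complement per element); wbit=4 and the sample values are assumed examples:
def _demo_simd_pack(arr=(-1, 2, -3, 4), wbit=4):
    x = 0
    for v in reversed(arr):                  # reverse pack: last value -> high bits
        x = (x << wbit) | (int(v) & ((1 << wbit) - 1))
    return hex(x)                            # (-1, 2, -3, 4) -> '0x4d2f'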
-------------------------------------------------------------------------------- /dacsdc/export_hls_skynet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from typing import Dict, List 4 | import torch 5 | import numpy as np 6 | import sys 7 | import os 8 | 9 | import sys 10 | sys.path.append('..') 11 | import mymodel 12 | from utils.view_pt import select_weight_file 13 | from quant_dorefa import activation_quantize_fn 14 | from anypacking.quant_module import HWGQ, QuantConv2d, ImageInputQ 15 | 16 | class ConvParam: ... 17 | 18 | def write_hls_config(model_param, path): 19 | name_mapping = { 20 | 'k': 'K', 21 | #'s': 'S', 22 | #'p': 'P', 23 | 'ich': 'IFM_CH', 24 | 'irow': 'IFM_ROW', 25 | 'icol': 'IFM_COL', 26 | 'och': 'OFM_CH', 27 | 'orow': 'OFM_ROW', 28 | 'ocol': 'OFM_COL', 29 | 'abit': 'IN_BIT', 30 | 'wbit': 'W_BIT', 31 | 'incbit': 'INC_BIT', 32 | 'biasbit': 'BIAS_BIT', 33 | 'simd': 'SIMD', 34 | 'pe': 'PE', 35 | 'lshift': 'L_SHIFT' 36 | } 37 | content = f'''/******************************************************************************** 38 | * Filename: config.h 39 | * Date: {time.ctime()} 40 | * Description: This file is generated by {parser.prog} 41 | * ptfilename: {opt.weight} 42 | ********************************************************************************/ 43 | 44 | #ifndef _CONFIG_H_ 45 | #define _CONFIG_H_ 46 | 47 | ''' 48 | for n, conv_param in enumerate(model_param): 49 | content += f'// conv_{n}\n' 50 | for k, v in name_mapping.items(): 51 | if hasattr(conv_param, k): # e.g. conv_last has no incbit 52 | content += f'#define CONV_{n}_{v} {getattr(conv_param, k)}\n' 53 | content += '\n' 54 | content += '#endif' 55 | 56 | with open(path + 'config.h', 'w') as f: 57 | print(content, file=f) 58 | 59 | def extract_model(in_shape): 60 | model_param: List[ConvParam] = [] 61 | feature_map_shape = in_shape 62 | conv_cnt = 0 63 | conv_cur = None 64 | for sub_module in model.modules(): 65 | # expect [QAct] -> [Pooling] -> Conv -> [BN] -> [Pooling], state machine mode 66 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ) or isinstance(sub_module, activation_quantize_fn): 67 | print(' Detected ActQ Layer', end='') 68 | if conv_cur is None: conv_cur = ConvParam() 69 | if isinstance(sub_module, HWGQ) or isinstance(sub_module, ImageInputQ): 70 | conv_cur.abit = sub_module.bit 71 | conv_cur.astep = sub_module.step 72 | else: 73 | conv_cur.abit = sub_module.a_bit 74 | conv_cur.astep = 1/2**conv_cur.abit 75 | 76 | conv_cur.actq_class = type(sub_module).__name__ 77 | print(f', abit {conv_cur.abit}, astep {conv_cur.astep}, class {conv_cur.actq_class}') 78 | 79 | if conv_cnt: # previous.obit = cur.abit 80 | model_param[conv_cnt-1].obit = conv_cur.abit 81 | model_param[conv_cnt-1].ostep = conv_cur.astep 82 | 83 | elif isinstance(sub_module, torch.nn.Conv2d): 84 | if conv_cur is None: conv_cur = ConvParam() 85 | conv_cur.n = conv_cnt 86 | print('Extract conv_%d'%conv_cnt, end='') 87 | 88 | conv_cur.k = sub_module.kernel_size[0] 89 | conv_cur.s = sub_module.stride[0] 90 | conv_cur.p = sub_module.padding[0] 91 | conv_cur.ich = sub_module.in_channels 92 | conv_cur.och = sub_module.out_channels 93 | conv_cur.groups = sub_module.groups if hasattr(sub_module, 'groups') else 1 94 | conv_cur.irow = feature_map_shape[1] 95 | conv_cur.icol = feature_map_shape[2] 96 | 97 | feature_map_shape[0] = sub_module.out_channels 98 | feature_map_shape[1] = (feature_map_shape[1] + 2 * sub_module.padding[0] - 
sub_module.kernel_size[0]) // sub_module.stride[0] + 1 99 | feature_map_shape[2] = (feature_map_shape[2] + 2 * sub_module.padding[0] - sub_module.kernel_size[0]) // sub_module.stride[0] + 1 100 | conv_cur.orow = feature_map_shape[1] 101 | conv_cur.ocol = feature_map_shape[2] 102 | 103 | if sub_module.bias is not None: 104 | conv_cur.convbias = sub_module.bias.detach().numpy() 105 | print(', +bias', end='') 106 | 107 | if isinstance(sub_module, QuantConv2d): # New quant 108 | conv_cur.wbit = sub_module.bit 109 | conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha 110 | 111 | elif type(sub_module).__name__ == 'Conv2d_Q': # Old dorefa quant 112 | conv_cur.wbit = sub_module.w_bit 113 | conv_cur.wstep = 1/2**(conv_cur.wbit-1) 114 | weight = np.tanh(sub_module.weight.detach().numpy()) 115 | weight = weight / np.max(np.abs(weight)) 116 | n = 2**(conv_cur.wbit-1) 117 | weight_q = weight * n 118 | weight_q = np.clip(np.round(weight_q),-n, n-1) 119 | weight_q = weight_q.astype(np.int32) 120 | conv_cur.w = weight_q 121 | else: 122 | raise NotImplementedError(sub_module) 123 | print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}, g {groups}'.format(**vars(conv_cur))) 124 | 125 | model_param.append(conv_cur) 126 | conv_cur = None 127 | conv_cnt += 1 128 | 129 | elif isinstance(sub_module, torch.nn.BatchNorm2d): 130 | print(' Detected BatchNorm2d') 131 | gamma = sub_module.weight 132 | beta = sub_module.bias 133 | mean = sub_module.running_mean 134 | var = sub_module.running_var 135 | eps = sub_module.eps 136 | 137 | model_param[-1].bn_w = (gamma / (torch.sqrt(var + eps))).detach().numpy() 138 | model_param[-1].bn_b = (beta - (mean / (torch.sqrt(var + eps)) * gamma)).detach().numpy() 139 | 140 | elif isinstance(sub_module, torch.nn.MaxPool2d): 141 | feature_map_shape[1] = feature_map_shape[1] // sub_module.kernel_size 142 | feature_map_shape[2] = feature_map_shape[2] // sub_module.kernel_size 143 | model_param[-1].max_pool = True 144 | 145 | if not hasattr(model_param[0], 'abit'): # train code rescaled [0,255] to [0,1) by /256 default 146 | model_param[0].abit = 8 147 | if not hasattr(model_param[0], 'astep'): 148 | model_param[0].astep = 1/256 149 | 150 | return model_param 151 | 152 | def process_batchnorm(model_param): 153 | '''process_batchnorm(model_param) 154 | Merge wstep, astep, ostep scale into batchnorm, then quantize. 155 | 156 | Method: 157 | Define MAC = Conv(w, a), out = MAC*BN_w + BN_b, 158 | wq = w/wstep, aq = a/astep, MACq = MAC/MACstep, outq = out/ostep. 159 | 160 | outq = (MAC*BN_w + BN_b) / ostep 161 | = MACq * (MACstep/ostep)*BN_w + BN_b/ostep 162 | = MACq * inc_raw + bias_raw 163 | next layer activation a' = ActQ(out), i.e. a'q = clip(round(outq)) 164 | 165 | Quantization of inc_raw & bias_raw: 166 | outq_real = round((MACq*round(inc_raw*scale) + round(bias_raw*scale)) / scale) ; where scale=2**T 167 | = (MACq*round(inc_raw*scale) + round(bias_raw*scale) + 0.5 * scale) // scale ; div floor 168 | = (MACq* inc + bias + 2**(T-1) ) >> T ; [!]
the 2**(T-1) bias is done by hls code 169 | 170 | Params: 171 | T = (wbit-1)+abit+lshift # This comes from dorefa quant, not optimal 172 | MBIT = wbit+abit+ceil(log2(sum_number)) 173 | incbit = len(bit(inc)); biasbit = len(bit(bias)) 174 | larger lshift is better, but MBIT+incbit<48 175 | ''' 176 | lshift = 6 177 | 178 | for conv in model_param[:-1]: 179 | print(f'Process bn_{conv.n}, shape {conv.bn_w.shape},', end = ' ') 180 | 181 | # Merge step to BN 182 | conv.lshift = lshift 183 | MACstep = conv.wstep * conv.astep 184 | ostep = conv.ostep 185 | inc_raw = conv.bn_w * MACstep / ostep 186 | bias_raw = conv.bn_b / ostep 187 | conv.inc_raw = inc_raw 188 | conv.bias_raw = bias_raw 189 | 190 | # Quantization 191 | T = lshift+conv.wbit+conv.abit-1 192 | conv.inc = np.round(inc_raw * 2**T).astype(np.int64) 193 | conv.bias = np.round(bias_raw * 2**T).astype(np.int64) 194 | conv.lshift_T = T 195 | # Get bitlength 196 | bitlength = lambda x: 1 + int(np.abs(x).max()).bit_length() 197 | conv.incbit = bitlength(conv.inc) 198 | conv.biasbit = bitlength(conv.bias) 199 | print(f'incbit {conv.incbit}, biasbit {conv.biasbit}, lshift_T {conv.lshift_T}') 200 | 201 | conv_last = model_param[-1] # process last-layer bias 202 | conv_last.inc = None 203 | conv_last.div = 1/(conv_last.wstep * conv_last.astep) 204 | #conv_last.bias = np.round(conv_last.convbias * conv_last.div).astype(np.int64) 205 | #conv_last.bias_raw = conv_last.convbias * conv_last.div 206 | #conv_last.biasbit = bitlength(conv_last.bias) 207 | #print(f'conv_last biasbit {conv_last.biasbit}, div {conv_last.div}') 208 | 209 | def reorder_weight(model_param, layers_simd, layers_pe, layers_actp, layers_pep): 210 | '''reorder_weight(model_param) 211 | Reorder arrays for the HLS code. 212 | ''' 213 | 214 | for conv, simd, pe, actp, pep in zip(model_param, layers_simd, layers_pe, layers_actp, layers_pep): 215 | print(f'Reorder conv_{conv.n}, w {conv.w.shape}', end='') 216 | conv.simd = simd 217 | conv.pe = pe 218 | conv.actp = actp 219 | conv.pep = pep 220 | 221 | # process batchnorm 222 | if conv.inc is not None: 223 | conv.inc = conv.inc.reshape(conv.och//conv.actp, conv.actp).T 224 | if hasattr(conv, 'bias') and conv.bias is not None: 225 | conv.bias = conv.bias.reshape(conv.och//conv.actp, conv.actp).T 226 | 227 | # process conv weight 228 | if conv.k == 1: 229 | w = conv.w # [och, ich, kr, kc] 230 | g_ich = w.shape[1] 231 | assert conv.och%(conv.pe * conv.pep) == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}, pep {conv.pep}" 232 | assert g_ich%simd == 0, f"conv_{conv.n}, ich {g_ich}, simd {conv.simd}" 233 | 234 | w = w.reshape(conv.och//(conv.pe * conv.pep), conv.pe, conv.pep, g_ich//simd, simd) # [och / (pe * pep), pe, pep, ich / simd, simd] 235 | w = w.transpose(1,0,3,4,2) #[pe, och / (pe * pep), ich / simd, simd, pep] 236 | w = w.reshape(conv.pe, -1, g_ich//simd, simd*conv.pep) # [pe, och / (pe * pep), ich / simd, simd * pep] 237 | w = w.reshape(conv.pe, -1, simd*conv.pep) # hls format [pe, och/(pe * pep) * ich/simd, simd * pep] 238 | else: 239 | w = conv.w # [och, ich, kr, kc] 240 | g_ich = w.shape[1] 241 | assert conv.och%conv.pe == 0, f"conv_{conv.n}, och {conv.och}, pe {conv.pe}" 242 | assert conv.k*g_ich%simd == 0, f"conv_{conv.n}, ich {g_ich}, k {conv.k}, simd {conv.simd}" 243 | 244 | # if conv.n==0: # first layer is different 245 | # w = w.transpose(0, 2, 3, 1) # [och, kr, kc, ich] 246 | # else: 247 | w = w.transpose(0, 3, 2, 1) # [och, kc, kr, ich] 248 | 249 | w = w.reshape(conv.och//conv.pe, conv.pe, conv.k, conv.k*g_ich//simd, simd) 250 | w = w.transpose(1,2,0,3,4) # [pe, k, och/pe, k*ich/simd, simd] 251 | w = w.reshape(conv.pe, conv.k, -1, simd) # hls format [pe, k, och/pe*k*ich/simd, simd] 252 | 253 | print(' ->', w.shape) 254 | 255 | conv.w = w 256 |
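# --- Editor's sketch (not part of the original export_hls_skynet.py): shape-only
# check of the k=1 branch above, which additionally folds pep output channels
# into each SIMD word (och=48, ich=96, simd=8, pe=3, pep=2 are illustrative sizes):
def _demo_reorder_shapes_1x1():
    import numpy as np
    och, ich, simd, pe, pep = 48, 96, 8, 3, 2
    w = np.zeros((och, ich, 1, 1), dtype=np.int8)
    w = w.reshape(och//(pe*pep), pe, pep, ich//simd, simd)
    w = w.transpose(1, 0, 3, 4, 2)        # [pe, och/(pe*pep), ich/simd, simd, pep]
    w = w.reshape(pe, -1, ich//simd, simd*pep)
    w = w.reshape(pe, -1, simd*pep)       # [pe, och/(pe*pep)*ich/simd, simd*pep]
    assert w.shape == (3, 96, 16)         # 96 = (48/6)*(96/8)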
257 | def print_ndarray_recursion(arr, str_func=str, file=sys.stdout, stop=0): 258 | if not hasattr(arr, '__iter__') or len(arr.shape) == stop: 259 | print(str_func(arr), file=file, end='') 260 | return 261 | ends = '' if (len(arr.shape)==stop+1) else '\n' 262 | print('{', file=file, end='') 263 | for i, item in enumerate(arr): 264 | print_ndarray_recursion(item, str_func, file, stop) 265 | if i!=len(arr)-1: print(',', file=file, end=ends) 266 | print(ends+'}', file=file, end='') 267 | 268 | def write_hls_weights(model_param, path): 269 | '''write_hls_weights(model_param, path) 270 | Write hls weights+inc+bias array code according to numpy shape. 271 | ''' 272 | f = open(path + 'weights.hpp', 'w') 273 | 274 | print(f'''/******************************************************************************** 275 | * Filename: weights.hpp 276 | * Date: {time.ctime()} 277 | * Description: This file is generated by {parser.prog} 278 | * ptfilename: {opt.weight} 279 | ********************************************************************************/ 280 | 281 | #ifndef _WEIGHTS_HPP_ 282 | #define _WEIGHTS_HPP_ 283 | #include <ap_int.h> 284 | ''', file=f) 285 | 286 | for conv in model_param: 287 | n = conv.n 288 | def pack1d_str(arr): # arr: 1d-array 289 | x = 0 290 | for v in arr[::-1]: # [!] reverse simd pack, it is related to hls implementation 291 | v = int(v) # use python bignumber, not np.int 292 | assert -1< conv_{n}_w[{conv.pe}][{conv.w.shape[1]}]=", file=f) 302 | hex_str = lambda x: '"' + hex(x) + '"' 303 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1) 304 | print(';', file=f) 305 | else: 306 | print(f"Write conv_{n} weight, pe {conv.pe}, simd {conv.simd}, actp {conv.actp}, wbit {conv.wbit}") 307 | print(f"// layer: {n}, PE: {conv.pe}, SIMD: {conv.simd}, ACTP: {conv.actp}, wbit: {conv.wbit}", file=f) 308 | 309 | # print conv weight, merge [SIMD] value into one ap_uint 310 | print(f"const ap_uint<{conv.wbit * conv.simd}> conv_{n}_w[{conv.pe}][{conv.k}][{conv.w.shape[2]}]=", file=f) 311 | hex_str = lambda x: '"' + hex(x) + '"' 312 | print_ndarray_recursion(conv.w, pack1d_str, f, stop=1) 313 | print(';', file=f) 314 | 315 | # print inc, bias 316 | if conv.inc is not None: 317 | print(f"const ap_int<{conv.incbit}> conv_{n}_inc[{conv.actp}][{conv.och//conv.actp}]=", file=f) 318 | print_ndarray_recursion(conv.inc, hex_str, f) 319 | print(';', file=f) 320 | if hasattr(conv, 'bias') and conv.bias is not None: 321 | print(f"const ap_int<{conv.biasbit}> conv_{n}_bias[{conv.actp}][{conv.och//conv.actp}]=", file=f) 322 | print_ndarray_recursion(conv.bias, hex_str, f) 323 | print(';', file=f) 324 | 325 | print('#endif', file=f) 326 | f.close() 327 | 328 | def adjust_weight(model_param): 329 | special_wa_bit = ((4,2),(5,3),(5,4),(5,5),(5,6),(5,7),(5,8),(7,2),(7,3)) 330 | # These packings can't quantize to -2**(wbit-1) 331 | for conv in model_param: 332 | if (conv.wbit, conv.abit) in special_wa_bit: 333 | print(f'Adjust conv_{conv.n} wbit={conv.wbit}') 334 | conv.w = np.maximum(conv.w, -2**(conv.wbit-1)+1) 335 | 336 | if __name__=='__main__': 337 | parser = argparse.ArgumentParser() 338 | parser.add_argument('-w', '--weight', default=None, help='.pt file name in ./weights/') 339 | parser.add_argument('-m', '--model', default='SkyNet_FixQ', help = 'model class name in mymodel.py') 340 |
parser.add_argument('-c', '--config-simd-pe', default='config_simd_pe_skynet', help = '.txt file in ./hls/') 341 | opt = parser.parse_args() 342 | if opt.weight is None: opt.weight = select_weight_file() 343 | 344 | simd_pe = np.loadtxt('hls/'+opt.config_simd_pe+'.txt', dtype=int, skiprows=1) 345 | dir_output = 'hls/' + opt.weight + '/' 346 | if not os.path.exists(dir_output): os.makedirs(dir_output) 347 | 348 | # load model and state_dict 349 | ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 350 | model = getattr(mymodel, opt.model)(**ptfile.setdefault('model_params', {})) 351 | model.load_state_dict(ptfile['model']) 352 | 353 | # process 354 | model_param = extract_model([1, 160, 320]) 355 | # adjust_weight(model_param) 356 | process_batchnorm(model_param) # get bn param before write hls config 357 | torch.save(model_param, dir_output + 'model_param.pkl') 358 | 359 | reorder_weight(model_param, simd_pe[:,0], simd_pe[:,1], simd_pe[:,2], simd_pe[:,3]) # get pe, simd, actp, pep param before write hls config 360 | write_hls_config(model_param, dir_output) 361 | write_hls_weights(model_param, dir_output) 362 | -------------------------------------------------------------------------------- /dacsdc/hls/config_simd_pe.txt: -------------------------------------------------------------------------------- 1 | simd pe 2 | 3 16 3 | 16 4 4 | 8 8 5 | 8 4 6 | 4 2 7 | 4 2 8 | 4 2 9 | 4 2 10 | 4 2 -------------------------------------------------------------------------------- /dacsdc/main_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch.distributed as dist 4 | import torch.optim as optim 5 | import torch.optim.lr_scheduler as lr_scheduler 6 | import time 7 | 8 | import sys 9 | sys.path.append('..') 10 | import localconfig 11 | import test 12 | from datasets import * 13 | from yolo_utils import * 14 | 15 | from mymodel import * 16 | import mymodel 17 | 18 | wdir = 'weights' + os.sep # weights dir 19 | 20 | # Hyperparameters (results68: 59.9 mAP@0.5 yolov3-spp-416) https://github.com/ultralytics/yolov3/issues/310 21 | 22 | hyp = {'giou': 3.54, # giou loss gain 23 | 'cls': 37.4, # cls loss gain 24 | 'cls_pw': 1.0, # cls BCELoss positive_weight 25 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 26 | 'obj_pw': 1.0, # obj BCELoss positive_weight 27 | 'iou_t': 0.225, # iou training threshold 28 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 29 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 30 | 'momentum': 0.937, # SGD momentum 31 | 'weight_decay': 0.000484, # optimizer weight decay 32 | 'fl_gamma': 0.5, # focal loss gamma 33 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 34 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 35 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 36 | 'degrees': 1.98, # image rotation (+/- deg) 37 | 'translate': 0.05, # image translation (+/- fraction) 38 | 'scale': 0.05, # image scale (+/- gain) 39 | 'shear': 0.641} # image shear (+/- deg) 40 | 41 | # Overwrite hyp with hyp*.txt (optional) 42 | f = glob.glob('hyp*.txt') 43 | if f: 44 | print('Using %s' % f[0]) 45 | for k, v in zip(hyp.keys(), np.loadtxt(f[0])): 46 | hyp[k] = v 47 | 48 | def train(): 49 | img_size, img_size_test = opt.img_size if len(opt.img_size) == 2 else opt.img_size * 2 # train, test sizes 50 | epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs 51 | batch_size = opt.batch_size 52 |
accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 53 | weights = opt.weights # initial training weights 54 | 55 | # Initialize 56 | init_seeds() 57 | 58 | # Configure run 59 | train_path = localconfig.train_path 60 | test_path = localconfig.test_path 61 | nc = 1 62 | 63 | results_file = 'results/%s.txt'%opt.name 64 | 65 | # Initialize model 66 | if opt.model != '': 67 | model = getattr(mymodel, opt.model)(opt.bitw, opt.bita).to(device) 68 | else: 69 | if opt.bypass: 70 | model = UltraNetBypass_FixQ(opt.bitw, opt.bita).to(device) 71 | else: 72 | model = UltraNet_FixQ(opt.bitw, opt.bita).to(device) 73 | 74 | # Optimizer 75 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 76 | for k, v in dict(model.named_parameters()).items(): 77 | if '.bias' in k: 78 | pg2 += [v] # biases 79 | elif 'Conv2d.weight' in k: 80 | pg1 += [v] # apply weight_decay 81 | else: 82 | pg0 += [v] # all else 83 | 84 | if opt.adam: 85 | # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) 86 | optimizer = optim.Adam(pg0, lr=hyp['lr0']) 87 | else: 88 | optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) 89 | optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay 90 | optimizer.add_param_group({'params': pg2}) # add pg2 (biases) 91 | optimizer.param_groups[2]['lr'] *= 2.0 # bias lr 92 | 93 | del pg0, pg1, pg2 94 | 95 | start_epoch = 0 96 | test_best_iou = 0.0 97 | 98 | # load weights 99 | if weights.endswith('.pt'): # pytorch format 100 | # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 101 | chkpt = torch.load(weights, map_location=device) 102 | 103 | # load model 104 | try: 105 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} 106 | model.load_state_dict(chkpt['model'], strict=False) 107 | except KeyError as e: 108 | s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. 
" % (opt.weights, opt.cfg, opt.weights) 109 | raise KeyError(s) from e 110 | 111 | if opt.resume: 112 | # load optimizer 113 | if chkpt['optimizer'] is not None: 114 | optimizer.load_state_dict(chkpt['optimizer']) 115 | best_fitness = chkpt['best_fitness'] 116 | 117 | # load results 118 | if chkpt.get('training_results') is not None: 119 | with open(results_file, 'w') as file: 120 | file.write(chkpt['training_results']) # write results.txt 121 | 122 | start_epoch = chkpt['epoch'] + 1 123 | 124 | del chkpt 125 | 126 | # Scheduler https://github.com/ultralytics/yolov3/issues/238 127 | lf = lambda x: (1 + math.cos(x * math.pi / epochs)) / 2 * 0.999 + 0.001 # cosine https://arxiv.org/pdf/1812.01187.pdf 128 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) 129 | scheduler.last_epoch = start_epoch 130 | 131 | # Initialize distributed training 132 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 133 | dist.init_process_group(backend='nccl', # 'distributed backend' 134 | init_method='tcp://127.0.0.1:5000', # distributed training init method 135 | world_size=1, # number of nodes for distributed training 136 | rank=0) # distributed training node rank 137 | model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 138 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 139 | 140 | # Dataloader 141 | #batch_size = min(batch_size, len(dataset)) 142 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 143 | 144 | # Testloader 145 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 146 | hyp=hyp, 147 | rect=False, 148 | cache_images=opt.cache_images, 149 | single_cls=opt.single_cls) 150 | testloader = torch.utils.data.DataLoader(testset, 151 | batch_size=batch_size, 152 | num_workers=0, 153 | pin_memory=True, 154 | collate_fn=testset.collate_fn) 155 | 156 | test.test(batch_size=batch_size, 157 | img_size=img_size_test, 158 | model=model, 159 | dataloader=testloader) # make forward 160 | bops, bita, bitw, dsps, brams = model.fetch_arch_info() 161 | print('model with bops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M, bram: {:.3f}K'.format(bops, bita, bitw, dsps, brams)) 162 | 163 | # Dataset 164 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 165 | augment=True, 166 | hyp=hyp, # augmentation hyperparameters 167 | rect=opt.rect, # rectangular training 168 | cache_images=opt.cache_images, 169 | single_cls=opt.single_cls) 170 | 171 | dataloader = torch.utils.data.DataLoader(dataset, 172 | batch_size=batch_size, 173 | num_workers=nw, 174 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 175 | pin_memory=True, 176 | collate_fn=dataset.collate_fn) 177 | 178 | # Start training 179 | nb = len(dataloader) 180 | prebias = start_epoch == 0 181 | model.nc = nc # attach number of classes to model 182 | model.arc = opt.arc # attach yolo architecture 183 | model.hyp = hyp # attach hyperparameters to model 184 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 185 | # torch.autograd.set_detect_anomaly(True) 186 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 187 | t0 = time.time() 188 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 189 | print('Using %g dataloader workers' % nw) 190 | print('Starting training for %g epochs...' 
% epochs) 191 | 192 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 193 | model.train() 194 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 195 | 196 | # Prebias 197 | if prebias: 198 | ne = max(round(30 / nb), 3) # number of prebias epochs 199 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 200 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 201 | if epoch == ne: 202 | # print_model_biases(model) 203 | prebias = False 204 | 205 | # Bias optimizer settings 206 | optimizer.param_groups[2]['lr'] = ps[0] 207 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 208 | optimizer.param_groups[2]['momentum'] = ps[1] 209 | 210 | mloss = torch.zeros(4).to(device) # mean losses 211 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'iouloss', 'objloss', 'triou', 'mloss', 'targets', 'img_size')) 212 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 213 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 214 | ni = i + nb * epoch # number integrated batches (since train start) 215 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 216 | targets = targets.to(device) 217 | 218 | # Run model 219 | pred = model(imgs) 220 | 221 | # Compute loss 222 | loss, loss_items = compute_loss(pred, targets, model) 223 | if not torch.isfinite(loss): 224 | print('WARNING: non-finite loss, ending training ', loss_items) 225 | return results 226 | 227 | # Scale loss by nominal batch_size of 64 228 | loss *= batch_size / 64 229 | 230 | loss.backward() 231 | 232 | # Optimize accumulated gradient 233 | if ni % accumulate == 0: 234 | optimizer.step() 235 | optimizer.zero_grad() 236 | # Print batch results 237 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 238 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 239 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 240 | pbar.set_description(s) 241 | 242 | # end batch ------------------------------------------------------------------------------------------------ 243 | 244 | # Update scheduler 245 | scheduler.step() 246 | 247 | train_iou = mloss[2] 248 | 249 | # Process epoch results 250 | final_epoch = epoch + 1 == epochs 251 | if not opt.notest or final_epoch: # Calculate mAP 252 | results = test.test(batch_size=batch_size, 253 | img_size=img_size_test, 254 | model=model, 255 | dataloader=testloader) 256 | 257 | # Write epoch results 258 | with open(results_file, 'a') as f: 259 | f.write(s + '%10.3g' * len(results) % results + '\n') # test_losses=(iou, loss_sum, lobj, lcls) 260 | 261 | # Update best mAP 262 | results = torch.tensor(results, device = 'cpu') 263 | 264 | test_iou = results[0] 265 | if test_iou > test_best_iou: 266 | test_best_iou = test_iou 267 | 268 | # Save training results 269 | save = (not opt.nosave) or (final_epoch) 270 | if save: 271 | with open(results_file, 'r') as f: 272 | # Create checkpoint 273 | chkpt = {'epoch': epoch, 274 | 'training_results': f.read(), 275 | 'model': model.module.state_dict() if type( 276 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 277 | 'optimizer': None if final_epoch else optimizer.state_dict(), 278 | 'model_params':model.model_params, # arch param 279 | 'extra': 
{'time': time.ctime(), 'name': opt.name}} 280 | 281 | # Save last checkpoint 282 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 283 | 284 | if test_iou == test_best_iou: 285 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 286 | 287 | # Delete checkpoint 288 | del chkpt 289 | 290 | # end epoch ---------------------------------------------------------------------------------------------------- 291 | 292 | # end training 293 | n = opt.name 294 | 295 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 296 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 297 | torch.cuda.empty_cache() 298 | 299 | with open('results.csv', 'a') as f: 300 | print("fixed,%s,%d/%d, , ,%s,%s,%.1f,%.1f, , , ,%d, ,%.3f, "% 301 | (opt.name,epochs-1,epochs,opt.bitw,opt.bita,train_iou*100,(test_iou+test_best_iou)*50, 302 | int(round(bops)), dsps), file=f) 303 | 304 | return results 305 | 306 | 307 | if __name__ == '__main__': 308 | parser = argparse.ArgumentParser() 309 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 310 | parser.add_argument('--epochs', type=int, default=200) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 311 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 312 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 313 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 314 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 315 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 316 | parser.add_argument('--rect', action='store_true', help='rectangular training') 317 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 318 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 319 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 320 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 321 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 322 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 323 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 324 | parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1 or cpu)') 325 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 326 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 327 | parser.add_argument('--mixm', type=str) 328 | parser.add_argument('--bitw', type=str, default='') 329 | parser.add_argument('--bita', type=str, default='') 330 | parser.add_argument('--var', type=float, help='debug variable') 331 | parser.add_argument('--model', type=str, default='', help='use specific model') 332 | 333 | opt = parser.parse_args() 334 | 335 | if opt.mixm is not None: 336 | wmix = torch.load('weights/%s.pt'%opt.mixm) 337 | opt.bitw = wmix['extra']['bestw'] 338 | opt.bita = wmix['extra']['besta'] 339 | del wmix 340 | last = wdir + 'last_%s.pt'%opt.name 341 | opt.weights = last if opt.resume else opt.weights 342 | print(opt) 343 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 344 | 345 | train() # train normally 346 | -------------------------------------------------------------------------------- /dacsdc/pareto_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | cds = { 5 | 'cd':['3e-5', '6e-5', '1e-4', '2e-4', '3e-4'], 6 | 'cdt':['1e-5', '2e-5', '3e-5', '6e-5', '1e-4'], 7 | } 8 | 9 | def search_train(): 10 | for cd in cds[opt.arg]: 11 | name = '%d_%s_'%(opt.it, opt.arg)+cd.replace('-','').replace('.','') 12 | os.system('python search_train.py --name %s --cd %s'%('f'+name, cd)) 13 | 14 | def main_train(): 15 | for cd in cds[opt.arg]: 16 | name = '%d_%s_'%(opt.it, opt.arg)+cd.replace('-','').replace('.','') 17 | os.system('python main_train.py --name %s --mixm %s'%('x'+name, 'f'+name+'_last')) 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--search', action='store_true') 21 | parser.add_argument('--main', action='store_true') 22 | parser.add_argument('--it', type=int) 23 | parser.add_argument('--arg', type=str) 24 | opt = parser.parse_args() 25 | 26 | if opt.search: 27 | search_train() 28 | 29 | if opt.main: 30 | main_train() 31 | -------------------------------------------------------------------------------- /dacsdc/quant_dorefa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | # Modified the quantization of the weight data 7 | def uniform_quantize(k): 8 | class qfn(torch.autograd.Function): 9 | 10 | @staticmethod 11 | def forward(ctx, input): 12 | if k == 32: 13 | out = input 14 | # elif k == 1: 15 | # out = torch.sign(input) 16 | else: 17 | # cang 18 | n = float(2 ** k) 19 | out = torch.round(input * n).clamp_(-n, n-1) / n 20 | # normal 21 | # n = float(2 ** k - 1) 22 | # out = torch.round(input * n) / n 23 | 24 | return out 25 | 26 | @staticmethod 27 | def backward(ctx, grad_output): 28 | grad_input = grad_output.clone() 29 | return grad_input 30 | 31 | return qfn().apply 32 | 33 |
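# --- Editor's sketch (not part of the original quant_dorefa.py): numeric check
# of the signed ("cang") branch above for k=3. Inputs in [-1, 1) land on
# multiples of 1/8 in [-1, 7/8]; weight_quantize_fn below calls this with
# k = w_bit - 1 because the sign takes one bit:
def _demo_uniform_quantize_signed():
    import torch
    q = uniform_quantize(k=3)
    x = torch.tensor([-1.0, -0.3, 0.0, 0.29, 0.99])
    y = q(x)
    assert torch.allclose(y, torch.tensor([-1.0, -0.25, 0.0, 0.25, 0.875]))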
34 | class weight_quantize_fn(nn.Module): 35 | def __init__(self, w_bit): 36 | super(weight_quantize_fn, self).__init__() 37 | assert w_bit <= 8 or w_bit == 32 38 | self.w_bit = w_bit 39 | # the sign bit takes one bit 40 | self.uniform_q = uniform_quantize(k=w_bit - 1) 41 | 42 | def forward(self, x): 43 | # print('===================') 44 | if self.w_bit == 32: 45 | # weight_q = x 46 | weight = torch.tanh(x) 47 | # weight = weight / 2 / torch.max(torch.abs(weight)) + 0.5 48 | # weight_q = 2 * self.uniform_q(weight) - 1 49 | weight_q = weight / torch.max(torch.abs(weight)) 50 | elif self.w_bit == 1: 51 | E = torch.mean(torch.abs(x)).detach() 52 | weight_q = (self.uniform_q(x / E) + 1) / 2 * E 53 | else: 54 | weight = torch.tanh(x) 55 | # weight = weight / 2 / torch.max(torch.abs(weight)) + 0.5 56 | # weight_q = 2 * self.uniform_q(weight) - 1 57 | weight = weight / torch.max(torch.abs(weight)) 58 | # quantize to signed k-bit values 59 | weight_q = self.uniform_q(weight) 60 | return weight_q 61 | 62 | 63 | class activation_quantize_fn(nn.Module): 64 | def __init__(self, a_bit): 65 | super(activation_quantize_fn, self).__init__() 66 | assert a_bit <= 8 or a_bit == 32 67 | self.a_bit = a_bit 68 | self.uniform_q = uniform_quantize(k=a_bit) 69 | 70 | def forward(self, x): 71 | if self.a_bit == 32: 72 | activation_q = torch.clamp(x, 0, 6) 73 | else: 74 | activation_q = self.uniform_q(torch.clamp(x, 0, 1)) 75 | # print(np.unique(activation_q.detach().numpy())) 76 | return activation_q 77 | 78 | class ActQuant_PACT(nn.Module): 79 | def __init__(self, act_bit=4, scale_coef=1.0): 80 | super(ActQuant_PACT, self).__init__() 81 | self.act_bit=act_bit 82 | self.scale_coef = nn.Parameter(torch.ones(1)*scale_coef) 83 | 84 | self.uniform_q = uniform_quantize(k=act_bit) 85 | 86 | # self.uniform_q = uniform_quantize(k=act_bit) 87 | 88 | def forward(self, x): 89 | if self.act_bit==32: 90 | out=0.5*(x.abs() - (x-self.scale_coef.abs()).abs()+self.scale_coef.abs())/self.scale_coef.abs() 91 | else: 92 | out = 0.5*(x.abs() - (x-self.scale_coef.abs()).abs()+self.scale_coef.abs()) 93 | activation_q = self.uniform_q(out / self.scale_coef) 94 | # print(self.scale_coef) 95 | 96 | # out = torch.round(out * (2**self.act_bit - 1) / self.scale_coef) / (2**self.act_bit - 1) 97 | return activation_q 98 | 99 | 100 | def conv2d_Q_fn(w_bit): 101 | class Conv2d_Q(nn.Conv2d): 102 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 103 | padding=0, dilation=1, groups=1, bias=True): 104 | super(Conv2d_Q, self).__init__(in_channels, out_channels, kernel_size, stride, 105 | padding, dilation, groups, bias) 106 | self.w_bit = w_bit 107 | self.quantize_fn = weight_quantize_fn(w_bit=w_bit) 108 | 109 | def forward(self, input, order=None): 110 | weight_q = self.quantize_fn(self.weight) 111 | # print(np.unique(weight_q.detach().numpy())) 112 | return F.conv2d(input, weight_q, self.bias, self.stride, 113 | self.padding, self.dilation, self.groups) 114 | 115 | return Conv2d_Q 116 | 117 | class activation_quantize_fn_test(nn.Module): 118 | def __init__(self, a_bit): 119 | super(activation_quantize_fn_test, self).__init__() 120 | assert a_bit <= 8 or a_bit == 32 121 | self.a_bit = a_bit 122 | self.uniform_q = uniform_quantize(k=a_bit) 123 | 124 | def forward(self, x): 125 | if self.a_bit == 32: 126 | activation_q = torch.clamp(x, 0, 6) 127 | else: 128 | activation_q = self.uniform_q(torch.clamp(x, 0, 6)/6)*6 129 | return activation_q 130 | 131 | class weight_quantize_fn_test(nn.Module): 132 | def __init__(self, w_bit): 133 | super(weight_quantize_fn_test, self).__init__() 134 | assert w_bit <= 8 or w_bit == 32 135 | self.w_bit = w_bit 136 | # the sign bit takes one bit 137 | self.uniform_q = uniform_quantize(k=w_bit - 1) 138 | 139 | def forward(self, x): 140 | # print('===================') 141 | assert(1 -------------------------------------------------------------------------------- /dacsdc/search_train.py: -------------------------------------------------------------------------------- 144 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 145 | dist.init_process_group(backend='nccl', # 'distributed backend' 146 | init_method='tcp://127.0.0.1:5000', # distributed training init method 147 | world_size=1, # number of nodes for distributed training 148 | rank=0) # distributed training node rank 149
| model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 150 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 151 | 152 | # Dataloader 153 | #batch_size = min(batch_size, len(dataset)) 154 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 155 | 156 | # Testloader 157 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 158 | hyp=hyp, 159 | rect=False, 160 | cache_images=opt.cache_images, 161 | single_cls=opt.single_cls) 162 | testloader = torch.utils.data.DataLoader(testset, 163 | batch_size=batch_size, 164 | num_workers=0, 165 | pin_memory=True, 166 | collate_fn=testset.collate_fn) 167 | 168 | # Dataset 169 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 170 | augment=True, 171 | hyp=hyp, # augmentation hyperparameters 172 | rect=opt.rect, # rectangular training 173 | cache_images=opt.cache_images, 174 | single_cls=opt.single_cls) 175 | 176 | dataloader = torch.utils.data.DataLoader(dataset, 177 | batch_size=batch_size, 178 | num_workers=nw, 179 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 180 | pin_memory=True, 181 | collate_fn=dataset.collate_fn) 182 | 183 | # Start training 184 | nb = len(dataloader) 185 | prebias = start_epoch == 0 186 | model.nc = nc # attach number of classes to model 187 | model.arc = opt.arc # attach yolo architecture 188 | model.hyp = hyp # attach hyperparameters to model 189 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 190 | maps = np.zeros(nc) # mAP per class 191 | # torch.autograd.set_detect_anomaly(True) 192 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 193 | t0 = time.time() 194 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 195 | print('Using %g dataloader workers' % nw) 196 | print('Starting training for %g epochs...' 
% epochs) 197 | 198 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 199 | model.train() 200 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 201 | 202 | # Prebias 203 | if prebias: 204 | ne = max(round(30 / nb), 3) # number of prebias epochs 205 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 206 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 207 | if epoch == ne: 208 | # print_model_biases(model) 209 | prebias = False 210 | 211 | # Bias optimizer settings 212 | optimizer.param_groups[2]['lr'] = ps[0] 213 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 214 | optimizer.param_groups[2]['momentum'] = ps[1] 215 | 216 | curr_lr = optimizer.param_groups[0]['lr'] 217 | curr_lra = arch_optimizer.param_groups[0]['lr'] 218 | print(f'lr:{curr_lr}, lra:{curr_lra}') 219 | 220 | mloss = torch.zeros(4).to(device) # mean losses 221 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'iouloss', 'objloss', 'triou', 'mloss', 'targets', 'img_size')) 222 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 223 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 224 | ni = i + nb * epoch # number integrated batches (since train start) 225 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 226 | targets = targets.to(device) 227 | 228 | # Run model 229 | pred = model(imgs) 230 | 231 | # Compute loss 232 | loss, loss_items = compute_loss(pred, targets, model) 233 | if not torch.isfinite(loss): 234 | print('WARNING: non-finite loss, ending training ', loss_items) 235 | return results 236 | 237 | # Scale loss by nominal batch_size of 64 238 | loss *= batch_size / 64 239 | 240 | # complexity penalty 241 | if opt.complexity_decay != 0: 242 | loss_complexity = opt.complexity_decay * model.complexity_loss() 243 | loss += loss_complexity * 4.0 244 | 245 | if opt.complexity_decay_trivial != 0: 246 | loss_complexity_trivial = opt.complexity_decay_trivial * model.complexity_loss_trivial() 247 | loss += loss_complexity_trivial * 4.0 248 | 249 | if opt.bram_decay != 0: 250 | # use the underlying module when wrapped by DistributedDataParallel 251 | loss_bram = opt.bram_decay * (model.module.bram_loss() if hasattr(model, 'module') else model.bram_loss()) 252 | loss += loss_bram * 4.0 253 | 254 | loss.backward() 255 | 256 | # Optimize accumulated gradient 257 | if ni % accumulate == 0: 258 | optimizer.step() 259 | arch_optimizer.step() 260 | optimizer.zero_grad() 261 | arch_optimizer.zero_grad() 262 | 263 | # Print batch results 264 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 265 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 266 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 267 | pbar.set_description(s) 268 | 269 | # end batch ------------------------------------------------------------------------------------------------ 270 | 271 | print('========= architecture =========') 272 | if hasattr(model, 'module'): 273 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = model.module.fetch_best_arch() 274 | else: 275 | best_arch, bitops, bita, bitw, mixbitops, mixbita, mixbitw, dsps, mixdsps, mixbram_weight, mixbram_cache = model.fetch_best_arch() 276 | print('best model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: 
{:.3f}M'.format( 277 | bitops, bita, bitw, dsps)) 278 | print('expected model with bitops: {:.3f}M, bita: {:.3f}K, bitw: {:.3f}M, dsps: {:.3f}M, bram_wa:({:.3f},{:.3f})K'.format( 279 | mixbitops, mixbita, mixbitw, mixdsps, mixbram_weight, mixbram_cache)) 280 | 281 | bestw_str = "".join([str(x+2) for x in best_arch["best_weight"]]) 282 | besta_str = "".join([str(x+2) for x in best_arch["best_activ"]]) 283 | print(f'best_weight: {best_arch["best_weight"]}') 284 | print(f'best_activ: {best_arch["best_activ"]}') 285 | 286 | # Update scheduler 287 | scheduler.step() 288 | arch_scheduler.step() 289 | 290 | train_iou = mloss[2] 291 | 292 | # Process epoch results 293 | final_epoch = epoch + 1 == epochs 294 | if not opt.notest or final_epoch: # Calculate mAP 295 | results = test.test(batch_size=batch_size, 296 | img_size=img_size_test, 297 | model=model, 298 | dataloader=testloader) 299 | 300 | # Write epoch results 301 | with open(results_file, 'a') as f: 302 | f.write(s + '%10.3g' * len(results) % results + '\n') # test_losses=(iou, loss_sum, lobj, lcls) 303 | 304 | # Update best mAP 305 | results = torch.tensor(results, device = 'cpu') 306 | 307 | test_iou = results[0] 308 | if test_iou > test_best_iou: 309 | test_best_iou = test_iou 310 | 311 | # Save training results 312 | save = (not opt.nosave) or (final_epoch) 313 | if save: 314 | with open(results_file, 'r') as f: 315 | # Create checkpoint 316 | chkpt = {'epoch': epoch, 317 | 'training_results': f.read(), 318 | 'model': model.module.state_dict() if type( 319 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 320 | 'optimizer': None if final_epoch else optimizer.state_dict(), 321 | 'extra': {'time': time.ctime(), 'name': opt.name, 'bestw': bestw_str, 'besta': besta_str}} 322 | 323 | # Save last checkpoint 324 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 325 | 326 | if test_iou == test_best_iou: 327 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 328 | 329 | # Delete checkpoint 330 | del chkpt 331 | 332 | # end epoch ---------------------------------------------------------------------------------------------------- 333 | 334 | # end training 335 | n = opt.name 336 | 337 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 338 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 339 | torch.cuda.empty_cache() 340 | 341 | with open('results.csv', 'a') as f: 342 | print("mixed,%s,%d/%d, , , , ,%.1f,%.1f, ,%s,%s,%d,%d,%.3f,%.3f"% 343 | (opt.name,epochs-1,epochs,train_iou*100,(test_iou+test_best_iou)*50, 344 | bestw_str,besta_str, 345 | int(round(bitops)), int(round(mixbitops)), dsps, mixdsps), file=f) 346 | 347 | return results 348 | 349 | 350 | if __name__ == '__main__': 351 | parser = argparse.ArgumentParser() 352 | parser.add_argument('--bypass', action='store_true', help='use bypass model') 353 | parser.add_argument('--epochs', type=int, default=35) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 354 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 355 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 356 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 357 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 358 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 359 | 
parser.add_argument('--rect', action='store_true', help='rectangular training') 360 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 361 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 362 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 363 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 364 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 365 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 366 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 367 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 368 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 369 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 370 | parser.add_argument('--var', type=float, help='debug variable') 371 | parser.add_argument('--complexity-decay', '--cd', default=0, type=float, metavar='W', help='complexity decay (default: 0)') 372 | parser.add_argument('--complexity-decay-trivial', '--cdt', default=0, type=float, metavar='W', help='trivial complexity decay (default: 0)') 373 | parser.add_argument('--bram-decay', '--bd', default=0, type=float, metavar='W', help='bram decay (default: 0)') 374 | parser.add_argument('--lra', '--learning-rate-alpha', default=0.01, type=float, metavar='LR', help='initial alpha learning rate') 375 | parser.add_argument('--no-share', action='store_true', help='no share weight quantization') 376 | parser.add_argument('--model', type=str, default='', help='use specific model') 377 | 378 | opt = parser.parse_args() 379 | last = wdir + 'last_%s.pt'%opt.name 380 | opt.weights = last if opt.resume else opt.weights 381 | print(opt) 382 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 383 | 384 | train() # train normally 385 | -------------------------------------------------------------------------------- /dacsdc/simulate_hw.py: 1 | import argparse 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from export_hls import ConvParam 7 | from mymodel import YOLOLayer 8 | from test import get_prebox, hyp, bbox_iou, select_weight_file 9 | from torch.utils.data import DataLoader 10 | 11 | from datasets import LoadImagesAndLabels 12 | 13 | class QConvLayer: 14 | def __init__(self, conv_param): 15 | self.conv = conv_param 16 | self.w = torch.tensor(self.conv.w, dtype = torch.int64) 17 | 18 | def __call__(self, x): 19 | if self.conv.icol < x.shape[-1]: # maxpool 20 | assert (self.conv.irow*2, self.conv.icol*2) == tuple(x.shape[2:]) 21 | x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64) 22 | # print('convi', self.conv.n, x[0,0,:,0]) 23 | 24 | groups = self.conv.groups if hasattr(self.conv, 'groups') else 1 25 | x = F.conv2d(x, self.w, bias=None, stride=self.conv.s, padding=self.conv.p, groups=groups) # [N, OCH, OROW, OCOL] 26 | # print('convo', self.conv.n, x[0,0,:,0]) 27 | och = x.shape[1] 28 | if True: 29 | if self.conv.inc is not None: 30 | inc_ch = self.conv.inc.reshape((1, och, 1, 1)) 31 | x *= inc_ch 32 | if hasattr(self.conv, 'bias'): 33 | bias_ch = self.conv.bias.reshape((1, och, 1, 1)) 34 | x += bias_ch 35 | 36 |
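# The `lshift` branch below performs the exported model's fixed-point requantization:
# after the integer conv result has been rescaled by `inc` and offset by `bias`,
# adding 1 << (lshift_T - 1) before the arithmetic right shift by lshift_T rounds
# the result to the nearest value instead of truncating it.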
# print('biaso', self.conv.n, x[0,0,:,:]/2**self.conv.lshift_T) 37 | if hasattr(self.conv, 'lshift'): 38 | x += 1 << (self.conv.lshift_T - 1) 39 | x >>= self.conv.lshift_T 40 | 41 | else: ## no inc/bias quantization 42 | if self.conv.inc is not None: 43 | inc_ch = self.conv.inc_raw.reshape((1, och, 1, 1)) 44 | x *= inc_ch 45 | if hasattr(self.conv, 'bias'): 46 | bias_ch = self.conv.bias_raw.reshape((1, och, 1, 1)) 47 | x += bias_ch 48 | # if hasattr(self.conv, 'max_pool'): # maxpool 49 | # x = F.max_pool2d(x, kernel_size = 2, stride = 2) 50 | # print('biaso', self.conv.n, x[0,0,:,0]) 51 | x = torch.round(x).to(dtype = torch.int64) 52 | 53 | if hasattr(self.conv, 'obit'): 54 | x.clip_(0, 2**(self.conv.obit)-1) 55 | 56 | return x 57 | 58 | def reorg(x): 59 | stride = 2 60 | B = x.data.size(0) 61 | C = x.data.size(1) 62 | H = x.data.size(2) 63 | W = x.data.size(3) 64 | ws = stride 65 | hs = stride 66 | x = x.view([B, C, H//hs, hs, W//ws, ws]).transpose(3, 4).contiguous() 67 | x = x.view([B, C, H//hs*W//ws, hs*ws]).transpose(2, 3).contiguous() 68 | x = x.view([B, C, hs*ws, H//hs, W//ws]).transpose(1, 2).contiguous() 69 | x = x.view([B, hs*ws*C, H//hs, W//ws]) 70 | return x 71 | 72 | class HWModel: 73 | def __init__(self, model_param): 74 | self.layers = [QConvLayer(conv_param) for conv_param in model_param] 75 | self.yololayer = YOLOLayer([[20,20], [20,20], [20,20], [20,20], [20,20], [20,20]]) 76 | self.yololayer.eval() 77 | 78 | def __call__(self, x): 79 | assert len(x.shape) == 4 and x.dtype == torch.int64 80 | img_size = x.shape[-2:] 81 | 82 | if self.layers[0].conv.abit<8: # ImageInputQ 83 | x=x>>(8-self.layers[0].conv.abit) 84 | 85 | if not opt.bypass: 86 | for i, layer in enumerate(self.layers): 87 | x = layer(x) 88 | else: 89 | for i in [0,1,2,3]: 90 | x = self.layers[i](x) 91 | p4_in = torch.round(reorg(x) * 92 | self.layers[4].conv.astep / self.layers[7].conv.astep).to(dtype=torch.int64) 93 | for i in [4,5,6]: 94 | x = self.layers[i](x) 95 | x = torch.cat([p4_in, x], 1) 96 | for i in [7,8]: 97 | x = self.layers[i](x) 98 | 99 | x = x.float() / self.layers[-1].conv.div 100 | 101 | io, p = self.yololayer(x, img_size) 102 | return io 103 | 104 | def testdataset(hwmodel): 105 | img_size = 320 106 | dataset = LoadImagesAndLabels(opt.datapath, img_size, opt.batch_size, rect=False, cache_labels=True, hyp=hyp, augment=False) 107 | dataloader = DataLoader(dataset, 108 | batch_size=opt.batch_size, 109 | #num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 110 | pin_memory=True, 111 | collate_fn=dataset.collate_fn) 112 | 113 | iou_sum = 0.0 114 | test_n = 0 115 | for batch_i, (imgs, targets, paths, shapes) in enumerate(dataloader): 116 | if batch_i == opt.num_batch: break 117 | bn, _, height, width = imgs.shape # batch size, channels, height, width 118 | test_n += bn 119 | 120 | imgs = imgs.to(dtype = torch.int64) 121 | inf_out = hwmodel(imgs) 122 | pre_box = get_prebox(inf_out) 123 | 124 | tbox = targets[..., 2:6] * torch.Tensor([width, height, width, height]) 125 | ious = bbox_iou(pre_box, tbox) 126 | iou_sum += ious.sum() 127 | 128 | np.set_printoptions(precision = 2) 129 | for p in range(len(imgs)): 130 | print('pbox_xywh', pre_box[p].numpy(), 'tbox_xywh', tbox[p].numpy(), 'iou %.4f'%ious[p].item()) 131 | 132 | meaniou = iou_sum / test_n 133 | 134 | print('iou', meaniou) 135 | 136 | if __name__=='__main__': 137 | parser = argparse.ArgumentParser() 138 | parser.add_argument('-w', '--weight', help='weight folder name in ./hls/, which contains model_param.pkl') 139 |
parser.add_argument('-bp', '--bypass', action='store_true', help='use bypass model') 140 | parser.add_argument('--datapath', default='', help = 'test dataset path') 141 | parser.add_argument('-bs', '--batch-size', type=int, default=1, help = 'batch-size') 142 | parser.add_argument('-nb', '--num-batch', type=int, default=1, help = 'number of batches to run, -1 for full dataset') 143 | opt = parser.parse_args() 144 | 145 | if opt.datapath == '': 146 | try: 147 | import localconfig 148 | opt.datapath = localconfig.test_path 149 | except Exception: 150 | pass 151 | print(opt) 152 | if opt.weight is None: opt.weight = select_weight_file() 153 | 154 | x = torch.zeros([1,3,320,160], dtype=torch.int64) 155 | hwmodel = HWModel(torch.load('hls/'+opt.weight+'/model_param.pkl')) 156 | 157 | testdataset(hwmodel) 158 | -------------------------------------------------------------------------------- /dacsdc/test.py: 1 | import argparse 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | import sys 6 | sys.path.append('..') 7 | from datasets import * 8 | from yolo_utils import * 9 | 10 | import mymodel 11 | from mymodel import * 12 | from utils.view_pt import select_weight_file 13 | import cv2 14 | 15 | opt=None 16 | 17 | hyp = {'giou': 3.54, # giou loss gain 18 | 'cls': 37.4, # cls loss gain 19 | 'cls_pw': 1.0, # cls BCELoss positive_weight 20 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 21 | 'obj_pw': 1.0, # obj BCELoss positive_weight 22 | 'iou_t': 0.225, # iou training threshold 23 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 24 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 25 | 'momentum': 0.937, # SGD momentum 26 | 'weight_decay': 0.000484, # optimizer weight decay 27 | 'fl_gamma': 0.5, # focal loss gamma 28 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 29 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 30 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 31 | 'degrees': 1.98, # image rotation (+/- deg) 32 | 'translate': 0.05, # image translation (+/- fraction) 33 | 'scale': 0.05, # image scale (+/- gain) 34 | 'shear': 0.641} # image shear (+/- deg) 35 | 36 | 37 | def save_test_pic(filename, img, pbox, tbox): 38 | img=img.numpy().transpose((1,2,0))*255 39 | img=np.ascontiguousarray(img) 40 | 41 | pp1, pp2 = (int(pbox[0]-pbox[2]/2), int(pbox[1]-pbox[3]/2)), (int(pbox[0]+pbox[2]/2), int(pbox[1]+pbox[3]/2)) 42 | tp1, tp2 = (int(tbox[0]-tbox[2]/2), int(tbox[1]-tbox[3]/2)), (int(tbox[0]+tbox[2]/2), int(tbox[1]+tbox[3]/2)) 43 | 44 | cv2.rectangle(img, pp1, pp2, color=(0,0,255), thickness=1) # red pbox 45 | cv2.rectangle(img, tp1, tp2, color=(0,255,0), thickness=1) # green tbox 46 | cv2.putText(img, text=str((pp1,pp2))+str((tp1, tp2)), 47 | org = (0, 10), 48 | fontFace=cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, 49 | fontScale=0.35, 50 | color = (255,255,255)) 51 | 52 | cv2.imwrite('test_result/'+filename+'.jpg', img) 53 | 54 | def bbox_iou(box1, box2): 55 | """ 56 | Returns the IoU of two bounding boxes 57 | """ 58 | 59 | # Transform from center and width to exact coordinates 60 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 61 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 62 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 63 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 64 | 65 | # get the coordinates of the intersection rectangle 66 |
inter_rect_x1 = torch.max(b1_x1, b2_x1) 67 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 68 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 69 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 70 | # Intersection area 71 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, min=0) * torch.clamp( 72 | inter_rect_y2 - inter_rect_y1, min=0 73 | ) 74 | # Union Area 75 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 76 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 77 | 78 | iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) 79 | 80 | return iou 81 | 82 | def select_boxes(pred_boxes, pred_conf): 83 | n = pred_boxes.size(0) 84 | # pred_boxes = pred_boxes.view(n, -1, 4) 85 | # pred_conf = pred_conf.view(n, -1, 1) 86 | FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor 87 | p_boxes = FloatTensor(n, 4) 88 | # print(pred_boxes.shape, pred_conf.shape) 89 | 90 | for i in range(n): 91 | _, index = pred_conf[i].max(0) 92 | p_boxes[i] = pred_boxes[i][index] 93 | 94 | return p_boxes 95 | 96 | def get_prebox(inf_out): 97 | inf_out = inf_out.view(inf_out.shape[0], 6, -1) # bs, anchors, nw*nh*6 98 | inf_out_t = torch.zeros_like(inf_out[:, 0, :]) 99 | for i in range(inf_out.shape[1]): 100 | inf_out_t += inf_out[:, i, :] 101 | inf_out_t = inf_out_t.view(inf_out_t.shape[0], -1, 6) / 6 # average anchors: box, conf 102 | 103 | pre_box = select_boxes(inf_out_t[..., :4], inf_out_t[..., 4]) # get pbox by max conf 104 | return pre_box 105 | 106 | def test(weights=None, 107 | batch_size=16, 108 | img_size=416, 109 | model=None, 110 | dataloader=None, 111 | num_batch=-1): 112 | # torch.set_default_tensor_type(torch.DoubleTensor) 113 | # Initialize/load model and set device 114 | if model is None or type(model)==str: 115 | device = torch_utils.select_device(opt.device, batch_size=batch_size) 116 | 117 | # Remove previous 118 | for f in glob.glob('test_batch*.jpg'): 119 | os.remove(f) 120 | 121 | ptfile: Dict = torch.load('weights/' + weights+'.pt', map_location=device) 122 | model_params = ptfile.setdefault('model_params') 123 | print('model_params', model_params) 124 | model = getattr(mymodel, model)(**model_params).to(device) 125 | 126 | model.hyp = hyp 127 | model.nc = 1 128 | model.arc = 'default' 129 | 130 | # Load weights 131 | model.load_state_dict(ptfile['model']) 132 | 133 | if torch.cuda.device_count() > 1: 134 | model = nn.DataParallel(model) 135 | else: # called by train.py 136 | device = next(model.parameters()).device # get model device 137 | 138 | # Dataloader 139 | if dataloader is None: 140 | dataset = LoadImagesAndLabels(opt.datapath, img_size, batch_size, rect=False, cache_labels=True, hyp=hyp, augment=False) 141 | batch_size = min(batch_size, len(dataset)) 142 | dataloader = DataLoader(dataset, 143 | batch_size=batch_size, 144 | #num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 145 | pin_memory=True, 146 | collate_fn=dataset.collate_fn) 147 | 148 | model.eval() 149 | loss = torch.zeros(2) 150 | iou_sum = 0 151 | test_n = 0 152 | 153 | # model.layers[0].weight.data = torch.tensor(model.layers[0].weight.data.numpy()[:,::-1].copy()) # swap RGB<->BGR 154 | 155 | print(('\n' + '%10s' * 5) % ('IOU', 'l', 'Giou-l', 'obj-l', 'targets')) 156 | pbar = tqdm(enumerate(dataloader), total=len(dataloader)) 157 | for batch_i, (imgs, targets, paths, shapes) in pbar: 158 | if batch_i == num_batch: break 159 | 160 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 161 | targets = targets.to(device) 162 | bn, _, height, width = imgs.shape # batch
size, channels, height, width 163 | test_n += bn 164 | 165 | with torch.no_grad(): 166 | # Run model 167 | inf_out, train_out = model(imgs) # inference and training outputs, inf_out = bs*anchors*nw*nh*6 168 | # Compute loss 169 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 170 | loss += compute_loss(train_out, targets, model)[1][:2].cpu() # GIoU, obj 171 | 172 | pre_box = get_prebox(inf_out) # anchor average, select max 173 | 174 | tbox = targets[..., 2:6] * torch.Tensor([width, height, width, height]).to(device) 175 | 176 | ious = bbox_iou(pre_box, tbox) 177 | iou_sum += ious.sum() 178 | loss_o = loss / (batch_i + 1) 179 | 180 | iou = iou_sum / test_n 181 | s = (('%10.4f')*4+'%10d') % (iou, loss_o.sum(), loss_o[0], loss_o[1], len(targets)) 182 | 183 | 184 | if opt and opt.verbose: 185 | np.set_printoptions(precision = 2) 186 | for p in range(len(imgs)): 187 | print(paths[p], 'pbox_xywh', pre_box[p].numpy(), 'tbox_xywh', tbox[p].numpy()) 188 | 189 | if opt and opt.save_pic: 190 | for p in range(len(imgs)): 191 | save_test_pic(str(p+test_n-batch_size), imgs[p], pre_box[p], tbox[p]) 192 | 193 | pbar.set_description(s) 194 | 195 | return iou, loss_o.sum(), loss_o[0], loss_o[1] # iou, loss_sum, lgiou, lobj 196 | 197 | 198 | if __name__ == '__main__': 199 | parser = argparse.ArgumentParser(prog='test.py') 200 | parser.add_argument('-m', '--model', type=str, default='UltraNet_FixQ', help='model name') 201 | parser.add_argument('-w', '--weight', default=None, help='weights path') 202 | parser.add_argument('-bs', '--batch-size', type=int, default=16, help='size of each image batch') 203 | parser.add_argument('--img-size', type=int, default=320, help='inference size (pixels)') 204 | parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1) or cpu') 205 | parser.add_argument('--datapath', default='', help = 'test dataset path') 206 | parser.add_argument('-v', '--verbose', action='store_true', help = 'show predicted value results') 207 | parser.add_argument('--save-pic', action='store_true', help = 'save predicted output pictures') 208 | parser.add_argument('-nb', '--num-batch', type=int, default=-1, help='number of batches to run, -1 for full dataset') 209 | opt = parser.parse_args() 210 | print(opt) 211 | if opt.weight is None: opt.weight = select_weight_file() 212 | 213 | if opt.datapath == '': 214 | try: 215 | import localconfig 216 | opt.datapath = localconfig.test_path 217 | except Exception: 218 | pass 219 | 220 | # Test 221 | res = test( 222 | opt.weight, 223 | opt.batch_size, 224 | opt.img_size, 225 | opt.model, 226 | num_batch = opt.num_batch) 227 | 228 | print(('%s %s.pt\niou %.5f, lsum %.4f, lgiou %.4f, lobj %.4f')%(opt.model, opt.weight, *res)) 229 | -------------------------------------------------------------------------------- /dacsdc/train_old.py: 1 | import argparse 2 | 3 | import torch.distributed as dist 4 | import torch.optim as optim 5 | import torch.optim.lr_scheduler as lr_scheduler 6 | 7 | import sys 8 | sys.path.append('..') 9 | import localconfig 10 | import test # import test.py to get mAP after each epoch 11 | from datasets import * 12 | from yolo_utils import * 13 | 14 | from mymodel import * 15 | import mymodel 16 | 17 | wdir = 'weights' + os.sep # weights dir 18 | 19 | # Hyperparameters (results68: 59.9 mAP@0.5 yolov3-spp-416) https://github.com/ultralytics/yolov3/issues/310 20 | 21 | hyp = {'giou': 3.54, # giou loss gain 22 | 'cls': 37.4, # cls loss gain 23 | 'cls_pw': 1.0, # cls BCELoss positive_weight 24 | 'obj': 64.3, # obj loss gain (*=img_size/320 if img_size != 320) 25 | 'obj_pw': 1.0, # obj BCELoss positive_weight 26 | 'iou_t': 0.225, # iou training threshold 27 | 'lr0': 0.01, # initial learning rate (SGD=5E-3, Adam=5E-4) 28 | 'lrf': -4., # final LambdaLR learning rate = lr0 * (10 ** lrf) 29 | 'momentum': 0.937, # SGD momentum 30 | 'weight_decay': 0.000484, # optimizer weight decay 31 | 'fl_gamma': 0.5, # focal loss gamma 32 | 'hsv_h': 0.0138, # image HSV-Hue augmentation (fraction) 33 | 'hsv_s': 0.678, # image HSV-Saturation augmentation (fraction) 34 | 'hsv_v': 0.36, # image HSV-Value augmentation (fraction) 35 | 'degrees': 1.98, # image rotation (+/- deg) 36 | 'translate': 0.05, # image translation (+/- fraction) 37 | 'scale': 0.05, # image scale (+/- gain) 38 | 'shear': 0.641} # image shear (+/- deg) 39 | 40 | # Overwrite hyp with hyp*.txt (optional) 41 | f = glob.glob('hyp*.txt') 42 | if f: 43 | print('Using %s' % f[0]) 44 | for k, v in zip(hyp.keys(), np.loadtxt(f[0])): 45 | hyp[k] = v 46 | 47 | 48 | def train(): 49 | cfg = opt.cfg 50 | data = opt.data 51 | img_size, img_size_test = opt.img_size if len(opt.img_size) == 2 else opt.img_size * 2 # train, test sizes 52 | epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs 53 | batch_size = opt.batch_size 54 | accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 55 | weights = opt.weights # initial training weights 56 | 57 | # Initialize 58 | init_seeds() 59 | if opt.multi_scale: 60 | img_sz_min = round(img_size / 32 / 1.5) 61 | img_sz_max = round(img_size / 32 * 1.5) 62 | img_size = img_sz_max * 32 # initiate with maximum multi_scale size 63 | print('Using multi-scale %g - %g' % (img_sz_min * 32, 
img_size)) 64 | 65 | # Configure run 66 | # data_dict = parse_data_cfg(data) 67 | train_path = localconfig.train_path 68 | test_path = localconfig.test_path 69 | nc = 1 70 | 71 | results_file = 'results/%s.txt'%opt.name 72 | # Remove previous results 73 | for f in glob.glob('*_batch*.png') + glob.glob(results_file): 74 | os.remove(f) 75 | 76 | # Initialize model 77 | model = getattr(mymodel, opt.model)().to(device) 78 | 79 | # Optimizer 80 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 81 | for k, v in dict(model.named_parameters()).items(): 82 | if '.bias' in k: 83 | pg2 += [v] # biases 84 | elif 'Conv2d.weight' in k: 85 | pg1 += [v] # apply weight_decay 86 | else: 87 | pg0 += [v] # all else 88 | 89 | if opt.adam: 90 | # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) 91 | optimizer = optim.Adam(pg0, lr=hyp['lr0']) 92 | # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) 93 | else: 94 | optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) 95 | optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay 96 | optimizer.add_param_group({'params': pg2}) # add pg2 (biases) 97 | optimizer.param_groups[2]['lr'] *= 2.0 # bias lr 98 | del pg0, pg1, pg2 99 | 100 | start_epoch = 0 101 | best_fitness = 0.0 102 | test_best_iou = 0.0 103 | 104 | # attempt_download(weights) 105 | # load weights 106 | if weights.endswith('.pt'): # pytorch format 107 | # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 108 | chkpt = torch.load(weights, map_location=device) 109 | 110 | # load model 111 | try: 112 | chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} 113 | model.load_state_dict(chkpt['model'], strict=False) 114 | except KeyError as e: 115 | s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " % (opt.weights, opt.cfg, opt.weights) 116 | raise KeyError(s) from e 117 | 118 | if opt.resume: 119 | # load optimizer 120 | if chkpt['optimizer'] is not None: 121 | optimizer.load_state_dict(chkpt['optimizer']) 122 | best_fitness = chkpt['best_fitness'] 123 | 124 | # load results 125 | if chkpt.get('training_results') is not None: 126 | with open(results_file, 'w') as file: 127 | file.write(chkpt['training_results']) # write results.txt 128 | 129 | start_epoch = chkpt['epoch'] + 1 130 | 131 | del chkpt 132 | 133 | elif len(weights) > 0: # darknet format 134 | # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
135 | load_darknet_weights(model, weights) 136 | 137 | # Scheduler https://github.com/ultralytics/yolov3/issues/238 138 | # lf = lambda x: 1 - x / epochs # linear ramp to zero 139 | # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp 140 | # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp 141 | lf = lambda x: (1 + math.cos(x * math.pi / epochs)) / 2 * 0.99 + 0.01 # cosine https://arxiv.org/pdf/1812.01187.pdf 142 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) 143 | # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(epochs * x) for x in [0.8, 0.9]], gamma=0.1) 144 | scheduler.last_epoch = start_epoch 145 | 146 | # # Plot lr schedule 147 | # y = [] 148 | # for _ in range(epochs): 149 | # scheduler.step() 150 | # y.append(optimizer.param_groups[0]['lr']) 151 | # plt.plot(y, '.-', label='LambdaLR') 152 | # plt.xlabel('epoch') 153 | # plt.ylabel('LR') 154 | # plt.tight_layout() 155 | # plt.savefig('LR.png', dpi=300) 156 | # Initialize distributed training 157 | if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): 158 | dist.init_process_group(backend='nccl', # 'distributed backend' 159 | init_method='tcp://127.0.0.1:5000', # distributed training init method 160 | world_size=1, # number of nodes for distributed training 161 | rank=0) # distributed training node rank 162 | model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 163 | model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level 164 | 165 | # Dataloader 166 | #batch_size = min(batch_size, len(dataset)) 167 | nw = min([os.cpu_count()//4, batch_size//4 if batch_size > 1 else 0, 8]) # number of workers 168 | 169 | # Testloader 170 | testset = LoadImagesAndLabels(test_path, img_size_test, batch_size, 171 | hyp=hyp, 172 | rect=False, 173 | cache_images=opt.cache_images, 174 | single_cls=opt.single_cls) 175 | testloader = torch.utils.data.DataLoader(testset, 176 | batch_size=batch_size, 177 | num_workers=0, 178 | pin_memory=True, 179 | collate_fn=testset.collate_fn) 180 | 181 | # Dataset 182 | dataset = LoadImagesAndLabels(train_path, img_size, batch_size, 183 | augment=True, 184 | hyp=hyp, # augmentation hyperparameters 185 | rect=opt.rect, # rectangular training 186 | cache_images=opt.cache_images, 187 | single_cls=opt.single_cls) 188 | 189 | dataloader = torch.utils.data.DataLoader(dataset, 190 | batch_size=batch_size, 191 | num_workers=nw, 192 | shuffle=not opt.rect, # Shuffle=True unless rectangular training is used 193 | pin_memory=True, 194 | collate_fn=dataset.collate_fn) 195 | 196 | # Start training 197 | nb = len(dataloader) 198 | prebias = start_epoch == 0 199 | model.nc = nc # attach number of classes to model 200 | model.arc = opt.arc # attach yolo architecture 201 | model.hyp = hyp # attach hyperparameters to model 202 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 203 | maps = np.zeros(nc) # mAP per class 204 | # torch.autograd.set_detect_anomaly(True) 205 | results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' 206 | t0 = time.time() 207 | torch_utils.model_info(model, report='summary') # 'full' or 'summary' 208 | print('Using %g dataloader workers' % nw) 209 | print('Starting training for %g epochs...' 
% epochs) 210 | for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ 211 | model.train() 212 | model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2 # GIoU <-> 1.0 loss ratio 213 | 214 | # Prebias 215 | if prebias: 216 | ne = max(round(30 / nb), 3) # number of prebias epochs 217 | ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \ 218 | np.interp(epoch, [0, ne], [0.9, hyp['momentum']]) # prebias settings (lr=0.1, momentum=0.9) 219 | if epoch == ne: 220 | # print_model_biases(model) 221 | prebias = False 222 | 223 | # Bias optimizer settings 224 | optimizer.param_groups[2]['lr'] = ps[0] 225 | if optimizer.param_groups[2].get('momentum') is not None: # for SGD but not Adam 226 | optimizer.param_groups[2]['momentum'] = ps[1] 227 | 228 | mloss = torch.zeros(4).to(device) # mean losses 229 | print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) 230 | pbar = tqdm(enumerate(dataloader), total=nb) # progress bar 231 | for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- 232 | ni = i + nb * epoch # number integrated batches (since train start) 233 | imgs = imgs.to(device).float() / 256.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 234 | targets = targets.to(device) 235 | 236 | # Hyperparameter burn-in 237 | # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches 238 | # if ni <= n_burn: 239 | # for m in model.named_modules(): 240 | # if m[0].endswith('BatchNorm2d'): 241 | # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 242 | # g = (i / n_burn) ** 4 # gain rises from 0 - 1 243 | # for x in optimizer.param_groups: 244 | # x['lr'] = hyp['lr0'] * g 245 | # x['weight_decay'] = hyp['weight_decay'] * g 246 | 247 | # Plot images with bounding boxes 248 | if ni < 1: 249 | f = 'train_batch%g.png' % i # filename 250 | # plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) 251 | if tb_writer: 252 | tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') 253 | 254 | # Multi-Scale training 255 | if opt.multi_scale: 256 | if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch 257 | img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 258 | sf = img_size / max(imgs.shape[2:]) # scale factor 259 | if sf != 1: 260 | ns = [math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) 261 | imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) 262 | 263 | # Run model 264 | pred = model(imgs) 265 | 266 | # Compute loss 267 | loss, loss_items = compute_loss(pred, targets, model) 268 | if not torch.isfinite(loss): 269 | print('WARNING: non-finite loss, ending training ', loss_items) 270 | return results 271 | 272 | # Scale loss by nominal batch_size of 64 273 | loss *= batch_size / 64 274 | 275 | 276 | loss.backward() 277 | 278 | # Optimize accumulated gradient 279 | if ni % accumulate == 0: 280 | optimizer.step() 281 | optimizer.zero_grad() 282 | 283 | # Print batch results 284 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 285 | mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) 286 | s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) 287 | pbar.set_description(s) 288 | 289 | # end batch ------------------------------------------------------------------------------------------------ 290 | 291 | # Update scheduler 292 | scheduler.step() 293 | 294 | # Process epoch results 295 | final_epoch = epoch + 1 == epochs 296 | if not opt.notest or final_epoch: # Calculate mAP 297 | results = test.test(batch_size=batch_size, 298 | img_size=img_size_test, 299 | model=model, 300 | dataloader=testloader) 301 | 302 | # Write epoch results 303 | with open(results_file, 'a') as f: 304 | f.write(s + '%10.3g' * len(results) % results + '\n') # test results: (iou, loss_sum, lgiou, lobj) 305 | if len(opt.name) and opt.bucket: 306 | os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) 307 | 308 | # Write Tensorboard results 309 | if tb_writer: 310 | x = list(mloss) + list(results) 311 | titles = ['GIoU', 'Objectness', 'Classification', 'Train loss', 312 | 'iou', 'loss', 'Giou loss', 'obj loss'] 313 | for xi, title in zip(x, titles): 314 | tb_writer.add_scalar(title, xi, epoch) 315 | 316 | # Update best mAP 317 | results = torch.tensor(results, device = 'cpu') 318 | fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] 319 | if fi > best_fitness: 320 | best_fitness = fi 321 | 322 | test_iou = results[0] 323 | if test_iou > test_best_iou: 324 | test_best_iou = test_iou 325 | 326 | # Save training results 327 | save = (not opt.nosave) or (final_epoch and not opt.evolve) 328 | if save: 329 | with open(results_file, 'r') as f: 330 | # Create checkpoint 331 | chkpt = {'epoch': epoch, 332 | 'best_fitness': best_fitness, 333 | 'training_results': f.read(), 334 | 'model': model.module.state_dict() if type( 335 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 336 | 'optimizer': None if final_epoch else optimizer.state_dict()} 337 | 338 | # Save last checkpoint 339 | torch.save(chkpt, wdir + '%s_last.pt'%opt.name) 340 | 341 | if test_iou == test_best_iou: 342 | torch.save(chkpt, wdir + '%s_best.pt'%opt.name) 343 | 344 | # Save backup every 10 epochs (optional) 345 | # if epoch > 0 and epoch % 10 == 0: 346 | # torch.save(chkpt, wdir + 'backup%g.pt' % epoch) 347 | 348 | # Delete checkpoint 349 | del chkpt 350 | 351 | # end epoch ---------------------------------------------------------------------------------------------------- 352 | 353 | # end training 354 | n = opt.name 355 | if len(n) and False: 356 | n = '_' + n if not n.isnumeric() else n 357 | fresults, flast, fbest = 'results%s.txt' % n, 
'last%s.pt' % n, 'best%s.pt' % n 358 | os.rename('results.txt', fresults) 359 | os.rename(wdir + 'last.pt', wdir + flast) if os.path.exists(wdir + 'last.pt') else None 360 | os.rename(wdir + 'best.pt', wdir + fbest) if os.path.exists(wdir + 'best.pt') else None 361 | if opt.bucket: # save to cloud 362 | os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket)) 363 | os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket)) 364 | # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket)) 365 | 366 | #if not opt.evolve: 367 | # plot_results() # save as results.png 368 | print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 369 | dist.destroy_process_group() if torch.cuda.device_count() > 1 else None 370 | torch.cuda.empty_cache() 371 | 372 | return results 373 | 374 | 375 | if __name__ == '__main__': 376 | parser = argparse.ArgumentParser() 377 | parser.add_argument('--epochs', type=int, default=200) # 500200 batches at bs 16, 117263 COCO images = 273 epochs 378 | parser.add_argument('--batch-size', type=int, default=64) # effective bs = batch_size * accumulate = 16 * 4 = 64 379 | parser.add_argument('--accumulate', type=int, default=1, help='batches to accumulate before optimizing') 380 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path') 381 | parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path') 382 | parser.add_argument('--multi-scale', action='store_true', help='adjust (67%% - 150%%) img_size every 10 batches') 383 | parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes') 384 | parser.add_argument('--rect', action='store_true', help='rectangular training') 385 | parser.add_argument('--resume', action='store_true', help='resume training from last.pt') 386 | parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') 387 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 388 | parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') 389 | parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') 390 | parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') 391 | parser.add_argument('--weights', type=str, default='', help='initial weights path') 392 | parser.add_argument('--arc', type=str, default='default', help='yolo architecture') # default, uCE, uBCE 393 | parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied') 394 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1 or cpu)') 395 | parser.add_argument('--adam', action='store_true', help='use adam optimizer') 396 | parser.add_argument('--model', type=str, default='UltraNetFloat', help='model used') 397 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 398 | parser.add_argument('--var', type=float, help='debug variable') 399 | opt = parser.parse_args() 400 | last = wdir + 'last_%s.pt'%opt.name 401 | opt.weights = last if opt.resume else opt.weights 402 | print(opt) 403 | device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) 404 | # scale hyp['obj'] by img_size (evolved at 320) 405 | # hyp['obj'] *= opt.img_size[0] / 320. 
406 | 407 | tb_writer = None 408 | if not opt.evolve: # Train normally 409 | try: 410 | # Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/ 411 | from torch.utils.tensorboard import SummaryWriter 412 | 413 | tb_writer = SummaryWriter() 414 | except: 415 | pass 416 | 417 | train() # train normally 418 | 419 | else: # Evolve hyperparameters (optional) 420 | opt.notest, opt.nosave = True, True # only test/save final epoch 421 | if opt.bucket: 422 | os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists 423 | 424 | for _ in range(1): # generations to evolve 425 | if os.path.exists('evolve.txt'): # if evolve.txt exists: select best hyps and mutate 426 | # Select parent(s) 427 | parent = 'single' # parent selection method: 'single' or 'weighted' 428 | x = np.loadtxt('evolve.txt', ndmin=2) 429 | n = min(5, len(x)) # number of previous results to consider 430 | x = x[np.argsort(-fitness(x))][:n] # top n mutations 431 | w = fitness(x) - fitness(x).min() # weights 432 | if parent == 'single' or len(x) == 1: 433 | # x = x[random.randint(0, n - 1)] # random selection 434 | x = x[random.choices(range(n), weights=w)[0]] # weighted selection 435 | elif parent == 'weighted': 436 | x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination 437 | 438 | # Mutate 439 | method, mp, s = 3, 0.9, 0.2 # method, mutation probability, sigma 440 | npr = np.random 441 | npr.seed(int(time.time())) 442 | g = np.array([1, 1, 1, 1, 1, 1, 1, 0, .1, 1, 0, 1, 1, 1, 1, 1, 1, 1]) # gains 443 | ng = len(g) 444 | if method == 1: 445 | v = (npr.randn(ng) * npr.random() * g * s + 1) ** 2.0 446 | elif method == 2: 447 | v = (npr.randn(ng) * npr.random(ng) * g * s + 1) ** 2.0 448 | elif method == 3: 449 | v = np.ones(ng) 450 | while all(v == 1): # mutate until a change occurs (prevent duplicates) 451 | # v = (g * (npr.random(ng) < mp) * npr.randn(ng) * s + 1) ** 2.0 452 | v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) 453 | for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) 454 | hyp[k] = x[i + 7] * v[i] # mutate 455 | 456 | # Clip to limits 457 | keys = ['lr0', 'iou_t', 'momentum', 'weight_decay', 'hsv_s', 'hsv_v', 'translate', 'scale', 'fl_gamma'] 458 | limits = [(1e-5, 1e-2), (0.00, 0.70), (0.60, 0.98), (0, 0.001), (0, .9), (0, .9), (0, .9), (0, .9), (0, 3)] 459 | for k, v in zip(keys, limits): 460 | hyp[k] = np.clip(hyp[k], v[0], v[1]) 461 | 462 | # Train mutation 463 | results = train() 464 | 465 | # Write mutation results 466 | print_mutation(hyp, results, opt.bucket) 467 | 468 | # Plot results 469 | # plot_evolution_results(hyp) 470 | -------------------------------------------------------------------------------- /readme.md: 1 | # DeepBurning-MixQ 2 | 3 | This is part of the [DeepBurning project](https://github.com/groupsada/DeepBurning) developed for agile neural network accelerator design at the Institute of Computing Technology, Chinese Academy of Sciences. It focuses on the software/hardware co-optimization of FPGA-based accelerators for low bit-width mixed-precision neural network models. In terms of hardware, we mainly explore packing methods for various low bit-width convolution operators, so that each primitive DSP in FPGAs can accommodate as many low bit-width operations as possible, thereby improving DSP utilization.
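As a toy illustration of the packing idea (illustrative code only, not from this repo; the 16-bit guard field is chosen arbitrarily for the example), two small unsigned multiplications that share one activation can travel through a single wide multiply, which is what allows one DSP to absorb several low bit-width operations:

```python
# Minimal sketch of DSP packing: two unsigned low bit-width weights w1, w2
# share one activation a. Placing w1 in a high field keeps the two partial
# products from overlapping, so a single wide multiply yields both results.
def packed_mul(w1, w2, a, guard=16):
    packed = (w1 << guard) + w2     # pack both weights into one operand
    prod = packed * a               # one hardware multiply
    p2 = prod & ((1 << guard) - 1)  # low field  -> w2 * a
    p1 = prod >> guard              # high field -> w1 * a
    return p1, p2

assert packed_mul(7, 13, 9) == (7 * 9, 13 * 9)
```

The factor tables in `anypacking/dsp_packing.py` record how many such operations one DSP can absorb for each weight/activation bit-width pair and kernel size, and account for details (such as signed operands) that this unsigned sketch omits.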
In terms of the model, we mainly utilize the differentiable NAS (neural architecture search) technique to perform mixed-precision quantization on the given model, while also considering the hardware implementation efficiency of the quantized model, in order to efficiently deploy the target convolutional neural network model onto FPGAs under given resource constraints. 4 | 5 | This work was published at ICCAD'23; please refer to the paper for more details. (DOI:[10.1109/ICCAD57390.2023.10323831](https://doi.org/10.1109/ICCAD57390.2023.10323831)) 6 | 7 | Erjing Luo#, Haitong Huang#, Cheng Liu*, Guoyu Li, Bing Yang, Ying Wang, Huawei Li, Xiaowei Li, "DeepBurning-MixQ: An Open Source Mixed-Precision Neural Network Accelerator Design Framework for FPGAs", ICCAD, 2023. (# equal contribution) 8 | 9 | ## Status 10 | This project mainly explores automatic HW/SW co-optimization of FPGA-based neural network accelerators for mixed-precision neural network models. Currently, the mixed-precision neural network models are fully pipelined across the FPGA, so the framework mainly targets smaller neural network models with a limited number of layers. A hybrid multi-core neural network accelerator that can accommodate generic mixed-precision neural network models will come soon. 11 | 12 | This repo includes the training, quantization, and weight export of hardware-aware mixed-precision neural network models. For efficient FPGA HLS operators and optimization code, please refer to [https://github.com/MatthewLuo7/MixQ_Gen_Accel](https://github.com/MatthewLuo7/MixQ_Gen_Accel). 13 | 14 | ## Classification Model 15 | 16 | ### Usage 17 | ```bash 18 | cd cifar/ 19 | 20 | # 1. Hardware-aware Mixed Precision NAS 21 | python search_train.py --cd 3e-5 --name mix_vggtiny_cifar_cd3e5 22 | # Params: 23 | # --cd Stands for complexity decay 24 | # --name Stands for checkpoint .pt and .log filename 25 | # --model Mixed precision supernet model, default is `VGGtiny_MixQ` 26 | # Then, the optimal bit width of each layer will converge after dozens of epochs, for example bitw={8,2,2,2,2,2}, bita = {8,3,3,3,6,3} 27 | 28 | 29 | # 2. Main train 30 | python main_train.py --bitw 822222 --bita 833363 --name vggtiny_cifar_cd3e5 31 | # Trained weights are under weights/vggtiny_cifar_cd3e5.pt 32 | 33 | 34 | # 3. Test model 35 | python test_acc.py 36 | # You can choose vggtiny_cifar_cd3e5.pt for the test if nothing went wrong 37 | 38 | # 4. HLS code generation: 39 | # You can now directly export the HLS configuration header and weight file from the .pt weight file. 40 | # First, adjust the `simd, pe` parallelization factors of each layer. 41 | vim hls/config_simd_pe.txt 42 | # Export `config.h` and `weights.hpp` to /hls/vggtiny_cifar_cd3e5/ 43 | python export_hls.py 44 | 45 | 46 | # 5. Model-Level Hardware Simulation 47 | # simulate_hw.py requires the /hls/vggtiny_cifar_cd3e5/model_param.pkl file generated by export_hls.py 48 | python simulate_hw.py 49 | # This output should be consistent with the hardware output or the HLS C-level simulation 50 | 51 | ``` 52 | 53 | ## DAC-SDC Object Detection Model 54 | 55 | The DAC System Design Contest focused on low-power object detection on an embedded FPGA system: https://www.dac.com/Conference/System-Design-Contest. 56 | 57 | The target of this contest is to optimize the performance of designs in terms of accuracy and power on an Ultra96-V2 FPGA board. The contest was held five times, from 2018 to 2022, and the performance of the winning designs increased from 30 fps to thousands of fps. 
58 | 59 | Base models for anypacking bitwidth search: 60 | 61 | - UltraNet: https://github.com/heheda365/ultra_net by the BJUT_runner team, 1st place of the 2020 DAC-SDC contest. UltraNet is a VGGNet-like model with far fewer parameters. UltraNet_iSmart, the 2nd place design of the 2021 DAC-SDC contest by the UIUC iSmart team, achieves much better throughput through fixed packing optimization. 62 | - UltraNet_Bypass: https://github.com/heymesut/SJTU_microe by SJTU (2021), 3rd place of the 2021 DAC-SDC contest. A variant of UltraNet with bypass connections. Bypass connections increase model accuracy, but make the design of a pipeline-based NN accelerator more difficult. 63 | - SkyNet: https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of the 2021 DAC-SDC contest. SkyNet is a MobileNet-like lightweight model. 64 | - SkyNetk5: SkyNet with a 5x5 depthwise convolution kernel. Since dwconv uses far fewer calculations than pwconv, a larger kernel brings higher accuracy at slight cost. 65 | 66 | Dataset: See https://byuccl.github.io/dac_sdc_2022/info/. 67 | 68 | **Usage**: First `cd dacsdc/`, then follow the next steps. 69 | 70 | ### 1) Hardware-aware Mixed Precision NAS for bit width 71 | 72 | ```bash 73 | # For UltraNet with mixed precision: 74 | python search_train.py --cd 1e-5 --name mix_ultranet_cd1e5 75 | 76 | # UltraNet with Bypass: 77 | python search_train.py --cd 1e-5 --name mix_ultranet_bypass_cd1e5 --model UltraNetBypass_MixQ 78 | 79 | # SkyNet/SkyNetk5 80 | python search_train.py --cd 1e-5 --name mix_skynet_cd1e5 --model [SkyNet_MixQ | SkyNetk5_MixQ] 81 | ``` 82 | 83 | ### 2) Main Train 84 | 85 | For UltraNet: 86 | ```bash 87 | # UltraNet_BJTU uses full 4-bit weight quantization 88 | python main_train.py --bitw 444444444 --bita 844444444 --name ultranet_BJTU 89 | 90 | # UltraNet_iSmart uses 4/8-bit mixed quantization for weights 91 | python main_train.py --bitw 844444448 --bita 844444444 --name ultranet_iSmart 92 | 93 | # Or use the searched bitw, bita from search_train.py 94 | python main_train.py --bitw <bitw> --bita <bita> --name ultranet_anypacking 95 | ``` 96 | For UltraNet_Bypass/SkyNet/SkyNetk5: 97 | ```bash 98 | python main_train.py --bitw <bitw> --bita <bita> --name <name> --model [UltraNet_Bypass | SkyNet | SkyNetk5] 99 | ``` 100 | 101 | ### 3) Test model 102 | 103 | ```bash 104 | python test.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]] 105 | ``` 106 | 107 | ### 4) HLS export 108 | ```bash 109 | # For UltraNet or UltraNet_Bypass 110 | python export_hls.py [--model UltraNet_Bypass_FixQ] 111 | # For SkyNet or SkyNetk5 112 | python export_hls.py [--model SkyNetk5_FixQ] 113 | ``` 114 | 115 | ### 5) Model-Level Hardware Simulation 116 | ```bash 117 | python simulate_hw.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]] 118 | ``` 119 | 120 | ## Reference 121 | - https://github.com/zhaoweicai/EdMIPS EdMIPS: Rethinking Differentiable Search for Mixed-Precision Neural Networks 122 | - https://github.com/kuangliu/pytorch-cifar Smaller models for the CIFAR dataset 123 | - https://github.com/ultralytics/yolov3.git YOLOv3 training framework 124 | - https://github.com/jiangwx/SkyNet SkyNet by SHTECH, winner of the 2019 DAC-SDC contest 125 | - https://github.com/jgoeders/dac_sdc_2020_designs Winning designs of the 2020 DAC-SDC contest 126 | - https://github.com/heheda365/ultra_net BJUT_runner team, 1st place of the 2020 DAC-SDC contest, UltraNet 127 | - https://github.com/jgoeders/dac_sdc_2021_designs Winning designs of the 2021 DAC-SDC contest 128 | - https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of the 2021 DAC-SDC contest, SkyNet 129 | - 
https://github.com/xliu0709/DACSDC2021 iSmart team, 2nd place of the 2021 DAC-SDC contest, UltraNet with an optimized packing method 130 | - https://github.com/heymesut/SJTU_microe 3rd place of the 2021 DAC-SDC contest by SJTU, a variant of UltraNet with bypass 131 | - https://github.com/jgoeders/dac_sdc_2022_designs Winning designs of the 2022 DAC-SDC contest 132 | - https://github.com/MatthewLuo7/InvolutionNet 3rd place of the 2022 DAC-SDC contest (ours), without the anypacking design 133 | 134 | ## License 135 | 136 | ![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg) ![License: AGPL](https://img.shields.io/badge/License-AGPL-red.svg) 137 | 138 | NOTE that directories in this repo have different licenses. 139 | 140 | The main code `anypacking/` and the example `cifar/` use the MIT license. However, because the DAC-SDC object detection model uses some code from YOLOv3, which is AGPL-licensed, the `dacsdc/` example is also AGPL-licensed. 141 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fffasttime/AnyPackingNet/1d740bf0071bec024a745adc3bcd31426b29f601/utils/__init__.py -------------------------------------------------------------------------------- /utils/torch_utils.py: 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def init_seeds(seed=0): 7 | torch.manual_seed(seed) 8 | 9 | # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html 10 | if seed == 0: 11 | torch.backends.cudnn.deterministic = True 12 | torch.backends.cudnn.benchmark = False 13 | 14 | 15 | def select_device(device='', apex=False, batch_size=None): 16 | # device = 'cpu' or '0' or '0,1,2,3' 17 | cpu_request = device.lower() == 'cpu' 18 | if device and not cpu_request: # if device requested other than 'cpu' 19 | os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable 20 | assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availability 21 | 22 | cuda = False if cpu_request else torch.cuda.is_available() 23 | if cuda: 24 | c = 1024 ** 2 # bytes to MB 25 | ng = torch.cuda.device_count() 26 | # if ng > 1 and batch_size: # check that batch_size is compatible with device_count 27 | # assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng) 28 | x = [torch.cuda.get_device_properties(i) for i in range(ng)] 29 | s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex 30 | for i in range(0, ng): 31 | if i == 1: 32 | s = ' ' * len(s) 33 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 34 | (s, i, x[i].name, x[i].total_memory / c)) 35 | else: 36 | print('Using CPU') 37 | 38 | return torch.device('cuda:0' if cuda else 'cpu') 39 | 40 | 41 | def fuse_conv_and_bn(conv, bn): 42 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ ; folds BN into the conv: W_fused = diag(bn.weight / sqrt(bn.running_var + eps)) @ W_conv 43 | with torch.no_grad(): 44 | # init 45 | fusedconv = torch.nn.Conv2d(conv.in_channels, 46 | conv.out_channels, 47 | kernel_size=conv.kernel_size, 48 | stride=conv.stride, 49 | padding=conv.padding, 50 | bias=True) 51 | 52 | # prepare filters 53 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 54 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 55 | fusedconv.weight.copy_(torch.mm(w_bn, 
w_conv).view(fusedconv.weight.size())) 56 | 57 | # prepare spatial bias 58 | if conv.bias is not None: 59 | b_conv = conv.bias 60 | else: 61 | b_conv = torch.zeros(conv.weight.size(0)) 62 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 63 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) 64 | 65 | return fusedconv 66 | 67 | 68 | def model_info(model, report='summary'): 69 | # Plots a line-by-line description of a PyTorch model 70 | n_p = sum(x.numel() for x in model.parameters()) # number parameters 71 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients 72 | if report == 'full': 73 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) 74 | for i, (name, p) in enumerate(model.named_parameters()): 75 | name = name.replace('module_list.', '') 76 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' % 77 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) 78 | print('Model Summary: %g layers, %g parameters, %g gradients' % (len(list(model.parameters())), n_p, n_g)) 79 | 80 | 81 | def load_classifier(name='resnet101', n=2): 82 | # Loads a pretrained model reshaped to n-class output 83 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision 84 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet') 85 | 86 | # Display model properties 87 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']: 88 | print(x + ' =', eval(x)) 89 | 90 | # Reshape output to n classes 91 | filters = model.last_linear.weight.shape[1] 92 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n)) 93 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters)) 94 | model.last_linear.out_features = n 95 | return model 96 | 97 | log_layerid = 0 98 | def loglayer(x): 99 | global log_layerid 100 | import numpy as np 101 | x=x.numpy() 102 | assert np.issubdtype(x.dtype, np.integer) 103 | with open('_logs/test%d.txt'%log_layerid, 'w') as f: 104 | for i in range(x.shape[0]): 105 | print('C', i, file=f) 106 | for j in range(x.shape[1]): 107 | for k in range(x.shape[2]): 108 | print('%3d'%x[i,j,k], end=',', file=f) 109 | print(file=f) 110 | 111 | log_layerid+=1 112 | -------------------------------------------------------------------------------- /utils/view_pt.py: 1 | import os, sys 2 | from typing import Dict 3 | import argparse 4 | import torch 5 | import glob 6 | 7 | 8 | def select_weight_file(): 9 | files = glob.glob('weights/*.pt') 10 | if len(files) == 0: 11 | print('[Error] No pt file found in current folder') 12 | exit(1) 13 | for i, s in enumerate(files): 14 | print('', i, s) 15 | sel = int(input('Select one .pt file (0-%d): '%(len(files)-1))) 16 | return os.path.split(files[sel])[-1][:-3] 17 | 18 | if __name__=='__main__': 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-w', '--weight', type=str, default=None, help='weights path') 21 | opt = parser.parse_args() 22 | if opt.weight is None: opt.weight = select_weight_file() 23 | 24 | model: Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu') 25 | res = model['training_results'] 26 | print(res) 27 | 28 | if 'model_params' in model: 29 | print(model['model_params']) 30 | 31 | if 'extra' in model: 32 | print(model['extra']) 33 | 
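# Example usage (hypothetical checkpoint name): run this next to a weights/
# folder to print the stored training results plus, when present, the saved
# 'model_params' and 'extra' entries (e.g. the searched bitw/bita strings):
#   python view_pt.py -w mix_ultranet_cd1e5_last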
--------------------------------------------------------------------------------