├── paper ├── quantization.md ├── fpga.md ├── sparse.md ├── NAS.md ├── object_detection.md └── pruning.md ├── doc ├── learn-step.vsd ├── Roofline Model.pdf └── pytorch │ ├── PyTorch 的内核机制.pdf │ └── talk_pytorch.pdf ├── PipeCNN_note_resource ├── de5Alexnet.xlsx ├── images │ ├── dsp.png │ ├── fre.png │ ├── log.png │ ├── mem.png │ ├── ram.png │ ├── reg.png │ ├── flops.png │ ├── conv_read.png │ ├── conv_time.png │ └── read_time.png ├── calculate_parameters.py ├── explore_v4.py └── flops_calculation_v2.py ├── fpga ├── ref_design.md └── fpga.md ├── project ├── project.md └── install.md ├── PipeCNN_note.md ├── README.md └── git_note.md /paper/quantization.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/fpga.md: -------------------------------------------------------------------------------- 1 | ## A list of papers for FPGA Accelerator Designs -------------------------------------------------------------------------------- /doc/learn-step.vsd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/doc/learn-step.vsd -------------------------------------------------------------------------------- /doc/Roofline Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/doc/Roofline Model.pdf -------------------------------------------------------------------------------- /doc/pytorch/PyTorch 的内核机制.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/doc/pytorch/PyTorch 的内核机制.pdf -------------------------------------------------------------------------------- /doc/pytorch/talk_pytorch.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/doc/pytorch/talk_pytorch.pdf -------------------------------------------------------------------------------- /PipeCNN_note_resource/de5Alexnet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/de5Alexnet.xlsx -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/dsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/dsp.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/fre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/fre.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/log.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/mem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/mem.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/ram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/ram.png -------------------------------------------------------------------------------- 
/PipeCNN_note_resource/images/reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/reg.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/flops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/flops.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/conv_read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/conv_read.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/conv_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/conv_time.png -------------------------------------------------------------------------------- /PipeCNN_note_resource/images/read_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doonny/basic_knowledge/HEAD/PipeCNN_note_resource/images/read_time.png -------------------------------------------------------------------------------- /fpga/ref_design.md: -------------------------------------------------------------------------------- 1 | ## A list of FPGA accelerator design 2 | 3 | - [Ultra_net : A FPGA-based Object Detection for the DAC-SDC 2020, 1st place.](https://github.com/heheda365/ultra_net) 4 | 5 | - [SkrSkr, FPGA-based Object Detection for the DAC-SDC 2020, 2nd place.](https://github.com/jiangwx/SkrSkr) 6 | 7 | - [SkyNet, a new hardware-efficient DNN specialized in object 
detection and tracking, DAC 2019, 1st place](https://github.com/TomG008/SkyNet) 8 | 9 | - [XJTU-Tripler design in DAC19 Conference System Design Competition (SDC), 2nd place](https://github.com/venturezhao/XJTU-Tripler) -------------------------------------------------------------------------------- /paper/sparse.md: -------------------------------------------------------------------------------- 1 | ## Papers related to sparse convolution algorithm and hardware implementation 2 | 3 | - Addressing Sparsity in Deep Neural Networks, 2019 TCAD 4 | - An Efficient Hardware Accelerator for Sparse Convolutional Neural Networks on FPGAs, 2019 FCCM 5 | - Promoting the Harmony between Sparsity and Regularity A Relaxed Synchronous Architecture for Convolutional Neural Networks, 2019 TC 6 | - CirCNN Accelerating and Compressing Deep Neural Networks Using Block-CirculantWeight Matrices, 2018 MICRO 7 | - Running Sparse and Low-Precision Neural Network When Algorithm Meets Hardware, 2018 ASP-DAC -------------------------------------------------------------------------------- /project/project.md: -------------------------------------------------------------------------------- 1 | ## 综述型项目 2 | 3 | ### PyTorch基础 4 | 5 | - [动手学深度学习PyTorch版](https://github.com/ShusenTang/Dive-into-DL-PyTorch)(上手学习) 6 | 7 | - [PyTorchTricks](https://github.com/lartpang/PyTorchTricks)(Pytorch代码性能优化) 8 | 9 | ### 知识蒸馏 10 | 11 | - [Knowledge-Distillation-Zoo](https://github.com/AberHu/Knowledge-Distillation-Zoo) 12 | 13 | - [distiller](https://github.com/karanchahal/distiller) 14 | 15 | 16 | ### 强化学习 17 | 18 | - [Deep-Reinforcement-Learning-Algorithms-with-PyTorch](https://github.com/p-christ/Deep-Reinforcement-Learning-Algorithms-with-PyTorch) 19 | 20 | 21 | 22 | 23 | 24 | ## 剪枝项目 25 | 26 | - 进化算法+元学习 [MetaPruning](https://github.com/liuzechun/MetaPruning) 27 | 28 | - 强化学习 [AMC](https://github.com/mit-han-lab/amc) 29 | 30 | - 可微分马尔科夫 [DMCP](https://github.com/Zx55/dmcp) 31 | 32 | 33 | ## 量化 34 | 35 | - 
AQD + 多种量化方法[model-quantization](https://github.com/blueardour/model-quantization) 36 | 37 | - 强化学习 [HAQ](https://github.com/mit-han-lab/haq) 38 | 39 | - Data-Free量化 [ZeroQ](https://github.com/amirgholami/ZeroQ) 40 | 41 | ## 量化&剪枝 42 | 43 | - 韩松基于进化算法的NAS+剪枝+量化 [APQ](https://github.com/mit-han-lab/apq) 44 | 45 | - 剪枝+量化 [model-compression](https://github.com/666DZY666/model-compression)(综合性工程) 46 | 47 | 48 | ## NAS 49 | 50 | - 分布式的 DARTS 项目 [CDARTS](https://github.com/researchmm/CDARTS) 51 | 52 | - 韩松 proxylessnas [proxylessnas](https://github.com/mit-han-lab/proxylessnas) 53 | 54 | - 韩松 once-for-all [once-for-all](https://github.com/mit-han-lab/once-for-all) 55 | 56 | - 旷世 SinglePathOneShot [SinglePathOneShot](https://github.com/megvii-model/SinglePathOneShot) 57 | 58 | 59 | ## 即插即用高效卷积设计 60 | 61 | - 加法卷积替代乘法卷积 [AdderNet](https://github.com/huawei-noah/AdderNet) 62 | 63 | - 移位卷积替代乘法卷积 [DeepShift](https://github.com/mostafaelhoushi/DeepShift) 64 | 65 | - CVPR 2020 [GhostNet](https://github.com/huawei-noah/ghostnet) 66 | 67 | - IJCAI 2020 [SPConv](https://github.com/qiulinzhang/SPConv.pytorch) -------------------------------------------------------------------------------- /fpga/fpga.md: -------------------------------------------------------------------------------- 1 | ## 如果你使用 Xilinx FPGA 2 | 3 | ### 首先咨询老师你的研究路线是下面哪条? 
4 | 5 | ### 如果你的任务是基于FPGA进行算法加速 6 | - 首先,学习王老师《异构计算》课程实验,配合阅读《Vitis Unified Software Platform Documentation, Application Acceleration Development》,学习掌握Vitis环境使用(仿真、硬件部署、对比硬件加速比); 7 | - 接下来,阅读《The HLS Book》和学习《Vitis High-Level Synthesis User Guide》,学习HLS代码优化方法;同时学习Xilinx官方Vitis Example代码(参考下面链接); 8 | - 最后,学习实验室PipeCNN-v2神经网络硬件加速器代码,学习分类、目标检测等实例代码。 9 | 10 | ### 如果你的任务是基于Zynq或者MPSoC处理器做设计 11 | - 首先,学习Zynq处理器基本架构,详细阅读《The Zynq Book》,使用Zedboard,完成UG1165和《Zynq Book Tutorial》相关实验,学会PS启动和C/C++应用程序编写,学会使用Vivado/Vitis在PL端配置相关IP核(如GPIO、AXI-DMA、AXI-Stream接口等); 12 | - 接下来,学习高层次综合HLS方法设计IP核,阅读《The HLS Book》,学会在PL端设计算法加速电路; 13 | - 最后,学习在Zynq处理器上启动Linux、在Linux下设计应用程序,学习Petalinux使用,学习基于Vitis的算法硬件加速方法,参考UG1391,学习Github上Xilinx官方的2个教程[Vitis_Tutorial](https://github.com/Xilinx/Vitis-Tutorials)和Vitis_Accel_Examples(见下面链接)。 14 | 15 | 16 | Best Book: 17 | - [The Zynq Book](http://www.zynqbook.com) (新生入门必读) 18 | - [Zynq MPSoC Book](https://www.zynq-mpsoc-book.com) 19 | - [Parallel Programming for FPGAs - The HLS Book](http://kastner.ucsd.edu/hlsbook/) (新生入门必读) 20 | 21 | Board Tutorial: 22 | 23 | - UG1165 Zynq-7000 SoC: Embedded Design Tutorial (新生入门必须完成的实验) 24 | - UG1209 Zynq UltraScale+ MPSoC: Embedded Design Tutorial 25 | 26 | Vitis: 27 | 28 | - UG1393 Vitis Unified Software Platform Documentation, Application Acceleration Development (必读) 29 | - UG1400 Vitis Unified Software Platform Documentation, Embedded Software Development 30 | - [官方Vitis入门教程含代码](https://github.com/Xilinx/Vitis-Tutorials) (新生入门必做: 01.Getting Started, 02.Hardware_Accelerators) 31 | - [Vitis设计FPGA加速例程代码](https://github.com/Xilinx/Vitis_Accel_Examples) 和 [例程2](https://github.com/Xilinx/Vitis-HLS-Introductory-Examples)(入门后必读代码) 32 | - UG1399 Vitis High-Level Synthesis User Guide (进阶必读) 33 | 34 | Vivado: 35 | 36 | - UG898 Vivado Design Suite User Guide, Embedded Processor Hardware Design 37 | - UG893 Vivado Design Suite User Guide, Using the Vivado IDE 38 | 39 | A few useful blogs: 40 | 41 | - [Xilinx vitis学习教程:ZYNQ之Hello 
world](https://blog.csdn.net/longfei_3/article/details/103757018) 42 | 43 | Xilinx Design Hub: 44 | 45 | - [所有资料分门别类大汇总](https://www.xilinx.com/support/documentation-navigation/design-hubs.html) 46 | 47 | ## 如果你使用 Intel FPGA 48 | 49 | * ''OpenCL Programming Guide'', Aaftab Munshi, et.al., 2012 [also known as The Green Book] 50 | * [''FPGA异构计算——基于OpenCL的开发方法''](https://baike.baidu.com/item/FPGA%E5%BC%82%E6%9E%84%E8%AE%A1%E7%AE%97%E2%80%94%E2%80%94%E5%9F%BA%E4%BA%8EOpenCL%E7%9A%84%E5%BC%80%E5%8F%91%E6%96%B9%E6%B3%95), 黄乐天 等, 2015 51 | 52 | Also, refers to Intel/Xilinx's OpenCL user guide to learn specific techniques that will be used in the project. 53 | 54 | * [''面向 OpenCL 的英特尔 FPGA SDK 最佳实践指南''](https://www.intel.cn/content/www/cn/zh/programmable/products/design-software/embedded-software-developers/opencl/support.html) 55 | 56 | 除此之外,需要学习官方文档: 57 | 58 | - Intel FPGA SDK for OpenCL Pro Edition Programming Guide 59 | - Intel FPGA SDK for OpenCL Pro Edition Best Practices Guide -------------------------------------------------------------------------------- /paper/NAS.md: -------------------------------------------------------------------------------- 1 | # NAS paper list 2 | 3 | ## 综述 4 | 5 | NAS 的基本思路是给定一个称为搜索空间的候选神经网络结构集合,用某种策略从中搜索出最优网络结构。神经网络结构的优劣即性能用某些指标如精度、速度来度量,称为性能评估。**搜索空间,搜索策略和性能评估策略**是NAS问题的三个核心要素。 6 | ![20191111155645.png](http://image.jingsnow.com/image/20191111155645.png) 7 | 8 | 9 | | Title | Venue | Search Method | Memory Consumption(supernet) | Code | 10 | |:--------|:--------:|:--------:|:--------:|:--------:| 11 | | [Neural Architecture Search with reinforcement learning](https://arxiv.org/abs/1611.01578) | ICLR 2017 | RL-based | - | - | 12 | | [Learning Transferable Architectures for Scalable Image Recognition](https://arxiv.org/abs/1707.07012) | CVPR 2018 | RL-based | - | - | 13 | | [Progressive Neural Architecture Search](https://arxiv.org/abs/1712.00559) | ECCV 2018 | RL-based | - | - | 14 | | [MnasNet: Platform-Aware Neural Architecture Search 
for Mobile](https://arxiv.org/abs/1807.11626?context=cs.LG) | CVPR 2019 | RL-based | - | - | 15 | | [Regularized Evolution for Image Classifier Architecture Search](https://arxiv.org/abs/1802.01548) | AAAI 2019 | EA-based | - | - | 16 | | [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055) | ICLR 2019 | Gradient-based | whole supernet |[github](https://github.com/quark0/darts)| 17 | | [ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware](https://arxiv.org/pdf/1812.00332.pdf) | ICLR 2019 | RL+GD | Two paths |[github](https://github.com/MIT-HAN-LAB/ProxylessNAS)| 18 | | [Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) | 2019.04 arxiv | Gradient-based | whole supernet |[github](https://github.com/chenxin061/pdarts)| 19 | | [PC-DARTS: Partial Channel Connections for Memory-Efficient Differentiable Architecture Search](https://arxiv.org/abs/1907.05737v1) | 2019.07 arxiv | Gradient-based | whole supernet |[github](https://github.com/yuhuixu1993/PC-DARTS)| 20 | | [Densely Connected Search Space for More Flexible Neural Architecture Search](https://arxiv.org/abs/1906.09607) | 2019.06 arxiv | Gradient-based | whole supernet |[github](https://github.com/JaminFong/DenseNAS)| 21 | | [Efficient Neural Architecture Search via Parameter Sharing](https://arxiv.org/abs/1802.03268) | ICML 2018 | RL-based | Single path |[github](https://github.com/carpedm20/ENAS-pytorch)| 22 | | [Single-Path NAS: Designing Hardware-Efficient ConvNets in less than 4 Hours](https://arxiv.org/abs/1904.02877?context=cs) | 2019.04 arxiv | RL-based | Single path with super kernels |[github](https://github.com/dstamoulis/single-path-nas)| 23 | | [Single Path One-Shot Neural Architecture Search with Uniform Sampling](https://arxiv.org/abs/1904.00420?context=cs.CV) | 2019.03.31 arxiv | EA-based | Single path |[github](https://github.com/megvii-model/SinglePathOneShot)| 24 | | 
"""Enumerate the VEC_SIZE and LANE_NUM values that are legal for a network.

This script holds the AlexNet layer table and prints two lists: every
candidate VEC_SIZE and every candidate LANE_NUM that passes the divisibility
checks implemented by ``valid_vec_sizes`` and ``valid_lane_nums``.
"""

# AlexNet layer table.  Every row has 25 entries; the meaning of each position
# is given by the field indices defined right after the table.
layer_config = [
    [0,
     227, 227, 3, 11, 11, 3, 96, 96,
     0,
     55, 55, 96, 4, 0, 0, 1,
     1, 27, 27, 96, 3, 2,
     1,
     1],  # Layer-1
    [0,
     27, 27, 96, 5, 5, 48, 256, 256,
     0,
     27, 27, 256, 1, 2, 1, 1,
     1, 13, 13, 256, 3, 2,
     1,
     1],  # Layer-2
    [0,
     13, 13, 256, 3, 3, 256, 384, 384,
     0,
     13, 13, 384, 1, 1, 0, 1,
     0, 13, 13, 384, 0, 0,
     0,
     1],  # Layer-3
    [0,
     13, 13, 384, 3, 3, 192, 384, 384,
     1,
     13, 13, 384, 1, 1, 1, 1,
     0, 13, 13, 384, 0, 0,
     0,
     0],  # Layer-4
    [0,
     13, 13, 384, 3, 3, 192, 256, 256,
     0,
     13, 13, 256, 1, 1, 1, 1,
     1, 6, 6, 256, 3, 2,
     0,
     1],  # Layer-5  Note: for the last conv layer, outputs are written to the fc buffer
    [1,
     6, 6, 256, 6, 6, 256, 4096, 4096,  # Note: the input size (dim1/dim2) is the combined data size (batched)
     4,
     1, 1, 4096, 6, 0, 0, 1,
     0, 1, 1, 4096, 0, 0,
     0,
     2],  # Layer-6 fc
    [1,
     1, 1, 4096, 1, 1, 4096, 4096, 4096,
     2,
     1, 1, 4096, 1, 0, 0, 1,
     0, 1, 1, 4096, 0, 0,
     0,
     3],  # Layer-7 fc
    [1,
     1, 1, 4096, 1, 1, 4096, 1024, 1024,
     3,
     1, 1, 1024, 1, 0, 0, 0,
     0, 1, 1, 1024, 0, 0,
     0,
     2],  # Layer-8 fc
]

# Configuration file instructions: column index of every field in a row.
(
    layer_type,  # "0" -> conv, "1" -> fc
    # memRd parameters
    data_w, data_h, data_n, weight_w, weight_h, weight_n, weight_m, bias_size,
    # "0" -> data_buf, "1" -> output_buf, "2" -> "fc_1_buffer",
    # "3" -> "fc_2_buffer", "4" -> "pool_buffer", "5" -> "eltwise_buf" (resnet)
    memrd_src,
    # Conv parameters
    conv_x, conv_y, conv_z, conv_stride, conv_padding, conv_split, conv_relu,
    # Pooling parameters
    pool_on, pool_x, pool_y, pool_z, pool_size, pool_stride,
    lrn_on,  # lrn on/off control
    # "0" -> data_buf, "1" -> output_buf, "2" -> "fc_1_buffer", "3" -> "fc_2_buffer"
    memwr_dst,
) = range(25)


def _ceil_div(numerator, denominator):
    """Return ceil(numerator / denominator) using exact integer arithmetic."""
    return -(-numerator // denominator)


def valid_vec_sizes(layers, candidates=range(4, 100), first_layer=1):
    """Return the candidates that divide ``weight_n`` of every checked layer.

    Args:
        layers: Layer table; rows are indexed with the field constants above.
        candidates: VEC_SIZE values to try; the result keeps their order.
        first_layer: Index of the first row to check.  The default of 1 skips
            the input layer, which is excluded from this rule.

    Returns:
        The accepted candidates as a list.  When no row is checked (the table
        has ``first_layer`` rows or fewer) every candidate is accepted.
    """
    checked = layers[first_layer:]
    return [
        vec_size
        for vec_size in candidates
        if all(row[weight_n] % vec_size == 0 for row in checked)
    ]


def valid_lane_nums(layers, candidates=range(2, 100), first_layer=1):
    """Return the candidates that give every split layer an even group count.

    For each checked row whose ``conv_split`` is 1, the value
    ``ceil(weight_m / lane_num)`` must be even.  Rows with any other
    ``conv_split`` value place no constraint on the candidate.

    Args:
        layers: Layer table; rows are indexed with the field constants above.
        candidates: LANE_NUM values to try; the result keeps their order.
        first_layer: Index of the first row to check (rows before it are
            skipped, as in ``valid_vec_sizes``).

    Returns:
        The accepted candidates as a list.
    """
    # Only split layers constrain LANE_NUM, so filter them once up front.
    split_rows = [row for row in layers[first_layer:] if row[conv_split] == 1]
    return [
        lane_num
        for lane_num in candidates
        if all(_ceil_div(row[weight_m], lane_num) % 2 == 0 for row in split_rows)
    ]


# VEC_SIZE must be at least 4 and, for every layer except the input layer,
# weight_n must be divisible by it.  For AlexNet: 4, 8, 16; typical value 16.
VEC_SIZE_LIST = valid_vec_sizes(layer_config)
print(VEC_SIZE_LIST)

# LANE_NUM must be greater than 1.  For AlexNet: 2, 3, 4, 8, 12, 13, 15, 16,
# 22, 28, 32, 33, 34, 48, ...; typical value 16.
LANE_NUM_LIST = valid_lane_nums(layer_config)
print(LANE_NUM_LIST)
21 | 22 | ![deep_learning_object_detection_history.png](http://image.jingsnow.com/image/deep_learning_object_detection_history.png) 23 | 24 | 25 | ### Performance table 26 | 27 | FPS(Speed) index is related to the hardware spec(e.g. CPU, GPU, RAM, etc), so it is hard to make an equal comparison. The solution is to measure the performance of all models on hardware with equivalent specifications, but it is very difficult and time consuming. 28 | 29 | | Detector | VOC07(mAP@IoU=0.5) | VOC12(mAP@IoU=0.5) | COCO(mAP@IoU=0.5:0.95) | Published In | 30 | |:--------|:--------:|:--------:|:--------:|:--------:| 31 | | R-CNN | 58.5 | - | - | CVPR'14 | 32 | | SPP-Net | 59.2 | - | - | ECCV'14 | 33 | |MR-CNN |78.2 (07+12)| 73.9 (07+12)| - | ICCV'15 | 34 | | Fast R-CNN | 70.0 (07+12) | 68.4 (07++12) |19.7 | ICCV'15 | 35 | | Faster R-CNN | 73.2 (07+12) | 70.4 (07++12)| 21.9 | NIPS'15 | 36 | | YOLO v1 | 66.4 (07+12) | 57.9 (07++12) | - | CVPR'16 | 37 | | G-CNN | 66.8 | 66.4 (07+12) | - | CVPR'16 | 38 | | AZNet | 70.4 | - | 22.3 | CVPR'16 | 39 | | ION | 80.1 | 77.9 | 33.1 | CVPR'16 | 40 | | HyperNet | 76.3 (07+12) | 71.4 (07++12) | - | CVPR'16 | 41 | | OHEM | 78.9 (07+12) | 76.3 (07++12) | 22.4 | CVPR'16 | 42 | | MPN | - | - | 33.2 | BMVC'16 | 43 | |SSD| 76.8 (07+12)| 74.9 (07++12)| 31.2| ECCV'16| 44 | |GBDNet| 77.2 (07+12)| -| 27.0| ECCV'16| 45 | |CPF| 76.4 (07+12)| 72.6 (07++12)| -| ECCV'16| 46 | |R-FCN |79.5 (07+12)| 77.6 (07++12)| 29.9| NIPS'16| 47 | |DeepID-Net| 69.0| - |-| PAMI'16| 48 | |NoC |71.6 (07+12) |68.8 (07+12) |27.2| TPAMI'16| 49 | |DSSD |81.5 (07+12)| 80.0 (07++12)| 33.2| arXiv'17| 50 | |TDM| - |- |37.3 |CVPR'17| 51 | |FPN| - |-| 36.2| CVPR'17| 52 | |YOLO v2| 78.6 (07+12)| 73.4 (07++12)| -| CVPR'17| 53 | |RON| 77.6 (07+12)| 75.4 (07++12)| 27.4| CVPR'17| 54 | |DeNet| 77.1 (07+12)| 73.9 (07++12)| 33.8| ICCV'17| 55 | |CoupleNet| 82.7 (07+12)| 80.4 (07++12)| 34.4| ICCV'17| 56 | |RetinaNet| - |- |39.1| ICCV'17| 57 | |DSOD |77.7 (07+12)| 76.3 (07++12) |-| ICCV'17| 58 | 
|SMN |70.0| -| - |ICCV'17| 59 | |Light-Head R-CNN| - |-| 41.5| arXiv'17| 60 | |YOLO v3| - |-| 33.0| arXiv'18| 61 | |SIN| 76.0 (07+12)| 73.1 (07++12)| 23.2| CVPR'18| 62 | |STDN |80.9 (07+12)| - |-| CVPR'18| 63 | |RefineDet| 83.8 (07+12)| 83.5 (07++12)| 41.8| CVPR'18| 64 | |SNIP| - |- |45.7 |CVPR'18| 65 | |Relation-Network| -| - |32.5| CVPR'18| 66 | |Cascade R-CNN| - |-| 42.8| CVPR'18| 67 | |MLKP |80.6 (07+12)| 77.2 (07++12)| 28.6| CVPR'18| 68 | |Fitness-NMS| - |-| 41.8| CVPR'18| 69 | |RFBNet| 82.2 (07+12)| -| - |ECCV'18| 70 | |CornerNet| - |-| 42.1| ECCV'18| 71 | |PFPNet| 84.1 (07+12)| 83.7 (07++12)| 39.4| ECCV'18| 72 | |Pelee| 70.9 (07+12)| -| - |NIPS'18| 73 | |HKRM| 78.8 (07+12)| -| 37.8| NIPS'18| 74 | |M2Det| -| -| 44.2| AAAI'19| 75 | |R-DAD| 81.2 (07++12)| 82.0 (07++12)| 43.1| AAAI'19| 76 | 77 | ### Dataset Papers 78 | Statistics of commonly used object detection datasets. The table comes from [this survey paper](https://arxiv.org/pdf/1809.02165v1.pdf). 79 | 80 | ![deep_learning_object_detection_dataset.png](http://image.jingsnow.com/image/deep_learning_object_detection_dataset.png) 81 | 82 | 83 | reference:https://github.com/hoya012/deep_learning_object_detection 84 | -------------------------------------------------------------------------------- /paper/pruning.md: -------------------------------------------------------------------------------- 1 | # Pruning Read List 2 | 3 | ## Non-Structured Pruning 4 | 5 | | Title | Venue | Method | Type | Code | 6 | |:--------|:--------:|:--------:|:--------:|:--------:| 7 | | [Learning both Weights and Connections for Efficient Neural Networks](https://arxiv.org/abs/1506.02626) | NIPS 2015 (必读) | Absolute value metric | Weights-Pruning |[github](https://github.com/jack-willturner/DeepCompression-PyTorch)| 8 | |[Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding](https://arxiv.org/abs/1510.00149)| ICLR 2016 (best paper)(必读) | Deep Compression| Weights-Pruning | 
[github](https://github.com/jack-willturner/DeepCompression-PyTorch)| 9 | |[Dynamic Network Surgery for Efficient DNNs](https://arxiv.org/abs/1608.04493)| NIPS 2016 | Dynamic Pruning | Weights-Pruning | [github](https://github.com/yiwenguo/Dynamic-Network-Surgery)| 10 | |[Bayesian Compression for Deep Learning](https://papers.nips.cc/paper/6921-bayesian-compression-for-deep-learning.pdf) | NIPS 2017 | Bayesian | Weights-Pruning | [github](https://github.com/KarenUllrich/Tutorial_BayesianCompressionForDL)| 11 | |[A Systematic DNN Weight Pruning Framework using Alternating Direction Method of Multipliers](https://arxiv.org/abs/1804.03294)| ECCV 2018(必读)| ADMM | Weights-Pruning | [github](https://github.com/KaiqiZhang/admm-pruning)| 12 | | [To prune, or not to prune: exploring the efficacy of pruning for model compression](https://arxiv.org/abs/1710.01878)| ICLR 2018 | Progressive Pruning | Weights-Pruning | - | 13 | | [Bandwidth-Efficient Deep Learning](https://ieeexplore.ieee.org/document/8465812)| DAC 2018 | AutoML | Weights-Pruning | - | 14 | | [Frequency-Domain Dynamic Pruning for Convolutional Neural Networks](https://papers.nips.cc/paper/7382-frequency-domain-dynamic-pruning-for-convolutional-neural-networks.pdf)| NIPS 2018 | DCT Frequency-Domain Pruning | Weights-Pruning | - | 15 | | [CLIP-Q: Deep Network Compression Learning by In-Parallel Pruning-Quantization](http://www.sfu.ca/~ftung/papers/clipq_cvpr18.pdf)| CVPR 2018 | Pruning & mix-Quantization | Weights | - | 16 | | [SNIP: Single-shot Network Pruning based on Connection Sensitivity](https://arxiv.org/abs/1810.02340) | ICLR 2019 | One-shot Pruning | Weights-Pruning | [github](https://github.com/namhoonlee/snip-public)| 17 | | [Energy-Constrained Compression for Deep Neural Networks via Weighted Sparse Projection and Layer Input Masking](https://openreview.net/forum?id=BylBr3C9K7) | ICLR 2019 | Energy-Constrained Pruning | Weights-Pruning | 
[github](https://github.com/hyang1990/model_based_energy_constrained_compression)| 18 | | [The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks](https://openreview.net/forum?id=rJl-b3RcF7) | ICLR 2019 (best paper)| Lottery Ticket Hypothesis | Weights-Pruning | [github](https://github.com/rahulvigneswaran/Lottery-Ticket-Hypothesis-in-Pytorch) | 19 | 20 | 21 | 22 | ## Structured Pruning 23 | 24 | | Title | Venue | Method | Type | Code | 25 | |:--------|:--------:|:--------:|:--------:|:--------:| 26 | |[Learning Structured Sparsity in Deep Neural Networks](https://papers.nips.cc/paper/6504-learning-structured-sparsity-in-deep-neural-networks.pdf)| NIPS 2016 (必读) | Group Lasso | Filter-Pruning | - | 27 | |[Channel Pruning for Accelerating Very Deep Neural Networks](https://arxiv.org/pdf/1707.06168v2.pdf) | ICCV 2017(必读) | Lasso | Filter-Pruning | [github](https://github.com/yihui-he/channel-pruning) | 28 | | [Pruning Filters for Efficient ConvNets](https://arxiv.org/abs/1608.08710)| ICLR 2017 | L1 norm metric | Filter-Pruning | [github](https://github.com/rahulvigneswaran/Lottery-Ticket-Hypothesis-in-Pytorch) | - | 29 | | [ThiNet: A Filter Level Pruning Method for Deep Neural Network Compression](https://arxiv.org/abs/1707.06342)| ICCV 2017(必读) | L1 norm metric | Filter-Pruning|[github](https://github.com/Roll920/ThiNet)| 30 | | [Pruning Convolutional Neural Networks for Resource Efficient Inference](https://arxiv.org/abs/1611.06440)| ICLR 2017 | Taylor expansion | Filter-Pruning | [github](https://github.com/Tencent/PocketFlow#channel-pruning)| 31 | | [AMC: Automl for model compression and acceleration on mobile devices](https://arxiv.org/abs/1802.03494) | ECCV 2018 | AutoML(必读) | Filter-Pruning | [github](https://github.com/mit-han-lab/amc-release)| 32 | |[Soft Filter Pruning for Accelerating Deep Convolutional Neural Networks]()| IJCAI 2018 | Soft Filter Pruning | Filter-Pruning |[github](https://github.com/he-y/soft-filter-pruning)| 33 | 
|[Accelerating Convolutional Networks via Global & Dynamic Filter Pruning](https://www.ijcai.org/proceedings/2018/0336.pdf) | IJCAI 2018 | Global Filter Pruning | Filter-Pruning | - | 34 | | [ADAM-ADMM: A Unified, Systematic Framework of Structured Weight Pruning for DNNs](https://www.semanticscholar.org/paper/ADAM-ADMM%3A-A-Unified%2C-Systematic-Framework-of-for-Zhang-Zhang/64db2e2c76aa3f028b6866f91795a7c005a3f13b) | NIPS 2018(没中) | ADMM | Filter-Pruning | [github](https://github.com/KaiqiZhang/ADAM-ADMM)| 35 | |[MetaPruning: Meta Learning for Automatic Neural Network Channel Pruning](https://arxiv.org/abs/1903.10258) | ICCV 2019 | Meta Learning & Genetic algorithm(必读)| Filter-Pruning | [github](https://github.com/liuzechun/MetaPruning)| 36 | -------------------------------------------------------------------------------- /PipeCNN_note.md: -------------------------------------------------------------------------------- 1 | # PipeCNN设计空间探索 2 | 文件中有些公式,可能在github上预览效果不好,可以使用typora软件下载后查看;另外文中的实验数据是以加了profile命令编译的,所以资源消耗偏大一些。我会在跑完新的实验后更新实验数据。 3 | ## 1. PipeCNN并行度分析 4 | 在阐述PipeCNN的并行性之前,我们首先探讨每一层的计算量,由于全连接操作均转化为卷积操作,所以本文主要讨论卷积运算。假设输入特征图的尺寸为`W*H*N`,卷积核尺寸为`K*K*N*M`,输出卷积结果为`R*C*M`,则为了得到一个卷积输出特征值需要进行的乘法数量为`K*K*N`,之后对`K*K*N`个数进行相加所需的加法数量为`K*K*N-1`,最终再加上`bias`,所以一共需要的运算数目为`2*K*K*N`。而卷积输出的尺寸为`R*C*M`,所以这一层卷积的总共计算量为`2*K*K*N*R*C*M` 5 | 6 | 为了加速计算,PipeCNN框架有两个并行度,即`VEC_SIZE`和`LANE_NUM`,**即有`LANE_NUM`个卷积核同时进行卷积**操作,而在每个卷积核进行卷积时,**同时有`VEC_SIZE`个数进行加乘操作**,即计算一次就会有`2*VEC_SIZE*LANE_NUM`个乘累加操作,若系统的频率为`Freq`则每一层卷积所需要的时间为 7 | $$ 8 | T_{conv}^{l}=\frac{2*K*K*N*R*C*M}{2*VEC\_SIZE*LANE\_NUM*Freq} \tag{1} 9 | $$ 10 | 从上式可以看出,当`VEC_SIZE`和`LANE_NUM`选择合适时,会大大加快卷积计算速度。但是一味增大这两个参数并不会缩短运行时间,接下来我们通过计算来进行探究。 11 | ## 2. 每一层性能瓶颈的分析 12 | 为了减小读取数据时的延时,PipeCNN框架采用了`win_buffer`来缓冲数据,并且使用`ping-pong buffer`。对于每层网络而言,需要将输入分为几个组,每组输入特征的大小为`conv_win_size_dim1x2x3`对应`CONV_GP_SIZE*1*LANE_NUM`个卷积输出特征(PipeCNN中没有Y方向的数据重用),分别对各个组进行卷积,将所有组的卷积结果合并成最后的输出。在每组内需要做两个事情:1. 读取**输入特征**和**权重数据** 2. 
进行卷积运算。首先我们分别对每组中这两种操作所需要的理论时间进行计算。 13 | 14 | 读取**每个GROUP**对应数据所需要的时间:分为从DDR内存中读取输入特征和读取权重的时间,读取输入特征的时间为 15 | $$ 16 | t_{data}=\frac{[(CONV\_GP\_SIZE\_X-1)*S+K]*K*N*DATA\_WIDTH}{DDR\_BANDWIDTH} 17 | $$ 18 | 其中S为卷积核的步长,DATA_WIDTH为数据的宽度,DDR_BANDWIDTH为DDR内存的带宽。 19 | 读取**每个GROUP**对应权重的时间为 20 | $$ 21 | t_{weight}=\frac{K*K*N*LANE\_NUM*DATA\_WIDTH}{DDR\_BANDWIDTH} 22 | $$ 23 | 对于**每个GROUP**进行卷积所需要的时间为 24 | $$ 25 | t_{conv}=\frac{K*K*N*CONV\_GP\_SIZE*LANE\_NUM}{LANE\_NUM*VEC\_SIZE*FREQUENCY} 26 | $$ 27 | 在读取**每个GROUP**所需要的时间在下图中用黄色柱状图表示,对**每个GROUP**进行卷积所需要的时间在下图中用蓝色柱状图表示,可以看出来,在Alexnet的前几个卷积层,卷积时间大于读取权重的时间,所以前5层的性能瓶颈在于卷积速度。而6、7、8三层的全连接层,读取数据的时间要远远大于计算时间,所以这时的性能瓶颈在于数据的读取。 28 | 29 | ![](PipeCNN_note_resource/images/conv_read.png) 30 | 31 | 普通卷积层每组读取数据所消耗时间和每组卷积所消耗时间的示意图如下,可以看出卷积时间大于从DDR内存读取数据的时间,所以性能瓶颈在于卷积操作,该层所消耗的时间可以用公式(1)进行计算。 32 | $$ 33 | T_{conv}^{l}=\frac{2*K*K*N*R*C*M}{2*VEC\_SIZE*LANE\_NUM*Freq} \tag{1} 34 | $$ 35 | 36 | ![](PipeCNN_note_resource/images/conv_time.png) 37 | 38 | 对于**全连接层**而言示意图如下,可以看出读取数据和权重所需要的时间远大于计算时间,所以性能瓶颈在于读取数据操作。 39 | 40 | ![](./PipeCNN_note_resource/images/read_time.png) 41 | 42 | 全连接层输入数据的维度为`K*K*N`,权重维度为`K*K*N*M`,该层所消耗的时间可以用如下公式进行计算 43 | $$ 44 | T_{read}^{l}=\frac{(K*K*N*M+K*K*N)*DATA\_WIDTH}{DDR\_BANDWIDTH} \tag{2} 45 | $$ 46 | 采用这种模型分别对卷积层和全连接层的时间消耗进行建模,可以得出每层时间消耗 47 | 48 | 层数|理论时间消耗|实际时间消耗|效率 49 | ----|-----------|-----------|- 50 | 1 |18.91|19.57|0.97 51 | 2 |7.53|7.96|0.95 52 | 3 |5.03|5.22|0.96 53 | 4 |3.77|3.94|0.96 54 | 5 |2.51|2.64|0.95 55 | 6 |1.58|2.57|0.61 56 | 7 |0.70|1.17|0.60 57 | 8 |0.18|0.32|0.56 58 | 59 | 60 | ## 2. 对`VEC_SIZE`和`LANE_NUM`进行探索 61 | ### 2.1 网络参数对VEC_SIZE和LANE_NUM的约束 62 | VEC_SIZE 和 LANE_NUM 的选取首先需要和网络参数匹配,分别需要满足以下几点。 63 | 1. VEC_SIZE需要同是时各层输入通道数的公约数:PipeCNN框架中,除了对输入层有在第三个维度(N)上的补零之外,其余各层都需要同时是VEC_SIZE的倍数,即每一层的N都可以被VEC_SIZE整除。能满足这个要求的只有[4, 8, 16] 64 | 2. 
在`layer_config[ll][conv_split]==1`时,需要`np.mod(np.ceil(layer_config[ll][weight_m]/lan_num),2)==0`。能满足该要求的有[2, 3, 4, 8, 12, 13, 15, 16, 22, 28, 32, 33, 34, 48, 49, 50, 51, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76] 65 | ### 2.2 DSP数量对VEC_SIZE和LANE_NUM的约束 66 | TODO: 67 | - [ ] 何种情况下会消耗DSP 68 | 69 | DSP一个重要的去向就是进行卷积运算,因为PipeCNN中,每个时刻都有$VEC\_SIZE\times LANE\_NUM$个$MAC$运算,而一个DSP可以处理两个8bit MAC运算,所以在进行卷积运算时需要消耗的DSP数量为$VEC\_SIZE\times LANE\_NUM/2$。而其他内核也会消耗一部分DSP,比如在memRead内核中的counter在计算时也会消耗一部分DSP,这部分DSP的用量可以用一个常数C来表示,所以总的DSP消耗量可以用以下等式估算 70 | $$ 71 | \#\{DSP\}=VEC\_SIZE\times LANE\_NUM/2+C 72 | $$ 73 | ![](./PipeCNN_note_resource/images/dsp.png) 74 | De5net开发板上测试的数据如图,根据在De5_net 开发板上的测试结果,常数C在50-51之间,matlab计算得到C=50.45。根据这个等式可以根据DSP的总量和VEC_SIZE来计算LANE_NUM的上限值。De5net的DSP数量是256个,则256-50.5=vec*lane/2。所以当vec=16时,lane=25;vec = 8时lane=51,vec=4时,lane=102。 75 | 76 | ### 2.3 RAM资源对VEC_SIZE和LANE_NUM的约束 77 | 78 | 根据由De5net的实验结果,经过线性回归得到如图所示拟合结果,并且得到RAM用量的表达式 79 | ![](./PipeCNN_note_resource/images/ram.png) 80 | $$ 81 | \#\{RAM\} = 583 - 1.6*VEC\_SIZE +6*LANE\_NUM+0.6*VEC\_SIZE*LANE\_NUM 82 | $$ 83 | De5net有2560个RAM,如果只用其中70%的RAM资源,分别将vec=16,8,4带入得到最大的$LANE\_NUM=79, 113, 144$ 84 | 85 | ### 2.4 寄存器资源对VEC_SIZE和LANE_NUM的约束 86 | 87 | ![](./PipeCNN_note_resource/images/reg.png) 88 | $$ 89 | reg=103743 + 335 * VEC\_SIZE + 980 * LANE\_NUM+155*VEC\_SIZE*LANE\_NUM 90 | $$ 91 | 92 | ### 2.5 逻辑资源对VEC_SIZE和LANE_NUM的约束 93 | ![](./PipeCNN_note_resource/images/log.png) 94 | $$ 95 | log=63810+118*VEC\_SIZE+619*LANE\_NUM+69*VEC\_SIZE*LANE\_NUM 96 | $$ 97 | De5net中逻辑资源为234720 98 | ### 2.6 运行频率对VEC_SIZE和LANE_NUM的约束 99 | 100 | ![](./PipeCNN_note_resource/images/fre.png) 101 | $$ 102 | Fre=249.6+0.85*VEC\_SIZE-0.71*LANE\_NUM-0.12*VEC\_SIZE*LANE\_NUM 103 | $$ 104 | 这里取最低频率180,分别将vec=16,8,4带入,则LANE_NUM=31,45,60 105 | ### 2.7 综合以上所有因素 106 | 为了不超过硬件资源限制,分别计算不同的VEC_SIZE和对应的LANE_NUM,得到如下表格 107 | VEC_SIZE|DSP|RAM(70%)|LOG(70%)|FREQ(190)|Min 108 | -|-|-|-|-|- 109 | 16|25|79|57|25|25 110 | 
8|51|110|85|42|42 111 | 4|102|137|111|74|74 112 | 再结合网络的参数约束,再对VEC_SIZE和LANE_NUM不同的组合进行硬件编译,并运行得到[实验结果](./PipeCNN_note_resource/de5Alexnet.xlsx)。发现当LANE_NUM为2的幂次时,消耗的资源较少,而且性能较高。 113 | 114 | 由于这几天FPGA板子没法跑,所以用计算表1时使用的模型计算了一下性能 115 | 116 | ![](./PipeCNN_note_resource/images/flops.png) 117 | 118 | ## 3. 其他影响性能的因素 119 | 1. DDR内存的带宽限制:根据数据手册,De5_net板子上的DDR内存的带宽为94.5 Gbit/s,根据上面对每个GROUP运行时间进行分析可以知道,在最后三层的全连接层,限制速度的主要因素为**从global memory向local memory读取数据的速度较卷积操作慢** 120 | 2. 提高VEC_SIZE和LANE_NUM所带来的无效操作的增加 121 | -------------------------------------------------------------------------------- /project/install.md: -------------------------------------------------------------------------------- 1 | # caffe、CUDA、pytorch安装 2 | 3 | ## Caffe 4 | 官方参考http://caffe.berkeleyvision.org/install_apt.html, 5 | 系统要求Ubuntu 14.04、16.04 6 | 7 | 1)依赖项 8 | sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler 9 | sudo apt-get install --no-install-recommends libboost-all-dev 10 | sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev (14.04依赖项) 11 | 其中OpenCV也可通过source手动安装并配置 12 | 安装Linux Header文件 13 | $ sudo apt-get install linux-headers-$(uname -r) 14 | 15 | 2)BLAS 16 | install ATLAS by sudo apt-get install libatlas-base-dev or install OpenBLAS by sudo apt-get install libopenblas-dev or MKL for better CPU performance. 17 | 18 | 3)CUDA 19 | CUDA版本与driver版本关系(https://docs.nvidia.com/deploy/cuda-compatibility/index.html) 20 | 21 | 22 | ## GPU显卡驱动安装 23 | 24 | - 通过ppa安装(推荐) 25 | ``` 26 | sudo apt-get purge nvidia-* 27 | sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt-get update 
28 | sudo apt-get install software-properties-common(可选) 29 | sudo apt-get install nvidia-375(这里选择你需要的版本号) 30 | ``` 31 | 然后重启机器,执行nvidia-smi查看显卡信息 (装完cuda也要重启) 32 | 33 | ## 安装CUDA(最新CUDA9/10可以不需要进行a、b步骤): 34 | a)ctrl + alt + f1 进入tty1界面 35 | 36 | b)关闭x桌面服务 37 | sudo service lightdm stop 38 | 39 | c)安装依赖库 40 | sudo apt-get install freeglut3-dev build-essential libx11-dev libxmu-dev libxi-dev libglu1-mesa libglu1-mesa-dev 41 | 42 | d)安装CUDA 43 | sudo sh cuda_8.0.61_375.26_linux.run --no-opengl-libs 44 | 如果有PATCH可以把PATCH也安装好。 45 | 46 | e)添加环境变量 47 | export CUDA_HOME=/usr/local/cuda 48 | export PATH=$PATH:$CUDA_HOME/bin 49 | export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:$LD_LIBRARY_PATH 50 | 51 | f)进入example中Utility,编译deviceQuery例子,运行,如果PASS证明安装完成。 52 | 53 | ## 安装cuDNN 54 | 55 | 参考https://developer.nvidia.com/cudnn注册并下载对应版本 56 | 57 | a)解压压缩文件到特定路径 58 | 59 | b)添加路径 60 | export LD_LIBRARY_PATH=/your path to cudnn/lib64:$LD_LIBRARY_PATH 61 | 62 | c)拷贝cudnn.h文件到cuda目录下 63 | 拷贝lib下文件到cuda/lib64目录下,注意增加读权限 64 | 65 | ## 安装nccl(pytorch自带nccl后端,可以不需要自己装) 66 | 67 | - 方法一 用源文件安装: 68 | 69 | a)版本v1.2.3-1+cuda8.0 70 | 下载 https://github.com/NVIDIA/nccl/tree/v1.2.3-1+cuda8.0 71 | 72 | b)编译和测试 73 | make CUDA_HOME=/usr/local/cuda test 74 | 测试:./build/test/single/all_reduce_test 1000000 75 | 76 | c)安装 77 | sudo make install(默认都安装到/usr/local下了) 78 | 记住要sudo ldconfig一下更新缓存 79 | 80 | - 方法二 用deb安装(推荐): 81 | 82 | 下载deb安装文件,如nccl-repo-ubuntu1804-2.4.8-ga-cuda10.0_1-1_amd64.deb 83 | ``` 84 | sudo dpkg -i nccl-repo-ubuntu1804-2.4.8-ga-cuda10.0_1-1_amd64.deb 85 | sudo apt update 86 | sudo apt install libnccl2 libnccl-dev 87 | ``` 88 | 如果要指定某个版本: 89 | sudo apt install libnccl2=2.0.0-1+cuda8.0 libnccl-dev=2.0.0-1+cuda8.0 90 | 91 | ## 安装Conda和Pytorch 92 | 93 | a)https://repo.continuum.io/archive/ 下载需要的版本 94 | 95 | b)安装 96 | bash Anaconda2-5.1.0-Linux-x86_64.sh 97 | 98 | c)添加PATH 99 | 100 | d)安装Pytorch和torchvision 101 | conda install pytorch=1.0 torchvision cudatoolkit=10.0 102 | 103 | e)测试,进入python,然后 104 | ``` 105 
| import torch 106 | import torchvision 107 | print(torch.cuda.is_available()) 108 | ``` 109 | 或者 110 | ``` 111 | import torch 112 | x=torch.Tensor(5,3) 113 | print x 114 | ``` 115 | 116 | f)卸载 117 | conda uninstall pytorch torchvision 118 | 119 | g) 如果使用lmdb数据库还需要安装: 120 | conda install -c conda-forge python-lmdb 121 | 122 | - 问题处理: 123 | 124 | 1)切换国内源: 125 | ``` 126 | vim .condarc 127 | channels: 128 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ 129 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ 130 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ 131 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main 132 | show_channel_urls: true 133 | ``` 134 | 135 | 2)多环境管理: 136 | 查看当前conda所有可用环境 137 | conda info -e 138 | 创建一个新环境 139 | conda create -n py36 python=3.6 140 | 切换到想要的环境 141 | source activate py36 142 | 退出当前环境 143 | source deactivate 144 | 145 | ## 安装 DALI 146 | 147 | 安装: 148 | pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/cuda/10.0 nvidia-dali 149 | 150 | 升级:待补充 151 | 152 | ## 安装 APEX 153 | 154 | 待补充 155 | 156 | 157 | ## 安装 Caffe 158 | 159 | a)下载源码 160 | git clone https://github.com/BVLC/caffe.git 161 | 162 | b)其他依赖库: 163 | sudo apt install libboost-python-dev python-skimage python-protobuf 164 | 165 | c)编译配置 166 | 由于conda自带库和caffe不兼容,编译前先屏蔽conda相关库和路径 167 | 168 | - 方法一通过Cmake编译(推荐,提示信息更清楚): 169 | 170 | 修改CMakeLists.txt选择相关编译项,如: 171 | USE_NCCL -> ON 172 | 然后 173 | ``` 174 | mkdir build 175 | cd build 176 | cmake .. 
177 | make all 178 | make install 179 | make runtest 180 | ``` 181 | 182 | - 方法二直接Makefile编译: 183 | 184 | 将 Makefile.config.example 文件复制一份并更名为 Makefile.config 185 | 修改相关编译项如: 186 | 187 | USE_CUDNN := 1 188 | 189 | 注意选择用conda的python还是系统python 190 | 191 | USE_NCCL := 1 192 | 193 | 修改Makefile,增加 194 | NVCCFLAGS += -D_FORCE_INLINES -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) 195 | 196 | 如果提示找不到hdf5,增加hdf5支持: 197 | INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include 198 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu (14.04直接这个目录下,https://packages.ubuntu.com/trusty/amd64/libhdf5-dev/filelist) 199 | 200 | 如果是16.04修改为: 201 | INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include /usr/include/hdf5/serial/ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu/hdf5/serial 202 | (或者/your conda path/lib 待验证) 203 | 204 | c)编译 205 | ``` 206 | make all -j8 207 | make pycaffe 208 | make runtest -j8 209 | ``` 210 | 如果 make runtest -j8 时找不到libhdf5相关库,增加下面路径(make all之前不能加,否则anaconda中有些lib会和系统lib冲突): 211 | export LD_LIBRARY_PATH=/your conda path/lib:/usr/local/lib:$LD_LIBRARY_PATH 212 | 213 | d)添加Python路径 214 | 215 | export PYTHONPATH=/path/to/caffe/python:$PYTHONPATH 216 | 217 | 7)为Conda的Python安装依赖项 218 | conda install -c conda-forge python-lmdb 219 | conda install protobuf 220 | 221 | 问题处理: 222 | 1)CUDA9以上可能会出现cublas库不识别的问题,需要卸载老版本cmake,安装新版本,如3.14.7版本,参考:https://www.linuxidc.com/Linux/2018-09/154165.htm 223 | 224 | 225 | ## 常用命令 226 | nvidia-smi topo --matrix 227 | nvidia-smi dmon 228 | nvidia-smi pmon 229 | 230 | ## 参考评测文章: 231 | (1)https://www.pugetsystems.com/labs/hpc/PCIe-X16-vs-X8-with-4-x-Titan-V-GPUs-for-Machine-Learning-1167/ 232 | 233 | (2) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Things to learn for new students in the Lab for AI Chips and Systems of BJTU
智能芯片与应用实验室新生学习指南 2 | 3 | * 注意:新生一定认真学习相关基础知识,特别是标记了推荐的内容,至少看两遍 4 | * 要多动手,通过练习才能掌握 5 | * 要主动,不懂就问 6 | 7 | # Basic Engineering Skills You Have to Master
硕士生根据个人研究方向学习以下内容 8 | 9 | ## Software Programming 10 | 11 | Learn the following two programming languages: 12 | 13 | * C/C++ (C99/C++11) 14 | * Python (v3.0) 15 | 16 | Do the following projects to practice and learn good habits when programming in C and Python: 17 | 18 | * [codewithc](https://www.codewithc.com/c-projects-with-source-code/) 19 | * [一个python教程](https://github.com/overmind1980/oeasy-python-tutorial) 20 | 21 | Here's another good repo that has lots of good projects for you to practice. 22 | 23 | * [Project-Based-Tutorials-in-C](https://github.com/rby90/Project-Based-Tutorials-in-C) (try 'Emulator 101', 'hash table', 'How to Write a Video Player in Less Than 1000 Lines') 24 | 25 | Learn the following Good Coding Styles and use them in your research projects: 26 | 27 | * [C语言的语法风格与代码书写规范指南](https://www.ctolib.com/topics-55863.html) (简单) 28 | * [NASA C coding style](http://mechatronics.me.wisc.edu/labresources/DataSheets/NASA-GSFC_C_Programming_Styles-94-003.pdf), NASA, 1994 (推荐) 29 | * [Recommended C Style and Coding Standards](https://www.maultech.com/chrislott/resources/cstyle/indhill-cstyle.pdf), UC Berkeley, 1997 (简单) 30 | * [Guidelines for the use of the C language in critical systems](http://caxapa.ru/thumbs/468328/misra-c-2004.pdf), MISRA, 2018 (高级) 31 | * [A list of C and C++ Style Guides](https://www.maultech.com/chrislott/resources/cstyle/) 32 | * [Embedded C programming](http://www.eng.auburn.edu/~nelson/courses/elec3040_3050/C%20programming%20for%20embedded%20system%20applications.pdf) 33 | 34 | Some other good projects based on C/C++ 35 | 36 | * [Darknet: Open Source Neural Networks in C](https://pjreddie.com/darknet/)(深度学习最好的学习C/C++开发实例,推荐) 37 | * [BM3D denoising algorithm](https://github.com/20logTom/BM3D) 38 | * [ARM Compute Library](https://github.com/ARM-software/ComputeLibrary) 39 | 40 | Software Optimizations on Different Platforms 41 | 42 | * [软件代码优化资源](https://agner.org/optimize/)(推荐学习) 43 | 44 | ## Deep Learning Frameworks 45 | 46 | * 
[一个韩国人的教程很简洁](https://github.com/yunjey/pytorch-tutorial)(入门推荐) 47 | * [动手学深度学习PyTorch版](https://github.com/ShusenTang/Dive-into-DL-PyTorch)(入门推荐) 48 | * Pytorch[中文教程](https://github.com/zergtant/pytorch-handbook) 49 | * Nvidia官方training、inference[例子、参考代码](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets) 50 | * [Deep-Learning-with-PyTorch-Chinese 中文翻译含例程](https://github.com/ShusenTang/Deep-Learning-with-PyTorch-Chinese) 51 | * [斯坦福李飞飞老师课程 Convolutional Neural Networks for Visual Recognition](http://cs231n.stanford.edu/index.html)(做目标检测同学推荐学习) 52 | 53 | 54 | #### Excellent Deeplearning Project Using Pytorch 55 | 56 | There are many good examples that we have collected from github, and students can learn how to write "good" pytorch codes by reading and modifying the codes from these example project. A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/project/project.md). 57 | 58 | #### How to setup the GPU and Pytorch environment 59 | 60 | [Pytorch安装说明](https://github.com/doonny/basic_knowledge/blob/master/project/install.md) 61 | 62 | #### 目标检测框架mm-detection代码讲解 63 | 64 | * [代码构建一](https://zhuanlan.zhihu.com/p/337375549), [代码构建二](https://zhuanlan.zhihu.com/p/341954021) 65 | * [一些代码注释](https://github.com/ming71/mmdetection-annotated) 66 | 67 | ## Hardware Basic Knowledge 68 | 69 | Read the following two books to learn basic concepts for digital circuit design and computer architecture. 
70 | 71 | ### Digital Circuit/Logic Design 72 | Important concepts to understand include **combinational logic and sequential logic (组合逻辑和时序逻辑), register寄存器(Flip-Flop circuit), FSM状态机, counter计数器, encoder/decoder编码器和译码器, FIFO, RAM, etc.** Read the following two books: 73 | 74 | * ''数字逻辑设计与计算机组成, 戴志涛等,机械工业出版社'' 75 | * ["Digital Logic"](https://inst.eecs.berkeley.edu/~cs150/archives.html) 76 | 77 | ### Computer Architecture 78 | Important concepts to understand include **pipeline, memory hierarchy, roofline model, Amdahl's law, ILP (instruction level parallelism), TLP (task level parallelism), DLP (data level parallelism), SIMD/VLIW processor, etc.** Read the following two books: 79 | 80 | * [''Computer Organization and Design The Hardware Software Interface''](http://staff.ustc.edu.cn/~llxx/cod/reference_srcs.html), ARM Edition, 2017(重要,零基础学习) 81 | * [''Computer Architecture A Quantitative Approach''](https://book.douban.com/subject/6795919/), 6th Edition, 2019 (重要,进阶学习) 82 | 83 | More reading: 84 | 85 | * Loop-carried dependency: [1](https://www.cs.utexas.edu/~lin/cs380c/handout27.pdf), [2](https://people.engr.ncsu.edu/efg/506/s10/www/lectures/notes/lec5.pdf) 86 | * [Roofline Model Basic](./doc/Roofline%20Model.pdf) 87 | * [并行处理的几种常见方式](http://www.inf.ed.ac.uk/teaching/courses/pa/Notes/lecture02-types.pdf)(推荐) 88 | 89 | 90 | 91 | ## FPGA Design 92 | 93 | After you have basic knowledge of digital circuits and computer architecture, you can learn FPGA design or heterogeneous computing (using FPGAs as accelerators). We recommend using HLS (High-level Synthesis)-based schemes (C/C++-based HLS or OpenCL) rather than RTL-level programming (i.e., Verilog and VHDL) to design application-specific circuits on FPGAs (however, if you have time, you should still learn Verilog). 94 | 95 | 96 | Go to [HERE](https://github.com/doonny/basic_knowledge/blob/master/fpga/fpga.md) and read all the materials we have listed. 
97 | 98 | FPGA相关学习资料在[这里](https://github.com/doonny/basic_knowledge/blob/master/fpga/fpga.md) (零基础、进阶推荐必读)。 99 | 100 | Finally, learn our open-source project [PipeCNN](https://github.com/doonny/PipeCNN). Run the examples, such as caffenet, vgg-16, resnet, YOLO on the Arria-10 FPGA and the Zynq FPGA platforms. Learn how to configure, compile, and debug the source code and how to profile the performance of the accelerator. After entering our lab, you will have access to our latest designs, i.e., PipeCNN-v2 and PipeCNN-sparse, which are in private repos. 101 | 102 | Zhang DeZheng has written a good study note on PipeCNN; please read it [here](https://github.com/doonny/basic_knowledge/blob/master/PipeCNN_note.md). 103 | 104 | #### A list of GOOD FPGA accelerator designs can be found [here]() 105 | 106 | ## GPU Design 107 | 108 | Learn TensorRT and CUDA programming. Try examples on our TX2/TK1 platforms. 109 | 110 | * [TensorRT](https://developer.nvidia.com/tensorrt) 111 | 112 | 113 | 114 | # Research Related Topics
高年级硕士和博士生学习内容 115 | 116 | First, students should read the following articles to learn how to write research papers. 117 | 118 | * [''How to write a great research paper''](http://www.sohu.com/a/254967611_473283), Deep Learning Indaba, Stellenbosch, 2018 119 | * [''How to Publish a Research Paper''](https://www.wikihow.com/Publish-a-Research-Paper), wikiHow, 2019 120 | * [''How to Write a Good Scientific Paper''](https://spie.org/samples/9781510619142.pdf), Chris A. Mack, SPIE, 2018 121 | * [''How to Write a Good Paper in Computer Science and How Will It Be Measured by ISI Web of Knowledge''](http://univagora.ro/jour/index.php/ijccc/article/view/2493), Răzvan Andonie, et al., 2010 122 | 123 | Secondly, read the following selected papers in each research topic, which are really good examples in the related fields. 124 | 125 | 126 | ### Tutorials for Hardware Architectures for DNN 127 | 128 | Students who are working on hardware designs for deep neural networks should read the following tutorials. 129 | 130 | * [Hardware Architectures for Deep Neural Networks](http://eyeriss.mit.edu/tutorial.html), MICRO Tutorial 2016. (推荐) 131 | 132 | ### FPGA Accelerator Design 133 | 134 | * Optimizing FPGA-based Accelerator Design for Deep Convolutional Neural Networks, FPGA 2015. 135 | * Throughput-Optimized OpenCL-based FPGA Accelerator for Large-Scale Convolutional Neural Networks, FPGA 2016. 136 | * An OpenCL Deep Learning Accelerator on Arria 10, FPGA 2017. 137 | * Improving the Performance of OpenCL-based FPGA Accelerator for Convolutional Neural Network, FPGA 2017. 138 | * A Framework for Generating High Throughput CNN Implementations on FPGAs, FPGA 2018. 139 | * An Efficient Hardware Accelerator for Sparse Convolutional Neural Networks on FPGAs, FCCM 2019. 140 | 141 | The following survey papers are also worth reading. 142 | 143 | * A Survey of FPGA Based Neural Network Accelerator, ACM TRETS 2017. 
144 | * Deep Neural Network Approximation for Custom Hardware: Where We’ve Been, Where We’re Going, ACM Computing Surveys 2019. 145 | 146 | Our own research papers on FPGA accelerators: 147 | 148 | * PipeCNN: An OpenCL-Based Open-Source FPGA Accelerator for Convolution Neural Networks, FPT 2017 149 | * ABM-SpConv: A Novel Approach to FPGA-Based Acceleration of Convolutional Neural Network Inference, DAC 2019 150 | 151 | 152 | A more complete paper list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/fpga.md). 153 | 154 | 155 | #### Sparse Convolution Design 156 | 157 | 158 | A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/sparse.md). 159 | 160 | 161 | ### Neural network optimization (quantization, pruning, et.al.) 162 | 163 | #### Neural Network Quantization 164 | 165 | * Ristretto: A Framework for Empirical Study of Resource-Efficient Inference in Convolutional Neural Networks, IEEE T-NNLS 2018. 166 | * 8-bit Inference with TensorRT, Nvidia 2017. 167 | * Quantizing deep convolutional networks for efficient inference: A whitepaper, Google, 2018. 168 | 169 | A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/quantization.md). 170 | 171 | #### Network Pruning and Compression 172 | 173 | A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/pruning.md). 174 | 175 | #### Neural Architecture Search (NAS) 176 | 177 | A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/NAS.md). 178 | 179 | #### Object Detection 180 | 181 | A more complete list is [here](https://github.com/doonny/basic_knowledge/blob/master/paper/object_detection.md). 182 | -------------------------------------------------------------------------------- /git_note.md: -------------------------------------------------------------------------------- 1 | # 20个最常用的 Git 命令用法说明及示例 2 | 3 | ## 设置SSH 4 | 1. 
生成公钥 5 | ```bash 6 | ssh-keygen -t rsa -C "youremail@example.com" 7 | ``` 8 | 2. 将公钥添加到github等等托管平台上 9 | 10 | ## Git 命令 11 | 1. git config 12 | 用法:git config –global user.name “[name]” 13 | 用法:git config –global user.email “[email address]” 14 | 该命令将分别设置提交代码的用户名和电子邮件地址。 15 | ```bash 16 | dee@dee-Latitude:~$ git config --global user.name "dezengzang" 17 | dee@dee-Latitude:~$ git config --global user.email "dezengzang@outlook.com" 18 | ``` 19 | 20 | 2. git init 21 | 用法:git init [repository name] 22 | 该命令可用于创建一个新的代码库。 23 | ```bash 24 | dee@dee-Latitude:~$ git init ~/Documents/DEMO 25 | Initialized empty Git repository in /home/dee/Documents/DEMO/.git/ 26 | ``` 27 | 28 | 3. git clone 29 | 用法:git clone [url] 30 | 该命令可用于通过指定的URL获取一个代码库。 31 | ```bash 32 | dee@dee-Latitude:~/Documents$ git clone https://github.com/doonny/basic_knowledge 33 | Cloning into 'basic_knowledge'... 34 | remote: Enumerating objects: 79, done. 35 | remote: Counting objects: 100% (79/79), done. 36 | remote: Compressing objects: 100% (57/57), done. 37 | remote: Total 79 (delta 23), reused 68 (delta 15), pack-reused 0 38 | Unpacking objects: 100% (79/79), done. 39 | ``` 40 | 41 | 4. git status 42 | 用法:git status 43 | 该命令将显示所有需要提交的文件 44 | ```bash 45 | dee@dee-Latitude:~/Documents/basic_knowledge$ git status 46 | On branch master 47 | Your branch is up to date with 'origin/master'. 48 | 49 | Untracked files: 50 | (use "git add ..." to include in what will be committed) 51 | 52 | git_note.md 53 | 54 | nothing added to commit but untracked files present (use "git add" to track) 55 | ``` 56 | 57 | 5. git add 58 | 用法:git add [file] 59 | 该命令可以将一个文件添加至stage(暂存区)。 60 | ```bash 61 | dee@dee-Latitude:~/Documents/basic_knowledge$ git add git_note.md 62 | ``` 63 | 用法:git add * 64 | 该命令可以将多个文件添加至stage(暂存区)。 65 | ```bash 66 | dee@dee-Latitude:~/Documents/basic_knowledge$ git add * 67 | ``` 68 | 69 | 6. 
git commit 70 | 用法:git commit -m “[ Type in the commit message]” 71 | 该命令可以在版本历史记录中永久记录文件。 72 | ```bash 73 | dee@dee-Latitude:~/Documents/basic_knowledge$ git commit -m "git_note.md added" 74 | [master e1226c5] git_note.md added 75 | 1 file changed, 80 insertions(+) 76 | create mode 100755 git_note.md 77 | ``` 78 | 用法:git commit -a 79 | 该命令将提交git add命令添加的所有文件,并提交git add命令之后更改的所有文件。 80 | ```bash 81 | dee@dee-Latitude:~/Documents/basic_knowledge$ git commit -a -m "git_note.md edited" 82 | [master 12b077e] git_note.md edited 83 | 1 file changed, 16 insertions(+) 84 | ``` 85 | 86 | 7. git diff 87 | 用法:git diff 88 | 该命令可以显示尚未添加到stage的文件的变更。 89 | ```bash 90 | dee@dee-Latitude:~/Documents/basic_knowledge$ git diff 91 | diff --git a/git_note.md b/git_note.md 92 | index 37354bd..aa46d56 100755 93 | --- a/git_note.md 94 | +++ b/git_note.md 95 | @@ -97,5 +97,8 @@ 96 | 1 file changed, 16 insertions(+) 97 | ``` 98 | 99 | +7. git diff 100 | + 用法:git diff 101 | + 该命令可以显示尚未添加到stage的文件的变更。 102 | ``` 103 | 用法:git diff --staged 104 | 该命令可以显示添加到stage的文件与当前最新版本之间的差异。 105 | ```bash 106 | dee@dee-Latitude:~/Documents/basic_knowledge$ git diff --staged 107 | diff --git a/git_note.md b/git_note.md 108 | index 685f456..0afb41b 100755 109 | --- a/git_note.md 110 | +++ b/git_note.md 111 | @@ -117,7 +117,6 @@ 112 | - 113 | + 这里添加一句话 114 | ``` 115 | 用法:git diff [first branch] [second branch] 116 | 该命令可以显示两个分支之间的差异。 117 | 118 | 8. 
git reset 119 | 用法:git reset [file] 120 | 该命令将从stage中撤出指定的文件,但可以保留文件的内容。 121 | ```bash 122 | dee@dee-Latitude:~/Documents/basic_knowledge$ git reset git_note.md 123 | Unstaged changes after reset: 124 | M git_note.md 125 | ``` 126 | 用法:git reset [commit] 127 | 该命令可以撤销指定提交之后的所有提交,并在本地保留变更。([commit]可以使用git log查到) 128 | ```bash 129 | dee@dee-Latitude:~/Documents/basic_knowledge$ git reset 425fb563154d91c9e4d5eae57c2529cbd26919ee 130 | Unstaged changes after reset: 131 | M git_note.md 132 | ``` 133 | 134 | 用法:git reset --hard [commit] 135 | 该命令将丢弃所有的历史记录,并回滚到指定的提交。 136 | ```bash 137 | dee@dee-Latitude:~/Documents/basic_knowledge$ git reset --hard 425fb563154d91c9e4d5eae57c2529cbd26919ee 138 | HEAD is now at 425fb56 git_note.md edited 139 | ``` 140 | 9. git rm 141 | 用法:git rm [file] 142 | 该命令将删除工作目录中的文件,并将删除动作添加到stage。 143 | ```bash 144 | dee@dee-Latitude:~/Documents/basic_knowledge$ git rm example.txt 145 | rm 'example.txt' 146 | ``` 147 | 10. git log 148 | 用法:git log 149 | 该命令可用于显示当前分支的版本历史记录。 150 | ```bash 151 | dee@dee-Latitude:~/Documents/basic_knowledge$ git log 152 | commit 9db87d7e15506fad3e3c5333116090b7f4478bd5 (HEAD -> master) 153 | Author: dezengzang 154 | Date: Sat Nov 30 15:18:08 2019 +0800 155 | 156 | example.txt deleted 157 | 158 | commit 56b20dd9b39ddf65da4ab0465dd8a06b845d33c6 159 | Author: dezengzang 160 | Date: Sat Nov 30 15:17:23 2019 +0800 161 | 162 | add example.txt 163 | ``` 164 | 用法:git log --follow[file] 165 | 该命令可用于显示某个文件的版本历史记录,包括文件的重命名。 166 | ```bash 167 | dee@dee-Latitude:~/Documents/basic_knowledge$ git log --follow git_note.md 168 | commit a5ea2257c7fd8f1dc504d3bdd76f2e0916809315 169 | Author: dezengzang 170 | Date: Sat Nov 30 15:16:49 2019 +0800 171 | 172 | add example.txt 173 | ``` 174 | 11. 
git show 175 | 用法:git show [commit] 176 | 该命令经显示指定提交的元数据以及内容变更。 177 | ``` 178 | dee@dee-Latitude:~/Documents/basic_knowledge$ git show 9db87d7e15506fad3e3c5333116090b7f4478bd5 179 | commit 9db87d7e15506fad3e3c5333116090b7f4478bd5 (HEAD -> master) 180 | Author: dezengzang 181 | Date: Sat Nov 30 15:18:08 2019 +0800 182 | 183 | example.txt deleted 184 | 185 | diff --git a/example.txt b/example.txt 186 | deleted file mode 100644 187 | index ce01362..0000000 188 | --- a/example.txt 189 | +++ /dev/null 190 | @@ -1 +0,0 @@ 191 | -hello 192 | ``` 193 | 12. git tag 194 | 用法:git tag [commitID] 195 | 该命令可以给指定的提交添加标签。 196 | 13. git branch 197 | 用法:git branch 198 | 该命令将显示当前代码库中所有的本地分支。 199 | ```bash 200 | dee@dee-Latitude:~/Documents/basic_knowledge$ git branch 201 | * master 202 | ``` 203 | 用法:git branch [branch name] 204 | 该命令将创建一个分支。 205 | ``` 206 | dee@dee-Latitude:~/Documents/basic_knowledge$ git branch branch1 207 | ``` 208 | 用法:git branch -d [branch name] 209 | 该命令将删除指定的分支。 210 | ```bashrc 211 | dee@dee-Latitude:~/Documents/basic_knowledge$ git branch -d branch1 212 | Deleted branch branch1 (was 9db87d7). 213 | ``` 214 | 14. git checkout 215 | 用法:git checkout [branch name] 216 | 你可以通过该命令切换分支。 217 | ``` 218 | dee@dee-Latitude:~/Documents/basic_knowledge$ git checkout branch2 219 | M git_note.md 220 | Switched to branch 'branch2' 221 | ``` 222 | 用法:git checkout -b [branch name] 223 | 你可以通过该命令创建一个分支,并切换到新分支上。 224 | ```bash 225 | dee@dee-Latitude:~/Documents/basic_knowledge$ git checkout -b branch3 226 | M git_note.md 227 | Switched to a new branch 'branch3' 228 | ``` 229 | 15. git merge 230 | 用法:git merge [branch name] 231 | 该命令可以将指定分支的历史记录合并到当前分支。 232 | ``` 233 | dee@dee-Latitude:~/Documents/basic_knowledge$ git merge branch3 234 | Already up to date. 235 | ``` 236 | 16. 
git remote 237 | 用法:git remote add [variable name] [Remote Server Link] 238 | 你可以通过该命令将本地的代码库连接到远程服务器。 239 | ```bash 240 | dee@dee-Latitude:~/Documents/basic_knowledge$ git remote add origin1 git@github.com:doonny/basic_knowledge.git 241 | ``` 242 | 17. git push 243 | 用法:git push [variable name] master 244 | 该命令可以将主分支上提交的变更发送到远程代码库。 245 | ``` 246 | dee@dee-Latitude:~/Documents/basic_knowledge$ git push origin master 247 | Enumerating objects: 20, done. 248 | Counting objects: 100% (20/20), done. 249 | Delta compression using up to 4 threads 250 | Compressing objects: 100% (18/18), done. 251 | Writing objects: 100% (19/19), 3.31 KiB | 1.66 MiB/s, done. 252 | Total 19 (delta 11), reused 0 (delta 0) 253 | remote: Resolving deltas: 100% (11/11), completed with 1 local object. 254 | To https://github.com/doonny/basic_knowledge 255 | 2b2a690..9db87d7 master -> master 256 | ``` 257 | 用法:git push [variable name] [branch] 258 | 该命令可以将指定分支上的提交发送到远程代码库。 259 | ```bash 260 | git push origin branch1 261 | ``` 262 | 用法:git push --all [variable name] 263 | 该命令可以将所有分支发送到远程代码库。 264 | ```bash 265 | git push --all origin 266 | ``` 267 | 用法:git push [variable name] :[branch name] 268 | 该命令可以删除远程代码库上的一个分支。 269 | ```bash 270 | git push origin :branch1 271 | ``` 272 | 18. git pull 273 | 用法:git pull [Repository Link] 274 | 该命令将获取远程服务器上的变更,并合并到你的工作目录。 275 | ```bash 276 | (base) dee@dee-Latitude:~/Documents/basic_knowledge$ git pull 277 | Already up to date. 278 | ``` 279 | 19. 
git stash 280 | 用法:git stash save 281 | 该命令将临时保存所有修改的文件。 282 | ```bash 283 | dee@dee-Latitude:~/Documents/basic_knowledge$ git stash save 284 | Saved working directory and index state WIP on master: 9db87d7 example.txt deleted 285 | ``` 286 | 用法:git stash pop 287 | 该命令将恢复最近一次stash(储藏)的文件。 288 | ```bash 289 | git stash pop 290 | ``` 291 | 用法:git stash list 292 | 该命令将显示stash的所有变更。 293 | ```bash 294 | dee@dee-Latitude:~/Documents/basic_knowledge$ git stash list 295 | stash@{0}: WIP on master: 9db87d7 example.txt deleted 296 | ``` 297 | 用法:git stash drop 298 | 该命令将丢弃最近一次stash的变更。 299 | ```bash 300 | dee@dee-Latitude:~/Documents/basic_knowledge$ git stash drop 301 | Dropped refs/stash@{0} (72aec8b06c040b7f233f87734df587ef2e472af0) 302 | ``` 303 | 304 | 整理自[公众号CSDN](https://mp.weixin.qq.com/s?__biz=MjM5MjAwODM4MA==&mid=2650733891&idx=3&sn=ff358e64ec52838c7b030739579c567a&chksm=bea68c9089d10586178224580949fe432e3f033cba7bd748113baad89ff031d6cc901ccf34b7&mpshare=1&scene=23&srcid=&sharer_sharetime=1575079281233&sharer_shareid=ca4a235412dbd4ab62a02058c2d4d001#rd)(其实只有19个指令) 305 | 306 | -------------------------------------------------------------------------------- /PipeCNN_note_resource/explore_v4.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Use different parameters to execute the PipeCNN. 3 | 功能: 自动修改VEC_SIZE,LANE_NUM参数,并且编译运行,将.aocx文件中包含的资源消耗信息和run.log文件中包含的每层运行时间信息收集到一个excel表格中。 4 | 注意事项: 5 | 1. 首先修改makefile,选择hw,report,sw_emu,还要注意输出文件是aocx,还是aoco 6 | 2. 要想运行run.exe,需要在当前工程目录下存放数据,但是每个工程目录都存一个data文件夹显然太大,所以需要在project文件夹下创建data的软链接 7 | 3. 将本脚本放在和project同一级文件夹下 8 | 4. 修改bashrc来选择板卡 9 | 10 | CHANGES: 11 | 1. 增加板卡的验证,去掉了修改环境变量的功能,比较鸡肋。在实际使用中请直接修改bashrc 12 | 2. 增加是否删除Quartus工程的选项 13 | 3. 将编译好的数据存入excel表格 14 | 4. 使用subprocess替代部分system 15 | 5. 去掉VEC_SIZE,LANE_NUM编译后参数检查功能(鸡肋),该在编译前使用calculate_parameters.py脚本计算得到。 16 | 6. 增加读取.aocx和run_log.txt的功能,aocx涉及的硬件消耗信息和run_log.txt中的运行时间信息都会被采集到一个excel文件中!!!! 
orig_dir = './project'
HW, SW_EMU, REPORT = 0, 1, 2

# ---- user configuration: review before every run ----
FUN_SEL = HW      # which flow to run: HW, SW_EMU or REPORT
LAYER_NUM = 8     # AlexNet has 8 layers
CLEAN_PROJ = 1    # 1: clean the Quartus project, 0: keep it
# ------------------------------------------------------

de5_net, de10_std, de10_nano, de5a_net = range(4)

# board id -> (name that must appear in `aoc -list-boards`, project-dir suffix)
_BOARD_TABLE = {
    de5_net: ("de5net_a7", "_de5_net"),
    # TODO: the two DE10 entries still check the DE5-Net board name, exactly as
    # the original code did; replace with the real board names once known.
    de10_std: ("de5net_a7", "_de10_std"),
    de10_nano: ("de5net_a7", "_de10_nano"),
    de5a_net: ("de5a_net_e1", "_de5a_net"),
}

# Number of bytes read from the end of conv.aocx / run_log.txt when scraping.
_TAIL_BYTES = 1500


def _abort(message):
    '''Print *message* and stop the script with a non-zero exit status.'''
    print(message)
    raise SystemExit(1)


def _read_tail_lines(path, tail_bytes):
    '''
    Return the last `tail_bytes` bytes of `path`, split into lines (bytes).
    Files shorter than `tail_bytes` are read completely instead of failing
    on a seek before the start of the file.
    '''
    with open(path, 'rb') as handle:
        handle.seek(0, os.SEEK_END)
        handle.seek(max(handle.tell() - tail_bytes, 0))
        return handle.readlines()


def _run_make(args, log_path=None):
    '''
    Run `make` with the argument list `args` (no shell involved).
    When `log_path` is given, make's stdout is written to that file.
    A non-zero exit status is reported but does not raise. Returns the status.
    '''
    command = ['make'] + list(args)
    if log_path is None:
        status = subprocess.run(command).returncode
    else:
        with open(log_path, 'w') as log:
            status = subprocess.run(command, stdout=log).returncode
    if status != 0:
        print('%s exited with status %d' % (' '.join(command), status))
    return status


def edit_make(proj_dir, VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X):
    '''
    Rewrite 'device/hw_param.cl' with the given parameters and run 'make'.
    Args:
        proj_dir: project directory that contains 'device/hw_param.cl'
        VEC_SIZE: value written into the '#define VEC_SIZE' line
        LANE_NUM: value written into the '#define LANE_NUM' line
        CONV_GP_SIZE_X: value written into the '#define CONV_GP_SIZE_X' line
    '''
    import shutil

    config_file = proj_dir + '/device/hw_param.cl'
    orig_config_file = config_file[0:-3] + '_orig.cl'

    # Keep a pristine copy the first time, and always regenerate from it.
    if not os.path.exists(orig_config_file):
        shutil.copy(config_file, orig_config_file)

    replacements = (
        ('#define VEC_SIZE',
         '#define VEC_SIZE %d // larger than 4, i.e., 4, 8, 16, ...\n' % VEC_SIZE),
        ('#define LANE_NUM',
         '#define LANE_NUM %d // larger than 1, for alexnet: 2, 3, 4, 8, 12, 15, 16, 22, 28, 32, 34, 48, 50, 51, 64, ...\n' % LANE_NUM),
        ('#define CONV_GP_SIZE_X',
         '#define CONV_GP_SIZE_X %d\n' % CONV_GP_SIZE_X),
    )

    out_lines = []
    with open(orig_config_file, 'r', encoding="utf-8") as f:
        for line in f:
            for marker, new_line in replacements:
                if marker in line:
                    line = new_line
            out_lines.append(line)

    with open(config_file, 'w', encoding='utf-8') as cfg_f:
        cfg_f.write(''.join(out_lines))

    # make clean, build the RTL library, then build the project itself;
    # only the last step's stdout is kept, in make_log.txt
    _run_make(['clean', '--directory', proj_dir])
    _run_make(['--directory', proj_dir + '/device/RTL/'])
    _run_make(['--directory', proj_dir], log_path=proj_dir + '/make_log.txt')


def run_emu(proj_dir):
    '''
    Run './run.exe conv.aocx' inside `proj_dir`, writing stdout to 'run_log.txt'.
    When FUN_SEL is SW_EMU the child process gets
    CL_CONTEXT_EMULATOR_DEVICE_ALTERA=1 in its environment; every other mode
    runs without it.
    Raises:
        OSError: if `proj_dir` or './run.exe' cannot be found or executed.
    '''
    env = dict(os.environ)
    if FUN_SEL == SW_EMU:
        env['CL_CONTEXT_EMULATOR_DEVICE_ALTERA'] = '1'
    with open(os.path.join(proj_dir, 'run_log.txt'), 'w') as log:
        # a relative program path is resolved against cwd on POSIX
        subprocess.run(['./run.exe', 'conv.aocx'], cwd=proj_dir, env=env, stdout=log)


def clean_project(proj_dir, clean_sel):
    '''
    Delete everything in 'proj_dir/conv' except directories whose name
    contains 'reports'. Does nothing unless `clean_sel` == 1.
    '''
    import shutil

    if clean_sel != 1:
        return
    conv_dir = proj_dir + '/conv'
    for item in os.listdir(conv_dir):
        item_path = os.path.join(conv_dir, item)
        if os.path.isdir(item_path) and 'reports' in item:
            continue
        if os.path.isdir(item_path) and not os.path.islink(item_path):
            shutil.rmtree(item_path)
        else:
            # regular files, and symlinks (remove the link, not its target)
            os.remove(item_path)


def env_cfg(board):
    '''
    Check that the board expected for `board` appears in the output of
    'aoc -list-boards' and return the project-directory suffix for it.
    Args:
        board: one of de5_net, de10_std, de10_nano, de5a_net
    Raises:
        ValueError: if `board` is not one of the known board ids.
        SystemExit: if the expected board name is not in the aoc output.
    '''
    if board not in _BOARD_TABLE:
        raise ValueError('unknown board id: %r' % (board,))
    expected_name, board_cfg = _BOARD_TABLE[board]
    if expected_name not in subprocess.getoutput("aoc -list-boards"):
        _abort("check the board config in ~/.bashrc")
    return board_cfg


def get_rpt_data(proj_dir):
    '''
    Read 'conv/reports/lib/json/summary.json' and return the 'data' list of
    the 'Total' row of 'estimatedResources' followed by its 'data_percent' list.
    Raises:
        ValueError: if the report has no row named 'Total'.
    '''
    report_path = proj_dir + '/conv/reports/lib/json/summary.json'
    with open(report_path) as report_file:
        summary = json.load(report_file)
    for entry in summary['estimatedResources']['children']:
        if entry['name'] == 'Total':
            return entry['data'] + entry['data_percent']
    raise ValueError("no 'Total' row in " + report_path)


def decode_hw_log(ch, line):
    '''
    Extract the number that follows the label `ch` at the start of `line`.
    Args:
        ch: label bytes such as b'ALUTs:'; `line` must start with it
        line: bytes starting at the label
    Returns:
        The value as a float, with thousands separators (',') removed. The
        value ends at the first newline or ' /', whichever comes first, or at
        the end of `line` when neither is present.
    '''
    payload = line[len(ch):]
    ends = [pos for pos in (payload.find(b'\n'), payload.find(b' /')) if pos >= 0]
    if ends:
        payload = payload[:min(ends)]
    return float(payload.replace(b',', b''))


def get_hw_data(proj_dir, tail_bytes=_TAIL_BYTES):
    '''
    Scrape the resource summary from the last `tail_bytes` bytes of
    'proj_dir/conv.aocx'.
    Returns:
        Seven floats, always in this order: ALUTs, Registers, Logic
        utilization, DSP blocks, Memory bits, RAM blocks, Actual clock freq.
    Raises:
        SystemExit: if any of the seven entries is missing.
    '''
    labels = [b'ALUTs:', b'Registers:', b'Logic utilization:', b'DSP blocks:',
              b'Memory bits:', b'RAM blocks:', b'Actual clock freq:']
    found = {}
    for line in _read_tail_lines(proj_dir + '/conv.aocx', tail_bytes):
        for label in labels:
            pos = line.find(label)
            # Store by label (first occurrence wins) so the result does not
            # depend on the order in which the entries appear in the file.
            if pos != -1 and label not in found:
                found[label] = decode_hw_log(label, line[pos:])

    if len(found) != len(labels):
        _abort("get aocx information fail!!!!")
    return [found[label] for label in labels]


def get_run_data(proj_dir, tail_bytes=_TAIL_BYTES):
    '''
    Scrape per-layer kernel times from the last `tail_bytes` bytes of
    'proj_dir/run_log.txt'.
    Each 'Layer-<n>:' line must be followed by five lines holding, in order,
    the MemRd, Conv, Pool, MemWr and Lrn times in the form '<name> : <t> ms'.
    Returns:
        A 5 x LAYER_NUM table; row j is stage j (order above) and column n-1
        is layer n. Layers not found in the tail stay 0.
    Raises:
        SystemExit: if a layer block is incomplete or malformed, or the layer
            number is outside 1..LAYER_NUM.
    '''
    import re

    stages = [b'MemRd', b'Conv', b'Pool', b'MemWr', b'Lrn']
    header_re = re.compile(rb'Layer-(\d+)\s*:')
    # tolerate any amount of padding around the ':' after the stage name
    stage_res = [re.compile(re.escape(name) + rb'\s*:\s*(\S+)\s*ms') for name in stages]
    run_data = [[0] * LAYER_NUM for _ in stages]

    lines = _read_tail_lines(proj_dir + '/run_log.txt', tail_bytes)
    for i, line in enumerate(lines):
        header = header_re.search(line)
        if header is None:
            continue
        layer = int(header.group(1))
        block = lines[i + 1:i + 1 + len(stages)]
        if not 1 <= layer <= LAYER_NUM or len(block) < len(stages):
            _abort('collect run time information fail!!!!')
        for j, (stage_re, stage_line) in enumerate(zip(stage_res, block)):
            match = stage_re.search(stage_line)
            if match is None:
                _abort('collect run time information fail!!!!')
            run_data[j][layer - 1] = float(match.group(1))
    return run_data


def create_project(orig_dir, board_cfg, VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X):
    '''
    Create and build the project for one (VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X)
    combination by copying the original project directory.
    Args:
        orig_dir: the original project directory to copy from
        board_cfg: board suffix appended to the new directory name
        VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X: parameters passed to edit_make
    Returns:
        Path of the new project directory.
    '''
    import shutil

    proj_dir = './project_vec%d_lan%d_gpx%d' % (VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X) + board_cfg
    if os.path.exists(proj_dir):  # start from a clean copy
        shutil.rmtree(proj_dir)

    # symlinks=True copies links as links instead of duplicating their targets
    shutil.copytree(orig_dir, proj_dir, symlinks=True)
    edit_make(proj_dir, VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X)  # set parameters and build

    return proj_dir
LAB_RUN[i]) 249 | 250 | row = 1 251 | for VEC_SIZE in [16]: #larger than 4, i.e., 4, 8, 16 252 | for LANE_NUM in [8, 22]: #larger than 1, for alexnet: 2, 3, 4, 8, 12, 15, 16, 22, 28, 32, 34, 48, 50, 51, 64, ... 253 | for CONV_GP_SIZE_X in [7]: 254 | for board in [de5_net]:#[de5_net, de10_std, de10_nano, de5a_net] 255 | board_cfg = env_cfg(board)#验证bashrc所配置的环境 256 | proj_dir = create_project(orig_dir, board_cfg, VEC_SIZE, LANE_NUM, CONV_GP_SIZE_X)#建立并编译工程 257 | 258 | if FUN_SEL == REPORT: 259 | #获取report信息并存储 260 | rpt_data = get_rpt_data(proj_dir) 261 | report_sheet.write(row, 0, label=board_cfg[1:]) 262 | report_sheet.write(row, 1, label=VEC_SIZE) 263 | report_sheet.write(row, 2, label=LANE_NUM) 264 | report_sheet.write(row, 3, label=CONV_GP_SIZE_X) 265 | for col in range(len(rpt_data)): 266 | report_sheet.write(row, 4+col, label=rpt_data[col]) 267 | 268 | row = row + 1 269 | 270 | if FUN_SEL == HW: 271 | run_emu(proj_dir)#run.exe conv.aocx 272 | #获取report信息并存储 273 | rpt_data = get_rpt_data(proj_dir) 274 | report_sheet.write(row,0,label=board_cfg[1:]) 275 | report_sheet.write(row,1,label=VEC_SIZE) 276 | report_sheet.write(row,2,label=LANE_NUM) 277 | report_sheet.write(row,3,label=CONV_GP_SIZE_X) 278 | for col in range(len(rpt_data)): 279 | report_sheet.write(row,4+col,label=rpt_data[col]) 280 | 281 | #获取aocx中的信息并存储 282 | hw_data = get_hw_data(proj_dir) 283 | hwinfo_sheet.write(row,0,label=board_cfg[1:]) 284 | hwinfo_sheet.write(row,1,label=VEC_SIZE) 285 | hwinfo_sheet.write(row,2,label=LANE_NUM) 286 | hwinfo_sheet.write(row,3,label=CONV_GP_SIZE_X) 287 | for col in range(len(hw_data)): 288 | hwinfo_sheet.write(row,4+col,label=hw_data[col]) 289 | 290 | #获取run_log.txt中的时间消耗信息 291 | run_data = get_run_data(proj_dir) 292 | 293 | runtim_sheet.write((row-1)*6+1,0,label=board_cfg[1:]) 294 | runtim_sheet.write((row-1)*6+1,1,label=VEC_SIZE) 295 | runtim_sheet.write((row-1)*6+1,2,label=LANE_NUM) 296 | runtim_sheet.write((row-1)*6+1,3,label=CONV_GP_SIZE_X) 297 | 298 | 
LAB_FUN = ['MemRd', 'Conv', 'Pool', 'MemWr', 'Lrn'] 299 | for fun in range(5): 300 | runtim_sheet.write((row-1)*6+fun+1,4,label=LAB_FUN[fun]) 301 | for col in range(LAYER_NUM): 302 | runtim_sheet.write((row-1)*6+fun+1,col+5,label=run_data[fun][col]) 303 | 304 | row = row + 1 305 | 306 | elif FUN_SEL == SW_EMU: 307 | run_emu(proj_dir)#run.exe conv.aocx 308 | run_data = get_run_data(proj_dir) 309 | 310 | runtim_sheet.write((row-1)*6+1,0,label=board_cfg[1:]) 311 | runtim_sheet.write((row-1)*6+1,1,label=VEC_SIZE) 312 | runtim_sheet.write((row-1)*6+1,2,label=LANE_NUM) 313 | runtim_sheet.write((row-1)*6+1,3,label=CONV_GP_SIZE_X) 314 | 315 | LAB_FUN = ['MemRd', 'Conv', 'Pool', 'MemWr', 'Lrn'] 316 | for fun in range(5):#对于每一层包含的每个功能 317 | runtim_sheet.write((row-1)*6+fun+1,4,label=LAB_FUN[fun]) 318 | for col in range(LAYER_NUM): 319 | runtim_sheet.write((row-1)*6+fun+1,col+5,label=run_data[fun][col]) 320 | 321 | row = row + 1 322 | 323 | clean_project(proj_dir, CLEAN_PROJ)#选择是否清除Quartus工程,默认0不清除 324 | 325 | if FUN_SEL == REPORT or FUN_SEL == HW: 326 | workbook.save('report_%s.xls'%datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')) -------------------------------------------------------------------------------- /PipeCNN_note_resource/flops_calculation_v2.py: -------------------------------------------------------------------------------- 1 | # 本程序用来计算给定网络的计算量 2 | 3 | import numpy as np 4 | import xlwt 5 | import matplotlib.pyplot as plt 6 | 7 | DDR_BANDWIDTH = 95400000000*2 #bits/s 8 | DATA_WIDTH = 8 #数据宽度为8bits 9 | 10 | #AlexNet 11 | LAYER_NUM = 8 12 | NUM_CONFIG_ITEM = 25 13 | layer_config_original = [[0, 14 | 227, 227, 3, 11, 11, 3, 96, 96, 15 | 0, 16 | 55, 55, 96, 4, 0, 0, 1, 17 | 1, 27, 27, 96, 3, 2, 18 | 1, 19 | 1],#Layer-1 20 | [0, 21 | 27, 27, 96, 5, 5, 48, 256, 256, 22 | 0, 23 | 27, 27, 256, 1, 2, 1, 1, 24 | 1, 13, 13, 256, 3, 2, 25 | 1, 26 | 1],#Layer-2 27 | [0, 28 | 13, 13, 256, 3, 3, 256, 384, 384, 29 | 0, 30 | 13, 13, 384, 1, 1, 0, 1, 31 | 0, 13, 13, 384, 0, 
0, 32 | 0, 33 | 1],#Layer-3 34 | [0, 35 | 13, 13, 384, 3, 3, 192, 384, 384, 36 | 1, 37 | 13, 13, 384, 1, 1, 1, 1, 38 | 0, 13, 13, 384, 0, 0, 39 | 0, 40 | 0],#Layer-4 41 | [0, 42 | 13, 13, 384, 3, 3, 192, 256, 256, 43 | 0, 44 | 13, 13, 256, 1, 1, 1, 1, 45 | 1, 6, 6, 256, 3, 2, 46 | 0, 47 | 1],#Layer-5 Note: for last conv layer, outputs are write to fc buffer 48 | [1, 49 | 6, 6, 256, 6, 6, 256, 4096, 4096, # Note: The input size (dim1/dim2) is the combined data size (batched) 50 | 4, 51 | 1, 1, 4096, 6, 0, 0, 1, 52 | 0, 1, 1, 4096, 0, 0, 53 | 0, 54 | 2],#Layer-6 fc 55 | [1, 56 | 1, 1, 4096, 1, 1, 4096, 4096, 4096, 57 | 2, 58 | 1, 1, 4096, 1, 0, 0, 1, 59 | 0, 1, 1, 4096, 0, 0, 60 | 0, 61 | 3],#Layer-7 fc 62 | [1, 63 | 1, 1, 4096, 1, 1, 4096, 1024, 1024, 64 | 3, 65 | 1, 1, 1024, 1, 0, 0, 0, 66 | 0, 1, 1, 1024, 0, 0, 67 | 0, 68 | 2]]#Layer-8 fc 69 | 70 | # ##VGG 71 | # LAYER_NUM = 16 72 | # NUM_CONFIG_ITEM = 25 73 | # layer_config_original = [[0, 74 | # 224, 224, 3, 3, 3, 3, 64, 64, 75 | # 0, 76 | # 224, 224, 64, 1, 1, 0, 1, 77 | # 0, 224, 224, 64, 0, 0, 78 | # 0, 79 | # 1],#Layer-1 (conv1_1) 80 | # [0, 81 | # 224, 224, 64, 3, 3, 64, 64, 64, 82 | # 1, 83 | # 224, 224, 64, 1, 1, 0, 1, 84 | # 1, 112, 112, 64, 2, 2, 85 | # 0, 86 | # 0],#Layer-2 (conv1_2) 87 | # [0, 88 | # 112, 112, 64, 3, 3, 64, 128, 128, 89 | # 4, 90 | # 112, 112, 128, 1, 1, 0, 1, 91 | # 0, 112, 112, 128, 0, 0, 92 | # 0, 93 | # 1],#Layer-3 (conv2_1) 94 | # [0, 95 | # 112, 112, 128, 3, 3, 128, 128, 128, 96 | # 1, 97 | # 112, 112, 128, 1, 1, 0, 1, 98 | # 1, 56, 56, 128, 2, 2, 99 | # 0, 100 | # 0],#Layer-4 (conv2_2) 101 | # [0, 102 | # 56, 56, 128, 3, 3, 128, 256, 256, 103 | # 4, 104 | # 56, 56, 256, 1, 1, 0, 1, 105 | # 0, 56, 56, 256, 0, 0, 106 | # 0, 107 | # 1],#Layer-5 (conv3_1) 108 | # [0, 109 | # 56, 56, 256, 3, 3, 256, 256, 256, 110 | # 1, 111 | # 56, 56, 256, 1, 1, 0, 1, 112 | # 0, 56, 56, 256, 0, 0, 113 | # 0, 114 | # 0],#Layer-6 (conv3_2) 115 | # [0, 116 | # 56, 56, 256, 3, 3, 256, 256, 256, 117 | # 
0, 118 | # 56, 56, 256, 1, 1, 0, 1, 119 | # 1, 28, 28, 256, 2, 2, 120 | # 0, 121 | # 1],#Layer-7 (conv3_3) 122 | # [0, 123 | # 28, 28, 256, 3, 3, 256, 512, 512, 124 | # 4, 125 | # 28, 28, 512, 1, 1, 0, 1, 126 | # 0, 28, 28, 512, 0, 0, 127 | # 0, 128 | # 0],#Layer-8 (conv4_1) 129 | # [0, 130 | # 28, 28, 512, 3, 3, 512, 512, 512, 131 | # 0, 132 | # 28, 28, 512, 1, 1, 0, 1, 133 | # 0, 28, 28, 512, 0, 0, 134 | # 0, 135 | # 1],#Layer-9 (conv4_2) 136 | # [0, 137 | # 28, 28, 512, 3, 3, 512, 512, 512, 138 | # 1, 139 | # 28, 28, 512, 1, 1, 0, 1, 140 | # 1, 14, 14, 512, 2, 2, 141 | # 0, 142 | # 0],#Layer-10 (conv4_3) 143 | # [0, 144 | # 14, 14, 512, 3, 3, 512, 512, 512, 145 | # 4, 146 | # 14, 14, 512, 1, 1, 0, 1, 147 | # 0, 14, 14, 512, 0, 0, 148 | # 0, 149 | # 1],#Layer-11 (conv5_1) 150 | # [0, 151 | # 14, 14, 512, 3, 3, 512, 512, 512, 152 | # 1, 153 | # 14, 14, 512, 1, 1, 0, 1, 154 | # 0, 14, 14, 512, 0, 0, 155 | # 0, 156 | # 0],#Layer-12 (conv5_2) 157 | # [0, 158 | # 14, 14, 512, 3, 3, 512, 512, 512, 159 | # 0, 160 | # 14, 14, 512, 1, 1, 0, 1, 161 | # 1, 7, 7, 512, 2, 2, 162 | # 0, 163 | # 1],#Layer-13 (conv5_3) Note: for last conv layer, outputs are write to fc buffer 164 | # [1, 165 | # 7, 7, 512, 7, 7, 512, 4096, 4096, 166 | # 4, 167 | # 1, 1, 4096, 7, 0, 0, 1, 168 | # 0, 1, 1, 4096, 0, 0, 169 | # 0, 170 | # 2],#Layer-14 (fc6) 171 | # [1, 172 | # 1, 1, 4096, 1, 1, 4096, 4096, 4096, 173 | # 2, 174 | # 1, 1, 4096, 1, 0, 0, 1, 175 | # 0, 1, 1, 4096, 0, 0, 176 | # 0, 177 | # 3],#Layer-15 (fc7) 178 | # [1, 179 | # 1, 1, 4096, 1, 1, 4096, 1024, 1024, 180 | # 3, 181 | # 1, 1, 1024, 1, 0, 0, 0, 182 | # 0, 1, 1, 1024, 0, 0, 183 | # 0, 184 | # 2]#Layer-16 (fc8) 185 | # ] 186 | 187 | # Configuration file instructions 188 | [layer_type, # "0" -> conv, "1" -> fc 189 | 190 | data_w, data_h, data_n, weight_w, weight_h, weight_n, weight_m, bias_size, #/memRd Parameters 191 | 192 | memrd_src, #"0"-> data_buf "1"-> output_buf "2"->"fc_1_buffer" "3"->"fc_2_buffer" "4"->"pool_buffer" 
"5"->"eltwise_buf"(resnet) 193 | 194 | conv_x, conv_y, conv_z, conv_stride, conv_padding, conv_split, conv_relu, #Conv Parameters 195 | 196 | pool_on, pool_x, pool_y, pool_z, pool_size, pool_stride, # Pooling Parameters 197 | 198 | lrn_on,# lrn on/off control 199 | 200 | memwr_dst] = [i for i in range(25)]#"0"-> data_buf "1"-> output_buf "2"->"fc_1_buffer" "3"->"fc_2_buffer" 201 | 202 | [CONV_LAYER, FC_LAYER] = [0, 1]#与上面的layer_type想对应 203 | 204 | def vail_vec_lan(layer_config): 205 | ''' 206 | 计算VEC_SIZE 和 LANE_NUM的可能值 207 | VEC_SIZE 需要大于等于4,并且除去输入层,其余各层都需要被weight_n整除, i.e., 4, 8, 16, ...;典型值为16 208 | LANE_NUM 需要大于1,对于alexnet: 2, 3, 4, 8, 12, 15, 16, 22, 28, 32, 34, 48, 50, 51, 64, ...; 典型值16 209 | ''' 210 | vec_size_list=[] 211 | for vec_size in range(4,100): 212 | for ll in range(1,LAYER_NUM): 213 | if (np.mod(layer_config[ll][weight_n],vec_size)!=0): 214 | break 215 | if(ll==LAYER_NUM-1): 216 | vec_size_list.append(vec_size) 217 | 218 | lane_num_list=[] 219 | for lan_num in range(2,100): 220 | for ll in range(1,LAYER_NUM): 221 | if (np.mod(np.ceil(layer_config[ll][weight_m]/lan_num),2)!=0 and (layer_config[ll][conv_split]==1)): 222 | break 223 | if(ll==LAYER_NUM-1): 224 | lane_num_list.append(lan_num) 225 | 226 | return vec_size_list,lane_num_list#返回可能的vec_size和lane_num 227 | 228 | 229 | def cal_ftp(layer_config, layer_config_original, layer_num, vec_size, lane_num, frequency): 230 | ''' 231 | 计算神经网络中每层的"计算量flops","吞吐量turoughput"和"参数量paras" 232 | Input Args: 233 | layer_config: 网络的配置参数 234 | layer_config_original:在prepare()操作之前的原始网络参数 235 | layer_num:总共网络的层数 236 | vec_size,lane_num:并行度参数 237 | frequency:FPGA运行频率,可以在conv.aocx文件中找到 238 | 239 | Output Args: 240 | flops:返回每层的计算量 241 | time_layer:每层网络消耗的时间 242 | paras:每层网络的参数量 243 | conv_time_group:每组的卷积时间 244 | read_time_group:每组读取数据的时间 245 | ''' 246 | flops = [ 0 for i in range(layer_num)]#每层的计算量 247 | time_layer = [ 0 for i in range(layer_num)]#每层的消耗时间 248 | paras = [ 0 for i in range(layer_num)]#每层的参数量 249 | 250 
| conv_time_group = [ 0 for i in range(layer_num)]#每层网络,每组卷积时间 251 | read_time_group = [ 0 for i in range(layer_num)]#每层网络,每组读取时间 252 | 253 | for ll in range(layer_num): 254 | #这里mac记为两个运算,即“加法”和“乘法”,所以计算量是×2的,并且加上了bias的加法运算 255 | flops[ll] = 2 * layer_config[ll][weight_w] * layer_config[ll][weight_h] * layer_config[ll][weight_n] * layer_config[ll][conv_x] * layer_config[ll][conv_y] * layer_config[ll][weight_m] 256 | 257 | if(layer_config[ll][layer_type]==CONV_LAYER):#卷积层的瓶颈在于卷积计算 258 | time_layer[ll] = flops[ll]/(2*vec_size*lane_num*frequency) 259 | X = 7#一个组里面有几个卷积 260 | elif(layer_config[ll][layer_type]==FC_LAYER):#全连接层的瓶颈在于读取数据,数据的尺寸和卷积核尺寸相等,并且读入的是补零前的原始数据 261 | time_layer[ll] = layer_config_original[ll][weight_w] * layer_config_original[ll][weight_h] * layer_config_original[ll][weight_n] * layer_config_original[ll][weight_m] * DATA_WIDTH / DDR_BANDWIDTH #加载weight 262 | time_layer[ll] += layer_config_original[ll][weight_w] * layer_config_original[ll][weight_h] * layer_config_original[ll][weight_n] * DATA_WIDTH / DDR_BANDWIDTH #加载数据 263 | X = 1 264 | paras[ll] = layer_config[ll][weight_w] * layer_config[ll][weight_h] * layer_config[ll][weight_n] * layer_config[ll][weight_m]#每层参数个数 265 | 266 | conv_time_group[ll] = layer_config[ll][weight_w]*layer_config[ll][weight_h]*layer_config[ll][weight_n] * X * lane_num/(lane_num*vec_size*frequency) 267 | read_time_group[ll] = (((X - 1) * layer_config_original[ll][conv_stride] + layer_config_original[ll][weight_w]) * layer_config_original[ll][weight_h]*layer_config_original[ll][weight_n]+layer_config_original[ll][weight_w]*layer_config_original[ll][weight_h]*layer_config_original[ll][weight_n]*lane_num) * DATA_WIDTH / DDR_BANDWIDTH 268 | 269 | return flops, time_layer, paras, conv_time_group, read_time_group 270 | 271 | def prepare(layer_config_original, layer_num, vec_size, lane_num): 272 | ''' 273 | 类似于host端的prepare()函数 274 | ''' 275 | layer_config = [[0 for i in range(NUM_CONFIG_ITEM)] for j in range(layer_num)] 276 | 
for ll in range(layer_num): 277 | 278 | #First, create a new layer config 279 | for ii in range(NUM_CONFIG_ITEM): 280 | layer_config[ll][ii] = layer_config_original[ll][ii] 281 | 282 | #Second, perform padding on dim4, when it is not divisible by LANE_NUM 283 | if(layer_config[ll][weight_m]%lane_num != 0): 284 | layer_config[ll][weight_m] = np.ceil(float(layer_config[ll][weight_m])/lane_num)*lane_num 285 | layer_config[ll][bias_size] = layer_config[ll][weight_m] 286 | #对输入图像的深度进行扩展,使其是VEC_SIZE的公倍数 287 | layer_config[0][weight_n] = np.ceil(float(layer_config[0][weight_n])/vec_size)*vec_size 288 | layer_config[0][data_n] = layer_config[0][weight_n] 289 | return layer_config 290 | 291 | ## 主功能 292 | flops_orig, time_layer_orig, paras_orig, a, b = cal_ftp(layer_config_original,layer_config_original, LAYER_NUM, 1, 1,1)#原始网络每层的时间消耗没有太大意义 293 | 294 | vec_size_list = [4,4,4,4,4,4,4,8,8,8,8,8,8,8,8,16,16,16,16,16] 295 | lane_num_list = [2,4,8,16,22,32,64,2,4,8,16,22,28,32,48,2,4,8,16,22] 296 | frequency_list= [249.3,249.4,240.6,238.1,222.3,216.5,204.2,254.1,252.9,241.1,225.6,188,195.5,198.9,155.9,264.7,248.9,232.3,229.1,78.76]#频率 297 | 298 | # 创建一个workbook 设置编码 299 | workbook = xlwt.Workbook(encoding = 'utf-8') 300 | # 创建一个worksheet 301 | worksheet1 = workbook.add_sheet('total time') 302 | 303 | if not(len(vec_size_list)==len(lane_num_list) and len(lane_num_list)==len(frequency_list)): 304 | print("Check the vec_size list and lane_num_list and frequency_list!!") 305 | exit() 306 | # 创建表格表头 307 | for i in range(3 + LAYER_NUM): 308 | if i == 0: 309 | worksheet1.write(i,0,label='VEC_SIZE') 310 | if i == 1: 311 | worksheet1.write(i,0,label='LANE_NUMBER') 312 | if i == 2: 313 | worksheet1.write(i,0,label='Totle time') 314 | if i >= 3: 315 | worksheet1.write(i,0,label='Layer-%d'%(i-2)) 316 | # 分别将网络的总耗时和每层的耗时写入表格 317 | for i in range(len(vec_size_list)): 318 | layer_config = prepare(layer_config_original, LAYER_NUM, vec_size_list[i], lane_num_list[i]) 319 | flops, time_layer, 
paras, a, b= cal_ftp(layer_config, layer_config_original, LAYER_NUM, vec_size_list[i],lane_num_list[i], (frequency_list[i]*1000000)) 320 | worksheet1.write(0,1+i,label=vec_size_list[i]) 321 | worksheet1.write(1,1+i,label=lane_num_list[i]) 322 | worksheet1.write(2,1+i,label=(1000*np.sum(time_layer))) 323 | 324 | for j in range(LAYER_NUM): 325 | worksheet1.write(3+j,1+i,label='%.2f'%(1000*time_layer[j])) 326 | 327 | workbook.save('flops.xls') 328 | 329 | # 每层卷积和读取数据时间对比 330 | LAYER = -3#对选定的VEC_SIZE LANE_NUM进行观察每个group的时间 331 | layer_config = prepare(layer_config_original, LAYER_NUM, vec_size_list[LAYER], lane_num_list[LAYER]) 332 | flops, time_layer, paras, a, b= cal_ftp(layer_config, layer_config_original, LAYER_NUM, vec_size_list[LAYER],lane_num_list[LAYER], (frequency_list[LAYER]*1000000)) 333 | 334 | plt.figure() 335 | plt.bar(np.arange(LAYER_NUM)+1, b, alpha=0.5, width=0.3, color='yellow', edgecolor='red', label='read time', lw=3) 336 | plt.bar(np.arange(LAYER_NUM)+1+0.4, a, alpha=0.2, width=0.3, color='green', edgecolor='blue', label='conv time', lw=3) 337 | plt.legend(loc='upper left') 338 | plt.title("time consumption for reading and conving") 339 | plt.show() --------------------------------------------------------------------------------