├── .gitignore ├── DeepStream ├── Readme.md ├── config_infer_primary_yoloV4.txt ├── deepstream_app_config_yoloV4.txt ├── labels.txt └── nvdsinfer_custom_impl_Yolo │ ├── Makefile │ ├── Readme.md │ ├── kernels.cu │ ├── nvdsinfer_yolo_engine.cpp │ ├── nvdsparsebbox_Yolo.cpp │ ├── trt_utils.cpp │ ├── trt_utils.h │ ├── yolo.cpp │ ├── yolo.h │ ├── yoloPlugins.cpp │ └── yoloPlugins.h ├── License.txt ├── README.md ├── Use_yolov4_to_train_your_own_data.md ├── cfg.py ├── cfg ├── yolov3-tiny.cfg ├── yolov3.cfg ├── yolov4-custom.cfg ├── yolov4-sam-mish.cfg ├── yolov4-tiny.cfg └── yolov4.cfg ├── data ├── coco.names ├── dog.jpg ├── giraffe.jpg ├── prediction.jpg └── voc.names ├── dataset.py ├── demo.py ├── demo_darknet2onnx.py ├── demo_pytorch2onnx.py ├── demo_tensorflow.py ├── demo_trt.py ├── evaluate_on_coco.py ├── models.py ├── requirements.txt ├── tool ├── __init__.py ├── camera.py ├── coco_annotation.py ├── config.py ├── darknet2onnx.py ├── darknet2pytorch.py ├── onnx2tensorflow.py ├── region_loss.py ├── torch_utils.py ├── tv_reference │ ├── README.md │ ├── coco_eval.py │ ├── coco_utils.py │ ├── engine.py │ ├── group_by_aspect_ratio.py │ ├── train.py │ ├── transforms.py │ └── utils.py ├── utils.py ├── utils_iou.py ├── utils_iou_test.py └── yolo_layer.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | ttest 2 | *.weights 3 | *.pth 4 | *.onnx 5 | *.engine 6 | *.pyc 7 | *.infer 8 | *.npy 9 | 10 | z_demo_* 11 | 12 | __pycache__ 13 | .idea 14 | .vscode 15 | runs 16 | log 17 | 18 | *.jpg 19 | *.json 20 | data/outcome 21 | -------------------------------------------------------------------------------- /DeepStream/Readme.md: -------------------------------------------------------------------------------- 1 | # This should be run in JetPack 4.4 / JetPack 4.4 G.A. with DeepStream 5.0 / DeepStream 5.0 GA . 2 | 3 | 1. Compile the custom plugin for Yolo 4 | 2. Convert the ONNX file to TRT with TRTEXEC / TensorRT 5 | 3. Change the model-engine-file in config_infer_primary_yoloV4.txt 6 | 4. In the deepstream_app_config_yoloV4.txt, change 7 | a) source0 : uri=file: directory. 8 | b) primary-gie : model-engine-file= 9 | # Note that for multi-batch, overhead is large owing to NMS is not used. 10 | -------------------------------------------------------------------------------- /DeepStream/config_infer_primary_yoloV4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | # Following properties are mandatory when engine files are not specified: 24 | # int8-calib-file(Only in INT8), model-file-format 25 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 26 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 27 | # ONNX: onnx-file 28 | # 29 | # Mandatory properties for detectors: 30 | # num-detected-classes 31 | # 32 | # Optional properties for detectors: 33 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 34 | # custom-lib-path 35 | # parse-bbox-func-name 36 | # 37 | # Mandatory properties for classifiers: 38 | # classifier-threshold, is-classifier 39 | # 40 | # Optional properties for classifiers: 41 | # classifier-async-mode(Secondary mode only, Default=false) 42 | # 43 | # Optional properties in secondary mode: 44 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 45 | # input-object-min-width, input-object-min-height, input-object-max-width, 46 | # input-object-max-height 47 | # 48 | # Following properties are always recommended: 49 | # batch-size(Default=1) 50 | # 51 | # Other optional properties: 52 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 53 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 54 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 55 | # custom-lib-path, network-mode(Default=0 i.e FP32) 56 | # 57 | # The values in the config file are overridden by values set through GObject 58 | # properties. 59 | 60 | [property] 61 | gpu-id=0 62 | net-scale-factor=0.0039215697906911373 63 | #0=RGB, 1=BGR 64 | model-color-format=0 65 | model-engine-file= 66 | labelfile-path=labels.txt 67 | ## 0=FP32, 1=INT8, 2=FP16 mode 68 | network-mode=2 69 | num-detected-classes=80 70 | gie-unique-id=1 71 | network-type=0 72 | is-classifier=0 73 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 74 | cluster-mode=2 75 | maintain-aspect-ratio=1 76 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 77 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 78 | engine-create-func-name=NvDsInferYoloCudaEngineGet 79 | #scaling-filter=0 80 | #scaling-compute-hw=0 81 | #output-blob-names=2012 82 | 83 | [class-attrs-all] 84 | nms-iou-threshold=0.2 85 | pre-cluster-threshold=0.4 86 | -------------------------------------------------------------------------------- /DeepStream/deepstream_app_config_yoloV4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [application] 24 | enable-perf-measurement=1 25 | perf-measurement-interval-sec=5 26 | #gie-kitti-output-dir=streamscl 27 | 28 | [tiled-display] 29 | enable=0 30 | rows=1 31 | columns=1 32 | width=1280 33 | height=720 34 | gpu-id=0 35 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 36 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 37 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 38 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 39 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 40 | nvbuf-memory-type=0 41 | 42 | [source0] 43 | enable=1 44 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 45 | type=3 46 | uri=file: 47 | 48 | num-sources=1 49 | gpu-id=0 50 | # (0): memtype_device - Memory type Device 51 | # (1): memtype_pinned - Memory type Host Pinned 52 | # (2): memtype_unified - Memory type Unified 53 | cudadec-memtype=0 54 | 55 | [sink0] 56 | enable=1 57 | #Type - 1=FakeSink 2=EglSink 3=File 58 | type=2 59 | sync=1 60 | source-id=0 61 | gpu-id=0 62 | 63 | [osd] 64 | enable=1 65 | gpu-id=0 66 | border-width=1 67 | text-size=12 68 | text-color=1;1;1;1; 69 | text-bg-color=0.3;0.3;0.3;1 70 | font=Serif 71 | show-clock=0 72 | clock-x-offset=800 73 | clock-y-offset=820 74 | clock-text-size=12 75 | clock-color=1;0;0;0 76 | nvbuf-memory-type=0 77 | 78 | [streammux] 79 | gpu-id=0 80 | ##Boolean property to inform muxer that sources are live 81 | live-source=0 82 | batch-size=1 83 | ##time out in usec, to wait after the first buffer is available 84 | ##to push the batch even if the complete batch is not formed 85 | batched-push-timeout=40000 86 | ## Set muxer output width and height 87 | width=1280 88 | height=720 89 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 90 | ##along with width, height properties 91 | enable-padding=0 92 | nvbuf-memory-type=0 93 | 94 | # config-file property is mandatory for any gie section. 95 | # Other properties are optional and if set will override the properties set in 96 | # the infer config file. 
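# For example (illustrative note only, not an additional required property): uncommenting
# batch-size=1 in the [primary-gie] section below would override any batch-size value set
# in config_infer_primary_yoloV4.txt.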
97 | [primary-gie] 98 | enable=1 99 | gpu-id=0 100 | model-engine-file= 101 | labelfile-path=labels.txt 102 | #batch-size=1 103 | #Required by the app for OSD, not a plugin property 104 | bbox-border-color0=1;0;0;1 105 | bbox-border-color1=0;1;1;1 106 | bbox-border-color2=0;0;1;1 107 | bbox-border-color3=0;1;0;1 108 | interval=0 109 | gie-unique-id=1 110 | nvbuf-memory-type=0 111 | config-file=config_infer_primary_yoloV4.txt 112 | 113 | [sink1] 114 | enable=1 115 | type=3 116 | #1=mp4 2=mkv 117 | container=1 118 | #1=h264 2=h265 3=mpeg4 119 | codec=1 120 | #encoder type 0=Hardware 1=Software 121 | enc-type=0 122 | sync=0 123 | bitrate=4000000 124 | #H264 Profile - 0=Baseline 2=Main 4=High 125 | #H265 Profile - 0=Main 1=Main10 126 | profile=0 127 | output-file=fp16_clip1_cam1.mp4 128 | source-id=0 129 | 130 | [tracker] 131 | enable=1 132 | # For the case of NvDCF tracker, tracker-width and tracker-height must be a multiple of 32, respectively 133 | tracker-width=608 134 | tracker-height=608 135 | #ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_mot_iou.so 136 | #ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_nvdcf.so 137 | ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_mot_klt.so 138 | #ll-config-file required for IOU only 139 | #ll-config-file=iou_config.txt 140 | gpu-id=0 141 | 142 | [tests] 143 | file-loop=0 144 | -------------------------------------------------------------------------------- /DeepStream/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | CUDA_VER?= 24 | ifeq ($(CUDA_VER),) 25 | $(error "CUDA_VER is not set") 26 | endif 27 | CC:= g++ 28 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 29 | 30 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 31 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include 32 | 33 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 34 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 35 | 36 | INCS:= $(wildcard *.h) 37 | SRCFILES:= nvdsinfer_yolo_engine.cpp \ 38 | nvdsparsebbox_Yolo.cpp \ 39 | yoloPlugins.cpp \ 40 | trt_utils.cpp \ 41 | yolo.cpp \ 42 | kernels.cu 43 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 44 | 45 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 46 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 47 | 48 | all: $(TARGET_LIB) 49 | 50 | %.o: %.cpp $(INCS) Makefile 51 | $(CC) -c -o $@ $(CFLAGS) $< 52 | 53 | %.o: %.cu $(INCS) Makefile 54 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 55 | 56 | $(TARGET_LIB) : $(TARGET_OBJS) 57 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 58 | 59 | clean: 60 | rm -rf $(TARGET_LIB) 61 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/Readme.md: -------------------------------------------------------------------------------- 1 | export CUDA_VER=X.Y 2 | make 3 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/kernels.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2019 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 
9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } 19 | 20 | __global__ void gpuYoloLayerV3(const float* input, float* output, const uint gridSize, const uint numOutputClasses, 21 | const uint numBBoxes) 22 | { 23 | uint x_id = blockIdx.x * blockDim.x + threadIdx.x; 24 | uint y_id = blockIdx.y * blockDim.y + threadIdx.y; 25 | uint z_id = blockIdx.z * blockDim.z + threadIdx.z; 26 | 27 | if ((x_id >= gridSize) || (y_id >= gridSize) || (z_id >= numBBoxes)) 28 | { 29 | return; 30 | } 31 | 32 | const int numGridCells = gridSize * gridSize; 33 | const int bbindex = y_id * gridSize + x_id; 34 | 35 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] 36 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]); 37 | 38 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] 39 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]); 40 | 41 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] 42 | = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]); 43 | 44 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] 45 | = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]); 46 | 47 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] 48 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); 49 | 50 | for (uint i = 0; i < numOutputClasses; ++i) 51 | { 52 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] 53 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]); 54 | } 55 | } 56 | 57 | cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint& batchSize, const uint& gridSize, 58 | const uint& numOutputClasses, const uint& numBBoxes, 59 | uint64_t outputSize, cudaStream_t stream); 60 | 61 | cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint& batchSize, const uint& gridSize, 62 | const uint& numOutputClasses, const uint& numBBoxes, 63 | uint64_t outputSize, cudaStream_t stream) 64 | { 65 | dim3 threads_per_block(16, 16, 4); 66 | dim3 number_of_blocks((gridSize / threads_per_block.x) + 1, 67 | (gridSize / threads_per_block.y) + 1, 68 | (numBBoxes / threads_per_block.z) + 1); 69 | for (unsigned int batch = 0; batch < batchSize; ++batch) 70 | { 71 | gpuYoloLayerV3<<>>( 72 | reinterpret_cast(input) + (batch * outputSize), 73 | reinterpret_cast(output) + (batch * outputSize), gridSize, numOutputClasses, 74 | numBBoxes); 75 | } 76 | return cudaGetLastError(); 77 | } 78 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #include "nvdsinfer_custom_impl.h" 24 | #include "nvdsinfer_context.h" 25 | #include "yoloPlugins.h" 26 | #include "yolo.h" 27 | 28 | #include 29 | 30 | #define USE_CUDA_ENGINE_GET_API 1 31 | 32 | static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContextInitParams* initParams) 33 | { 34 | std::string yoloCfg = initParams->customNetworkConfigFilePath; 35 | std::string yoloType; 36 | 37 | std::transform (yoloCfg.begin(), yoloCfg.end(), yoloCfg.begin(), [] (uint8_t c) { 38 | return std::tolower (c);}); 39 | 40 | if (yoloCfg.find("yolov2") != std::string::npos) { 41 | if (yoloCfg.find("yolov2-tiny") != std::string::npos) 42 | yoloType = "yolov2-tiny"; 43 | else 44 | yoloType = "yolov2"; 45 | } else if (yoloCfg.find("yolov3") != std::string::npos) { 46 | if (yoloCfg.find("yolov3-tiny") != std::string::npos) 47 | yoloType = "yolov3-tiny"; 48 | else 49 | yoloType = "yolov3"; 50 | } else { 51 | std::cerr << "Yolo type is not defined from config file name:" 52 | << yoloCfg << std::endl; 53 | return false; 54 | } 55 | 56 | networkInfo.networkType = yoloType; 57 | networkInfo.configFilePath = initParams->customNetworkConfigFilePath; 58 | networkInfo.wtsFilePath = initParams->modelFilePath; 59 | networkInfo.deviceType = (initParams->useDLA ? "kDLA" : "kGPU"); 60 | networkInfo.inputBlobName = "data"; 61 | 62 | if (networkInfo.configFilePath.empty() || 63 | networkInfo.wtsFilePath.empty()) { 64 | std::cerr << "Yolo config file or weights file is NOT specified." 65 | << std::endl; 66 | return false; 67 | } 68 | 69 | if (!fileExists(networkInfo.configFilePath) || 70 | !fileExists(networkInfo.wtsFilePath)) { 71 | std::cerr << "Yolo config file or weights file is NOT exist." 
72 | << std::endl; 73 | return false; 74 | } 75 | 76 | return true; 77 | } 78 | 79 | #if !USE_CUDA_ENGINE_GET_API 80 | IModelParser* NvDsInferCreateModelParser( 81 | const NvDsInferContextInitParams* initParams) { 82 | NetworkInfo networkInfo; 83 | if (!getYoloNetworkInfo(networkInfo, initParams)) { 84 | return nullptr; 85 | } 86 | 87 | return new Yolo(networkInfo); 88 | } 89 | #else 90 | extern "C" 91 | bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder, 92 | const NvDsInferContextInitParams * const initParams, 93 | nvinfer1::DataType dataType, 94 | nvinfer1::ICudaEngine *& cudaEngine); 95 | 96 | extern "C" 97 | bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder, 98 | const NvDsInferContextInitParams * const initParams, 99 | nvinfer1::DataType dataType, 100 | nvinfer1::ICudaEngine *& cudaEngine) 101 | { 102 | NetworkInfo networkInfo; 103 | if (!getYoloNetworkInfo(networkInfo, initParams)) { 104 | return false; 105 | } 106 | 107 | Yolo yolo(networkInfo); 108 | cudaEngine = yolo.createEngine (builder); 109 | if (cudaEngine == nullptr) 110 | { 111 | std::cerr << "Failed to build cuda engine on " 112 | << networkInfo.configFilePath << std::endl; 113 | return false; 114 | } 115 | 116 | return true; 117 | } 118 | #endif 119 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/trt_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | 24 | #ifndef __TRT_UTILS_H__ 25 | #define __TRT_UTILS_H__ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "NvInfer.h" 36 | 37 | #define UNUSED(expr) (void)(expr) 38 | #define DIVUP(n, d) ((n) + (d)-1) / (d) 39 | 40 | std::string trim(std::string s); 41 | float clamp(const float val, const float minVal, const float maxVal); 42 | bool fileExists(const std::string fileName, bool verbose = true); 43 | std::vector loadWeights(const std::string weightsFilePath, const std::string& networkType); 44 | std::string dimsToString(const nvinfer1::Dims d); 45 | void displayDimType(const nvinfer1::Dims d); 46 | int getNumChannels(nvinfer1::ITensor* t); 47 | uint64_t get3DTensorVolume(nvinfer1::Dims inputDims); 48 | 49 | // Helper functions to create yolo engine 50 | nvinfer1::ILayer* netAddMaxpool(int layerIdx, std::map& block, 51 | nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network); 52 | nvinfer1::ILayer* netAddConvLinear(int layerIdx, std::map& block, 53 | std::vector& weights, 54 | std::vector& trtWeights, int& weightPtr, 55 | int& inputChannels, nvinfer1::ITensor* input, 56 | nvinfer1::INetworkDefinition* network); 57 | nvinfer1::ILayer* netAddConvBNLeaky(int layerIdx, std::map& block, 58 | std::vector& weights, 59 | std::vector& trtWeights, int& weightPtr, 60 | int& inputChannels, nvinfer1::ITensor* input, 61 | nvinfer1::INetworkDefinition* network); 62 | nvinfer1::ILayer* netAddUpsample(int layerIdx, std::map& block, 63 | std::vector& weights, 64 | std::vector& trtWeights, int& inputChannels, 65 | nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network); 66 | void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, 67 | std::string layerOutput, std::string weightPtr); 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/yolo.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef _YOLO_H_ 24 | #define _YOLO_H_ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "NvInfer.h" 32 | #include "trt_utils.h" 33 | 34 | #include "nvdsinfer_custom_impl.h" 35 | 36 | /** 37 | * Holds all the file paths required to build a network. 38 | */ 39 | struct NetworkInfo 40 | { 41 | std::string networkType; 42 | std::string configFilePath; 43 | std::string wtsFilePath; 44 | std::string deviceType; 45 | std::string inputBlobName; 46 | }; 47 | 48 | /** 49 | * Holds information about an output tensor of the yolo network. 50 | */ 51 | struct TensorInfo 52 | { 53 | std::string blobName; 54 | uint stride{0}; 55 | uint gridSize{0}; 56 | uint numClasses{0}; 57 | uint numBBoxes{0}; 58 | uint64_t volume{0}; 59 | std::vector masks; 60 | std::vector anchors; 61 | int bindingIndex{-1}; 62 | float* hostBuffer{nullptr}; 63 | }; 64 | 65 | class Yolo : public IModelParser { 66 | public: 67 | Yolo(const NetworkInfo& networkInfo); 68 | ~Yolo() override; 69 | bool hasFullDimsSupported() const override { return false; } 70 | const char* getModelName() const override { 71 | return m_ConfigFilePath.empty() ? m_NetworkType.c_str() 72 | : m_ConfigFilePath.c_str(); 73 | } 74 | NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override; 75 | 76 | nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder); 77 | 78 | protected: 79 | const std::string m_NetworkType; 80 | const std::string m_ConfigFilePath; 81 | const std::string m_WtsFilePath; 82 | const std::string m_DeviceType; 83 | const std::string m_InputBlobName; 84 | std::vector m_OutputTensors; 85 | std::vector> m_ConfigBlocks; 86 | uint m_InputH; 87 | uint m_InputW; 88 | uint m_InputC; 89 | uint64_t m_InputSize; 90 | 91 | // TRT specific members 92 | std::vector m_TrtWeights; 93 | 94 | private: 95 | NvDsInferStatus buildYoloNetwork( 96 | std::vector& weights, nvinfer1::INetworkDefinition& network); 97 | std::vector> parseConfigFile( 98 | const std::string cfgFilePath); 99 | void parseConfigBlocks(); 100 | void destroyNetworkUtils(); 101 | }; 102 | 103 | #endif // _YOLO_H_ 104 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #include "yoloPlugins.h" 24 | #include "NvInferPlugin.h" 25 | #include 26 | #include 27 | #include 28 | 29 | namespace { 30 | template 31 | void write(char*& buffer, const T& val) 32 | { 33 | *reinterpret_cast(buffer) = val; 34 | buffer += sizeof(T); 35 | } 36 | 37 | template 38 | void read(const char*& buffer, T& val) 39 | { 40 | val = *reinterpret_cast(buffer); 41 | buffer += sizeof(T); 42 | } 43 | } //namespace 44 | 45 | // Forward declaration of cuda kernels 46 | cudaError_t cudaYoloLayerV3 ( 47 | const void* input, void* output, const uint& batchSize, 48 | const uint& gridSize, const uint& numOutputClasses, 49 | const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream); 50 | 51 | YoloLayerV3::YoloLayerV3 (const void* data, size_t length) 52 | { 53 | const char *d = static_cast(data); 54 | read(d, m_NumBoxes); 55 | read(d, m_NumClasses); 56 | read(d, m_GridSize); 57 | read(d, m_OutputSize); 58 | }; 59 | 60 | YoloLayerV3::YoloLayerV3 ( 61 | const uint& numBoxes, const uint& numClasses, const uint& gridSize) : 62 | m_NumBoxes(numBoxes), 63 | m_NumClasses(numClasses), 64 | m_GridSize(gridSize) 65 | { 66 | assert(m_NumBoxes > 0); 67 | assert(m_NumClasses > 0); 68 | assert(m_GridSize > 0); 69 | m_OutputSize = m_GridSize * m_GridSize * (m_NumBoxes * (4 + 1 + m_NumClasses)); 70 | }; 71 | 72 | nvinfer1::Dims 73 | YoloLayerV3::getOutputDimensions( 74 | int index, const nvinfer1::Dims* inputs, int nbInputDims) 75 | { 76 | assert(index == 0); 77 | assert(nbInputDims == 1); 78 | return inputs[0]; 79 | } 80 | 81 | bool YoloLayerV3::supportsFormat ( 82 | nvinfer1::DataType type, nvinfer1::PluginFormat format) const { 83 | return (type == nvinfer1::DataType::kFLOAT && 84 | format == nvinfer1::PluginFormat::kNCHW); 85 | } 86 | 87 | void 88 | YoloLayerV3::configureWithFormat ( 89 | const nvinfer1::Dims* inputDims, int nbInputs, 90 | const nvinfer1::Dims* outputDims, int nbOutputs, 91 | nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) 92 | { 93 | assert(nbInputs == 1); 94 | assert (format == nvinfer1::PluginFormat::kNCHW); 95 | assert(inputDims != nullptr); 96 | } 97 | 98 | int YoloLayerV3::enqueue( 99 | int batchSize, const void* const* inputs, void** outputs, void* workspace, 100 | cudaStream_t stream) 101 | { 102 | CHECK(cudaYoloLayerV3( 103 | inputs[0], outputs[0], batchSize, m_GridSize, m_NumClasses, m_NumBoxes, 104 | m_OutputSize, stream)); 105 | return 0; 106 | } 107 | 108 | size_t YoloLayerV3::getSerializationSize() const 109 | { 110 | return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSize) + sizeof(m_OutputSize); 111 | } 112 | 113 | void YoloLayerV3::serialize(void* buffer) const 114 | { 115 | char *d = static_cast(buffer); 116 | write(d, m_NumBoxes); 117 | write(d, m_NumClasses); 118 | write(d, m_GridSize); 119 | write(d, m_OutputSize); 120 | } 121 | 122 | nvinfer1::IPluginV2* YoloLayerV3::clone() const 123 | { 124 | return new YoloLayerV3 (m_NumBoxes, m_NumClasses, m_GridSize); 125 | } 126 | 127 | REGISTER_TENSORRT_PLUGIN(YoloLayerV3PluginCreator); 128 | -------------------------------------------------------------------------------- /DeepStream/nvdsinfer_custom_impl_Yolo/yoloPlugins.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #ifndef __YOLO_PLUGINS__ 24 | #define __YOLO_PLUGINS__ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "NvInferPlugin.h" 33 | 34 | #define CHECK(status) \ 35 | { \ 36 | if (status != 0) \ 37 | { \ 38 | std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \ 39 | << " at line " << __LINE__ << std::endl; \ 40 | abort(); \ 41 | } \ 42 | } 43 | 44 | namespace 45 | { 46 | const char* YOLOV3LAYER_PLUGIN_VERSION {"1"}; 47 | const char* YOLOV3LAYER_PLUGIN_NAME {"YoloLayerV3_TRT"}; 48 | } // namespace 49 | 50 | class YoloLayerV3 : public nvinfer1::IPluginV2 51 | { 52 | public: 53 | YoloLayerV3 (const void* data, size_t length); 54 | YoloLayerV3 (const uint& numBoxes, const uint& numClasses, const uint& gridSize); 55 | const char* getPluginType () const override { return YOLOV3LAYER_PLUGIN_NAME; } 56 | const char* getPluginVersion () const override { return YOLOV3LAYER_PLUGIN_VERSION; } 57 | int getNbOutputs () const override { return 1; } 58 | 59 | nvinfer1::Dims getOutputDimensions ( 60 | int index, const nvinfer1::Dims* inputs, 61 | int nbInputDims) override; 62 | 63 | bool supportsFormat ( 64 | nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; 65 | 66 | void configureWithFormat ( 67 | const nvinfer1::Dims* inputDims, int nbInputs, 68 | const nvinfer1::Dims* outputDims, int nbOutputs, 69 | nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; 70 | 71 | int initialize () override { return 0; } 72 | void terminate () override {} 73 | size_t getWorkspaceSize (int maxBatchSize) const override { return 0; } 74 | int enqueue ( 75 | int batchSize, const void* const* inputs, void** outputs, 76 | void* workspace, cudaStream_t stream) override; 77 | size_t getSerializationSize() const override; 78 | void serialize (void* buffer) const override; 79 | void destroy () override { delete this; } 80 | nvinfer1::IPluginV2* clone() const override; 81 | 82 | void setPluginNamespace (const char* pluginNamespace)override { 83 | m_Namespace = pluginNamespace; 84 | } 85 | virtual const char* getPluginNamespace () const override { 86 | return m_Namespace.c_str(); 87 | } 88 | 89 | private: 90 | uint m_NumBoxes {0}; 91 | uint m_NumClasses {0}; 92 | uint m_GridSize {0}; 93 | uint64_t m_OutputSize {0}; 94 | std::string m_Namespace {""}; 95 | }; 96 | 97 | class 
YoloLayerV3PluginCreator : public nvinfer1::IPluginCreator 98 | { 99 | public: 100 | YoloLayerV3PluginCreator () {} 101 | ~YoloLayerV3PluginCreator () {} 102 | 103 | const char* getPluginName () const override { return YOLOV3LAYER_PLUGIN_NAME; } 104 | const char* getPluginVersion () const override { return YOLOV3LAYER_PLUGIN_VERSION; } 105 | 106 | const nvinfer1::PluginFieldCollection* getFieldNames() override { 107 | std::cerr<< "YoloLayerV3PluginCreator::getFieldNames is not implemented" << std::endl; 108 | return nullptr; 109 | } 110 | 111 | nvinfer1::IPluginV2* createPlugin ( 112 | const char* name, const nvinfer1::PluginFieldCollection* fc) override 113 | { 114 | std::cerr<< "YoloLayerV3PluginCreator::getFieldNames is not implemented.\n"; 115 | return nullptr; 116 | } 117 | 118 | nvinfer1::IPluginV2* deserializePlugin ( 119 | const char* name, const void* serialData, size_t serialLength) override 120 | { 121 | std::cout << "Deserialize yoloLayerV3 plugin: " << name << std::endl; 122 | return new YoloLayerV3(serialData, serialLength); 123 | } 124 | 125 | void setPluginNamespace(const char* libNamespace) override { 126 | m_Namespace = libNamespace; 127 | } 128 | const char* getPluginNamespace() const override { 129 | return m_Namespace.c_str(); 130 | } 131 | 132 | private: 133 | std::string m_Namespace {""}; 134 | }; 135 | 136 | #endif // __YOLO_PLUGINS__ 137 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch-YOLOv4 2 | 3 | ![](https://img.shields.io/static/v1?label=python&message=3.6|3.7&color=blue) 4 | ![](https://img.shields.io/static/v1?label=pytorch&message=1.4&color=) 5 | [![](https://img.shields.io/static/v1?label=license&message=Apache2&color=green)](./License.txt) 6 | 7 | A minimal PyTorch implementation of YOLOv4. 
8 | - Paper Yolo v4: https://arxiv.org/abs/2004.10934 9 | - Source code:https://github.com/AlexeyAB/darknet 10 | - More details: http://pjreddie.com/darknet/yolo/ 11 | 12 | 13 | - [x] Inference 14 | - [x] Train 15 | - [x] Mosaic 16 | 17 | ``` 18 | ├── README.md 19 | ├── dataset.py dataset 20 | ├── demo.py demo to run pytorch --> tool/darknet2pytorch 21 | ├── demo_darknet2onnx.py tool to convert into onnx --> tool/darknet2pytorch 22 | ├── demo_pytorch2onnx.py tool to convert into onnx 23 | ├── models.py model for pytorch 24 | ├── train.py train models.py 25 | ├── cfg.py cfg.py for train 26 | ├── cfg cfg --> darknet2pytorch 27 | ├── data 28 | ├── weight --> darknet2pytorch 29 | ├── tool 30 | │   ├── camera.py a demo camera 31 | │   ├── coco_annotation.py coco dataset generator 32 | │   ├── config.py 33 | │   ├── darknet2pytorch.py 34 | │   ├── region_loss.py 35 | │   ├── utils.py 36 | │   └── yolo_layer.py 37 | ``` 38 | 39 | ![image](https://user-gold-cdn.xitu.io/2020/4/26/171b5a6c8b3bd513?w=768&h=576&f=jpeg&s=78882) 40 | 41 | # 0. Weights Download 42 | 43 | ## 0.1 darknet 44 | - baidu(https://pan.baidu.com/s/1dAGEW8cm-dqK14TbhhVetA Extraction code:dm5b) 45 | - google(https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) 46 | 47 | ## 0.2 pytorch 48 | you can use darknet2pytorch to convert it yourself, or download my converted model. 49 | 50 | - baidu 51 | - yolov4.pth(https://pan.baidu.com/s/1ZroDvoGScDgtE1ja_QqJVw Extraction code:xrq9) 52 | - yolov4.conv.137.pth(https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code:kcel) 53 | - google 54 | - yolov4.pth(https://drive.google.com/open?id=1wv_LiFeCRYwtpkqREPeI13-gPELBDwuJ) 55 | - yolov4.conv.137.pth(https://drive.google.com/open?id=1fcbR0bWzYfIEdLJPzOsn4R5mlvR6IQyA) 56 | 57 | # 1. Train 58 | 59 | [use yolov4 to train your own data](Use_yolov4_to_train_your_own_data.md) 60 | 61 | 1. Download weight 62 | 2. Transform data 63 | 64 | For coco dataset,you can use tool/coco_annotation.py. 65 | ``` 66 | # train.txt 67 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ... 68 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ... 69 | ... 70 | ... 71 | ``` 72 | 3. Train 73 | 74 | you can set parameters in cfg.py. 75 | ``` 76 | python train.py -g [GPU_ID] -dir [Dataset direction] ... 77 | ``` 78 | 79 | # 2. Inference 80 | 81 | ## 2.1 Performance on MS COCO dataset (using pretrained DarknetWeights from ) 82 | 83 | **ONNX and TensorRT models are converted from Pytorch (TianXiaomo): Pytorch->ONNX->TensorRT.** 84 | See following sections for more details of conversions. 
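As a quick orientation, the conversion chain behind the TensorRT rows below can be sketched in two commands. This is a minimal sketch only: the ONNX and engine file names are illustrative placeholders, and sections 4 and 5 describe the authoritative options.

```sh
# PyTorch weights -> ONNX (example arguments from section 4: weights, image, batch, classes, H, W)
python demo_pytorch2onnx.py yolov4.pth dog.jpg 8 80 416 416

# ONNX -> TensorRT engine (flags as in section 5.1; the file names here are hypothetical)
trtexec --onnx=yolov4_8_3_416_416_static.onnx --explicitBatch \
        --saveEngine=yolov4_416.engine --workspace=2048 --fp16
```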
85 | 86 | - val2017 dataset (input size: 416x416) 87 | 88 | | Model type | AP | AP50 | AP75 | APS | APM | APL | 89 | | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: | 90 | | DarkNet (YOLOv4 paper)| 0.471 | 0.710 | 0.510 | 0.278 | 0.525 | 0.636 | 91 | | Pytorch (TianXiaomo)| 0.466 | 0.704 | 0.505 | 0.267 | 0.524 | 0.629 | 92 | | TensorRT FP32 + BatchedNMSPlugin | 0.472 | 0.708 | 0.511 | 0.273 | 0.530 | 0.637 | 93 | | TensorRT FP16 + BatchedNMSPlugin | 0.472 | 0.708 | 0.511 | 0.273 | 0.530 | 0.636 | 94 | 95 | - testdev2017 dataset (input size: 416x416) 96 | 97 | | Model type | AP | AP50 | AP75 | APS | APM | APL | 98 | | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: | 99 | | DarkNet (YOLOv4 paper)| 0.412 | 0.628 | 0.443 | 0.204 | 0.444 | 0.560 | 100 | | Pytorch (TianXiaomo)| 0.404 | 0.615 | 0.436 | 0.196 | 0.438 | 0.552 | 101 | | TensorRT FP32 + BatchedNMSPlugin | 0.412 | 0.625 | 0.445 | 0.200 | 0.446 | 0.564 | 102 | | TensorRT FP16 + BatchedNMSPlugin | 0.412 | 0.625 | 0.445 | 0.200 | 0.446 | 0.563 | 103 | 104 | 105 | ## 2.2 Image input size for inference 106 | 107 | Image input size is NOT restricted to `320 * 320`, `416 * 416`, `512 * 512` or `608 * 608`. 108 | You can adjust your input size for a different input ratio, for example: `320 * 608`. 109 | A larger input size can help detect smaller targets, but it may be slower and consume more GPU memory. 110 | 111 | ```py 112 | height = 320 + 96 * n, n in {0, 1, 2, 3, ...} 113 | width = 320 + 96 * m, m in {0, 1, 2, 3, ...} 114 | ``` 115 | 116 | ## 2.3 **Different inference options** 117 | 118 | - Load the pretrained darknet model and darknet weights to do the inference (the image size is already configured in the cfg file) 119 | 120 | ```sh 121 | python demo.py -cfgfile <cfgFile> -weightfile <weightFile> -imgfile <imgFile> 122 | ``` 123 | 124 | - Load pytorch weights (pth file) to do the inference 125 | 126 | ```sh 127 | python models.py 128 | ``` 129 | 130 | - Load a converted ONNX file to do inference (see sections 3 and 4) 131 | 132 | - Load a converted TensorRT engine file to do inference (see section 5) 133 | 134 | ## 2.4 Inference output 135 | 136 | There are 2 inference outputs. 137 | - One is the locations of the bounding boxes; its shape is `[batch, num_boxes, 1, 4]`, which represents x1, y1, x2, y2 of each bounding box. 138 | - The other is the scores of the bounding boxes, of shape `[batch, num_boxes, num_classes]`, indicating the scores of all classes for each bounding box. 139 | 140 | For now, a small amount of post-processing, including NMS, is still required. We are trying to minimize the time and complexity of post-processing. 141 | 142 | 143 | # 3.
143 | # 3. Darknet2ONNX
144 | 
145 | - **This script converts the official pretrained darknet model into ONNX**
146 | 
147 | - **Recommended Pytorch versions:**
148 | 
149 | - Pytorch 1.4.0 for TensorRT 7.0 and higher
150 | - Pytorch 1.5.0 and 1.6.0 for TensorRT 7.1.2 and higher
151 | 
152 | - **Install onnxruntime**
153 | 
154 | ```sh
155 | pip install onnxruntime
156 | ```
157 | 
158 | - **Run the python script to generate the ONNX model and run the demo**
159 | 
160 | ```sh
161 | python demo_darknet2onnx.py 
162 | ```
163 | 
164 | ## 3.1 Dynamic or static batch size
165 | 
166 | - **A positive batch size will generate an ONNX model with a static batch size; otherwise, the batch size will be dynamic**
167 | - A dynamic batch size will generate only one ONNX model
168 | - A static batch size will generate 2 ONNX models, one of which (batch_size=1) is for running the demo
169 | 
170 | # 4. Pytorch2ONNX
171 | 
172 | - **You can convert your trained pytorch model into ONNX using this script**
173 | 
174 | - **Recommended Pytorch versions:**
175 | 
176 | - Pytorch 1.4.0 for TensorRT 7.0 and higher
177 | - Pytorch 1.5.0 and 1.6.0 for TensorRT 7.1.2 and higher
178 | 
179 | - **Install onnxruntime**
180 | 
181 | ```sh
182 | pip install onnxruntime
183 | ```
184 | 
185 | - **Run the python script to generate the ONNX model and run the demo**
186 | 
187 | ```sh
188 | python demo_pytorch2onnx.py 
189 | ```
190 | 
191 | For example:
192 | 
193 | ```sh
194 | python demo_pytorch2onnx.py yolov4.pth dog.jpg 8 80 416 416
195 | ```
196 | 
197 | ## 4.1 Dynamic or static batch size
198 | 
199 | - **A positive batch size will generate an ONNX model with a static batch size; otherwise, the batch size will be dynamic**
200 | - A dynamic batch size will generate only one ONNX model
201 | - A static batch size will generate 2 ONNX models, one of which (batch_size=1) is for running the demo
202 | 
203 | 
204 | # 5. ONNX2TensorRT
205 | 
206 | - **Recommended TensorRT versions: 7.0, 7.1**
207 | 
208 | ## 5.1 Convert from ONNX with a static batch size
209 | 
210 | - **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**
211 | 
212 | ```sh
213 | trtexec --onnx= --explicitBatch --saveEngine= --workspace= --fp16
214 | ```
215 | - Note: If you want to use int8 mode in conversion, extra int8 calibration is needed.
216 | 
217 | ## 5.2 Convert from ONNX with a dynamic batch size
218 | 
219 | - **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**
220 | 
221 | ```sh
222 | trtexec --onnx= \
223 | --minShapes=input: --optShapes=input: --maxShapes=input: \
224 | --workspace= --saveEngine= --fp16
225 | ```
226 | - For example:
227 | 
228 | ```sh
229 | trtexec --onnx=yolov4_-1_3_320_512_dynamic.onnx \
230 | --minShapes=input:1x3x320x512 --optShapes=input:4x3x320x512 --maxShapes=input:8x3x320x512 \
231 | --workspace=2048 --saveEngine=yolov4_-1_3_320_512_dynamic.engine --fp16
232 | ```
233 | 
234 | ## 5.3 Run the demo
235 | 
236 | ```sh
237 | python demo_trt.py 
238 | ```
239 | 
240 | - This demo only works when the batch size is dynamic (1 must be within the dynamic range) or when batchSize=1, but you can adapt it slightly for other dynamic or static batch sizes.
241 | 
242 | - Note1: input_H and input_W should agree with the input size in the original ONNX file (a sketch for reading this directly from the ONNX file follows these notes).
243 | 
244 | - Note2: extra NMS operations are needed for the TensorRT output. This demo uses the Python NMS code from `tool/utils.py`.
245 | 
246 | 
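Note1 can be checked without guessing, because the input shape is stored in the ONNX file itself. A small sketch (the file name is a placeholder for your own export; the `onnx` package is already imported by the demo scripts):

```py
import onnx

# Placeholder name: demo_pytorch2onnx.py names static exports
# "yolov4_{batch}_3_{H}_{W}_static.onnx"; substitute your own file.
model = onnx.load("yolov4_1_3_416_416_static.onnx")

for inp in model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)   # e.g. input [1, 3, 416, 416] -> pass 416 416 to demo_trt.py
```

For a dynamic export the first dimension prints as a name (for example 'batch_size') rather than a number, which is the case where the --minShapes / --optShapes / --maxShapes flags above are needed.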
247 | # 6. ONNX2Tensorflow
248 | 
249 | - **First: convert the model to ONNX (see sections 3 and 4)**
250 | 
251 | tensorflow >= 2.0
252 | 
253 | 1: Thanks to https://github.com/onnx/onnx-tensorflow
254 | 
255 | 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
256 | Run pip install -e .
257 | 
258 | Note: errors occurred (at least for me) when using "pip install onnx-tf", so installing onnx-tensorflow from source is recommended.
259 | 
260 | # 7. ONNX2TensorRT and DeepStream Inference
261 | 
262 | 1. Compile the DeepStream Nvinfer Plugin
263 | 
264 | ```
265 | cd DeepStream
266 | make
267 | ```
268 | 2. Build a TRT Engine.
269 | 
270 | For a single batch,
271 | ```
272 | trtexec --onnx= --explicitBatch --saveEngine= --workspace= --fp16
273 | ```
274 | 
275 | For multi-batch,
276 | ```
277 | trtexec --onnx= --explicitBatch --shapes=input:Xx3xHxW --optShapes=input:Xx3xHxW --maxShapes=input:Xx3xHxW --minShapes=input:1x3xHxW --saveEngine= --fp16
278 | ```
279 | 
280 | Note: maxShapes cannot be larger than the model's original shape.
281 | 
282 | 3. Write the DeepStream config file for the TRT engine.
283 | 
284 | 
285 | 
286 | Reference:
287 | - https://github.com/eriklindernoren/PyTorch-YOLOv3
288 | - https://github.com/marvis/pytorch-caffe-darknet-convert
289 | - https://github.com/marvis/pytorch-yolo3
290 | 
291 | ```
292 | @article{yolov4,
293 |   title={YOLOv4: Optimal Speed and Accuracy of Object Detection},
294 |   author={Alexey Bochkovskiy and Chien-Yao Wang and Hong-Yuan Mark Liao},
295 |   journal = {arXiv},
296 |   year={2020}
297 | }
298 | ```
299 | 
-------------------------------------------------------------------------------- /Use_yolov4_to_train_your_own_data.md: --------------------------------------------------------------------------------
1 | The release of yolov4 attracted a lot of attention, but since darknet is written in C the code is not easy for everyone to read, so over a weekend I wrote a pytorch version (and rode the wave of interest a little). Although pytorch-YOLOv4 has been around for a while, for various reasons (mainly laziness) it had not been fully validated. Many people raised questions that helped fix bugs, and others contributed new features; thank you all for the help. The most frequent request has been how to train on your own data, and since yesterday was the weekend I finally got to this long-delayed task. I did not want to use a lot of data, so I made a simple dataset myself.
2 | 
3 | 
4 | # 1. Code Preparation
5 | 
6 | Clone the code from GitHub:
7 | ```
8 | git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git
9 | ```
10 | # 2. Data Preparation
11 | 
12 | Prepare a train.txt that contains the image name and the boxes in the following format (a small script for writing this file is sketched at the end of this section):
13 | 
14 | ```
15 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
16 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
17 | ...
18 | ```
19 | - image_path : Image Name
20 | - x1,y1 : Coordinates of the upper left corner
21 | - x2,y2 : Coordinates of the lower right corner
22 | - id : Object Class
23 | 
24 | My own data is a small dataset I produced myself for detecting several kinds of coins (1 yuan, 5 jiao and 1 jiao, so three classes). Why not build a dataset from something else? Coins were simply what I had on hand, and they are relatively simple compared with other objects.
25 | 
26 | ![UTOOLS1590383513325.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a3e953909b1b?w=1649&h=791&f=png&s=1290382)
27 | 
28 | Only a handful of images were prepared in total.
29 | 
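As a small illustration of this format (not a script from the repo; the image names and boxes below are made up), train.txt can be generated like this:

```py
# Illustrative only: hypothetical image names and boxes, not files from the repo.
annotations = {
    "coin_001.jpg": [(120, 80, 260, 220, 0),   # x1, y1, x2, y2, class id (0 = 1yuan)
                     (300, 150, 410, 260, 1)], # 1 = 5jiao
    "coin_002.jpg": [(90, 60, 200, 170, 2)],   # 2 = 1jiao
}

with open("train.txt", "w") as f:
    for image_path, boxes in annotations.items():
        # One line per image: image_path x1,y1,x2,y2,id x1,y1,x2,y2,id ...
        box_strs = [",".join(str(v) for v in box) for box in boxes]
        f.write(image_path + " " + " ".join(box_strs) + "\n")
```

For the COCO dataset the same format is produced by tool/coco_annotation.py, as mentioned in the main README.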
30 | # 3. Parameter Setting
31 | 
32 | When I started training I used the original parameters directly, with the batch size set to 64. After running a few epochs it was clearly not right, since my data amounts to only a bit more than 20 images. I changed the update strategy from stepping once per epoch to stepping by total iterations, and the loss then looked like it could train, so I went to sleep and left it to check the next day (who knows exactly what I changed).
33 | 
34 | Today I opened my computer and saw that the loss had converged to around 2e+4, which was clearly wrong again, so I killed the run. I then simply set the batch size to 4, and training proceeded normally.
35 | 
36 | ```
37 | Cfg.batch = 4
38 | Cfg.subdivisions = 1
39 | ```
40 | 
41 | # 4. Start training
42 | 
43 | ```
44 | python train.py -l 0.001 -g 4 -pretrained ./yolov4.conv.137.pth -classes 3 -dir /home/OCR/coins
45 | 
46 | -l learning rate
47 | -g gpu id
48 | -pretrained pre-trained backbone weights, converted from AlexeyAB's darknet yolov4.conv.137
49 | -classes number of classes
50 | -dir training image directory
51 | ```
52 | 
53 | 
54 | Watch the loss curve with TensorBoard:
55 | ```
56 | tensorboard --logdir log --host 192.168.212.75 --port 6008
57 | ```
58 | ![UTOOLS1590386319240.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a696148d13f3?w=1357&h=795&f=png&s=151465)
59 | 
60 | # 5. Inference
61 | 
62 | ```
63 | python models.py 3 weight/Yolov4_epoch166_coins.pth data/coin2.jpg data/coins.names
64 | 
65 | python models.py num_classes weightfile imagepath namefile
66 | ```
67 | coins.names:
68 | ```
69 | 1yuan
70 | 5jiao
71 | 1jiao
72 | 
73 | ```
74 | 
75 | ![UTOOLS1590386705468.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a6f46e826bb8?w=774&h=1377&f=png&s=1191048)
76 | 
77 | The results were poor (the training data covered only the 3 coin classes).
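For reference, the same inference step can also be run from Python instead of the command line. The sketch below is pieced together from how the repo's own demo scripts call things (Yolov4 and load_state_dict as in demo_pytorch2onnx.py, preprocessing and post_processing as in demo_darknet2onnx.py and demo_trt.py). It assumes that a Yolov4 model built with inference=True returns the (boxes, confs) pair described in the main README, so treat it as a sketch rather than a drop-in script:

```py
import cv2
import numpy as np
import torch

from models import Yolov4
from tool.utils import load_class_names, plot_boxes_cv2, post_processing

# Paths follow the example above; adjust them to your own files.
model = Yolov4(n_classes=3, inference=True)
model.load_state_dict(torch.load('weight/Yolov4_epoch166_coins.pth', map_location='cpu'))
model.eval()

img = cv2.imread('data/coin2.jpg')
resized = cv2.resize(img, (416, 416), interpolation=cv2.INTER_LINEAR)
img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
img_in = np.expand_dims(img_in, axis=0)
img_in /= 255.0

with torch.no_grad():
    # Assumption: in inference mode the forward pass returns the (boxes, confs)
    # pair that the ONNX export names 'boxes' and 'confs'.
    boxes_t, confs_t = model(torch.from_numpy(img_in))

# Reuse the repo's post-processing (confidence 0.4, NMS 0.6, as in the demos).
output = [boxes_t.cpu().numpy(), confs_t.cpu().numpy()]
boxes = post_processing(img_in, 0.4, 0.6, output)

class_names = load_class_names('data/coins.names')
plot_boxes_cv2(img, boxes[0], savename='predictions_coins.jpg', class_names=class_names)
```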
78 | 79 | # Attachment 80 | 81 | - coins dataset (link: https://pan.baidu.com/s/1y701NRKSdpj6UKDIH-GpqA) 82 | (Extraction code: j09s) 83 | - yolov4.conv.137.pth (Link: https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code: kcel) 84 | -------------------------------------------------------------------------------- /cfg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/06 21:05 4 | @Author : Tianxiaomo 5 | @File : Cfg.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | import os 14 | from easydict import EasyDict 15 | 16 | 17 | _BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 18 | 19 | Cfg = EasyDict() 20 | 21 | Cfg.use_darknet_cfg = True 22 | Cfg.cfgfile = os.path.join(_BASE_DIR, 'cfg', 'yolov4.cfg') 23 | 24 | Cfg.batch = 64 25 | Cfg.subdivisions = 16 26 | Cfg.width = 608 27 | Cfg.height = 608 28 | Cfg.channels = 3 29 | Cfg.momentum = 0.949 30 | Cfg.decay = 0.0005 31 | Cfg.angle = 0 32 | Cfg.saturation = 1.5 33 | Cfg.exposure = 1.5 34 | Cfg.hue = .1 35 | 36 | Cfg.learning_rate = 0.00261 37 | Cfg.burn_in = 1000 38 | Cfg.max_batches = 500500 39 | Cfg.steps = [400000, 450000] 40 | Cfg.policy = Cfg.steps 41 | Cfg.scales = .1, .1 42 | 43 | Cfg.cutmix = 0 44 | Cfg.mosaic = 1 45 | 46 | Cfg.letter_box = 0 47 | Cfg.jitter = 0.2 48 | Cfg.classes = 80 49 | Cfg.track = 0 50 | Cfg.w = Cfg.width 51 | Cfg.h = Cfg.height 52 | Cfg.flip = 1 53 | Cfg.blur = 0 54 | Cfg.gaussian = 0 55 | Cfg.boxes = 60 # box num 56 | Cfg.TRAIN_EPOCHS = 300 57 | Cfg.train_label = os.path.join(_BASE_DIR, 'data', 'train.txt') 58 | Cfg.val_label = os.path.join(_BASE_DIR, 'data' ,'val.txt') 59 | Cfg.TRAIN_OPTIMIZER = 'adam' 60 | ''' 61 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ... 62 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ... 63 | ... 
64 | ''' 65 | 66 | if Cfg.mosaic and Cfg.cutmix: 67 | Cfg.mixup = 4 68 | elif Cfg.cutmix: 69 | Cfg.mixup = 2 70 | elif Cfg.mosaic: 71 | Cfg.mixup = 3 72 | 73 | Cfg.checkpoints = os.path.join(_BASE_DIR, 'checkpoints') 74 | Cfg.TRAIN_TENSORBOARD_DIR = os.path.join(_BASE_DIR, 'log') 75 | 76 | Cfg.iou_type = 'iou' # 'giou', 'diou', 'ciou' 77 | 78 | Cfg.keep_checkpoint_max = 10 79 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | # 0 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | # 1 35 | [maxpool] 36 | size=2 37 | stride=2 38 | 39 | # 2 40 | [convolutional] 41 | batch_normalize=1 42 | filters=32 43 | size=3 44 | stride=1 45 | pad=1 46 | activation=leaky 47 | 48 | # 3 49 | [maxpool] 50 | size=2 51 | stride=2 52 | 53 | # 4 54 | [convolutional] 55 | batch_normalize=1 56 | filters=64 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | # 5 63 | [maxpool] 64 | size=2 65 | stride=2 66 | 67 | # 6 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | # 7 77 | [maxpool] 78 | size=2 79 | stride=2 80 | 81 | # 8 82 | [convolutional] 83 | batch_normalize=1 84 | filters=256 85 | size=3 86 | stride=1 87 | pad=1 88 | activation=leaky 89 | 90 | # 9 91 | [maxpool] 92 | size=2 93 | stride=2 94 | 95 | # 10 96 | [convolutional] 97 | batch_normalize=1 98 | filters=512 99 | size=3 100 | stride=1 101 | pad=1 102 | activation=leaky 103 | 104 | # 11 105 | [maxpool] 106 | size=2 107 | stride=1 108 | 109 | # 12 110 | [convolutional] 111 | batch_normalize=1 112 | filters=1024 113 | size=3 114 | stride=1 115 | pad=1 116 | activation=leaky 117 | 118 | ########### 119 | 120 | # 13 121 | [convolutional] 122 | batch_normalize=1 123 | filters=256 124 | size=1 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | # 14 130 | [convolutional] 131 | batch_normalize=1 132 | filters=512 133 | size=3 134 | stride=1 135 | pad=1 136 | activation=leaky 137 | 138 | # 15 139 | [convolutional] 140 | size=1 141 | stride=1 142 | pad=1 143 | filters=255 144 | activation=linear 145 | 146 | 147 | 148 | # 16 149 | [yolo] 150 | mask = 3,4,5 151 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 152 | classes=80 153 | num=6 154 | jitter=.3 155 | ignore_thresh = .7 156 | truth_thresh = 1 157 | random=1 158 | 159 | # 17 160 | [route] 161 | layers = -4 162 | 163 | # 18 164 | [convolutional] 165 | batch_normalize=1 166 | filters=128 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | # 19 173 | [upsample] 174 | stride=2 175 | 176 | # 20 177 | [route] 178 | layers = -1, 8 179 | 180 | # 21 181 | [convolutional] 182 | batch_normalize=1 183 | filters=256 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | # 22 190 | [convolutional] 191 | size=1 192 | stride=1 193 | pad=1 194 | filters=255 195 | activation=linear 196 | 197 | # 23 198 | [yolo] 199 | mask = 1,2,3 200 | anchors = 10,14, 23,27, 37,58, 81,82, 
135,169, 344,319 201 | classes=80 202 | num=6 203 | jitter=.3 204 | ignore_thresh = .7 205 | truth_thresh = 1 206 | random=1 207 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | 
batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | 
activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | 
stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /cfg/yolov4-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.00261 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=2 30 | pad=1 31 | activation=leaky 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=64 36 | size=3 37 | stride=2 38 | pad=1 39 | activation=leaky 40 | 41 | [convolutional] 42 | batch_normalize=1 43 | filters=64 44 | size=3 45 | stride=1 46 | pad=1 47 | activation=leaky 48 | 49 | [route] 50 | layers=-1 51 | groups=2 52 | group_id=1 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | 
filters=32 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=32 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [route] 71 | layers = -1,-2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [route] 82 | layers = -6,-1 83 | 84 | [maxpool] 85 | size=2 86 | stride=2 87 | 88 | [convolutional] 89 | batch_normalize=1 90 | filters=128 91 | size=3 92 | stride=1 93 | pad=1 94 | activation=leaky 95 | 96 | [route] 97 | layers=-1 98 | groups=2 99 | group_id=1 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=64 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=64 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [route] 118 | layers = -1,-2 119 | 120 | [convolutional] 121 | batch_normalize=1 122 | filters=128 123 | size=1 124 | stride=1 125 | pad=1 126 | activation=leaky 127 | 128 | [route] 129 | layers = -6,-1 130 | 131 | [maxpool] 132 | size=2 133 | stride=2 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=256 138 | size=3 139 | stride=1 140 | pad=1 141 | activation=leaky 142 | 143 | [route] 144 | layers=-1 145 | groups=2 146 | group_id=1 147 | 148 | [convolutional] 149 | batch_normalize=1 150 | filters=128 151 | size=3 152 | stride=1 153 | pad=1 154 | activation=leaky 155 | 156 | [convolutional] 157 | batch_normalize=1 158 | filters=128 159 | size=3 160 | stride=1 161 | pad=1 162 | activation=leaky 163 | 164 | [route] 165 | layers = -1,-2 166 | 167 | [convolutional] 168 | batch_normalize=1 169 | filters=256 170 | size=1 171 | stride=1 172 | pad=1 173 | activation=leaky 174 | 175 | [route] 176 | layers = -6,-1 177 | 178 | [maxpool] 179 | size=2 180 | stride=2 181 | 182 | [convolutional] 183 | batch_normalize=1 184 | filters=512 185 | size=3 186 | stride=1 187 | pad=1 188 | activation=leaky 189 | 190 | ################################## 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=1 196 | stride=1 197 | pad=1 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | filters=512 203 | size=3 204 | stride=1 205 | pad=1 206 | activation=leaky 207 | 208 | [convolutional] 209 | size=1 210 | stride=1 211 | pad=1 212 | filters=255 213 | activation=linear 214 | 215 | 216 | 217 | [yolo] 218 | mask = 3,4,5 219 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 220 | classes=80 221 | num=6 222 | jitter=.3 223 | scale_x_y = 1.05 224 | cls_normalizer=1.0 225 | iou_normalizer=0.07 226 | iou_loss=ciou 227 | ignore_thresh = .7 228 | truth_thresh = 1 229 | random=0 230 | resize=1.5 231 | nms_kind=greedynms 232 | beta_nms=0.6 233 | 234 | [route] 235 | layers = -4 236 | 237 | [convolutional] 238 | batch_normalize=1 239 | filters=128 240 | size=1 241 | stride=1 242 | pad=1 243 | activation=leaky 244 | 245 | [upsample] 246 | stride=2 247 | 248 | [route] 249 | layers = -1, 23 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=256 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=leaky 258 | 259 | [convolutional] 260 | size=1 261 | stride=1 262 | pad=1 263 | filters=255 264 | activation=linear 265 | 266 | [yolo] 267 | mask = 1,2,3 268 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 269 | classes=80 270 | num=6 271 | jitter=.3 272 | scale_x_y = 1.05 273 | cls_normalizer=1.0 274 | iou_normalizer=0.07 275 | iou_loss=ciou 276 
| ignore_thresh = .7 277 | truth_thresh = 1 278 | random=0 279 | resize=1.5 280 | nms_kind=greedynms 281 | beta_nms=0.6 282 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tianxiaomo/pytorch-YOLOv4/a65d219f9066bae4e12003bd7cdc04531860c672/data/dog.jpg -------------------------------------------------------------------------------- /data/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tianxiaomo/pytorch-YOLOv4/a65d219f9066bae4e12003bd7cdc04531860c672/data/giraffe.jpg -------------------------------------------------------------------------------- /data/prediction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tianxiaomo/pytorch-YOLOv4/a65d219f9066bae4e12003bd7cdc04531860c672/data/prediction.jpg -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 20/04/25 15:49 4 | @Author : huguanghao 5 | @File : demo.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | ''' 12 | 13 | # import sys 14 | # import time 15 | # from PIL import Image, ImageDraw 16 | # from models.tiny_yolo import TinyYoloNet 17 | from tool.utils import * 18 | from tool.torch_utils import * 19 | from tool.darknet2pytorch import Darknet 20 | import torch 21 | import argparse 22 | 23 | """hyper parameters""" 24 | use_cuda = True 25 | 26 | def detect_cv2(cfgfile, weightfile, imgfile): 27 | import cv2 28 | m = Darknet(cfgfile) 29 | 30 | m.print_network() 31 | m.load_weights(weightfile) 32 | print('Loading weights from %s... Done!' 
% (weightfile)) 33 | 34 | if use_cuda: 35 | m.cuda() 36 | 37 | num_classes = m.num_classes 38 | if num_classes == 20: 39 | namesfile = 'data/voc.names' 40 | elif num_classes == 80: 41 | namesfile = 'data/coco.names' 42 | else: 43 | namesfile = 'data/x.names' 44 | class_names = load_class_names(namesfile) 45 | 46 | img = cv2.imread(imgfile) 47 | sized = cv2.resize(img, (m.width, m.height)) 48 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 49 | 50 | for i in range(2): 51 | start = time.time() 52 | boxes = do_detect(m, sized, 0.4, 0.6, use_cuda) 53 | finish = time.time() 54 | if i == 1: 55 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 56 | 57 | plot_boxes_cv2(img, boxes[0], savename='predictions.jpg', class_names=class_names) 58 | 59 | 60 | def detect_cv2_camera(cfgfile, weightfile): 61 | import cv2 62 | m = Darknet(cfgfile) 63 | 64 | m.print_network() 65 | if args.torch: 66 | m.load_state_dict(torch.load(weightfile)) 67 | else: 68 | m.load_weights(weightfile) 69 | print('Loading weights from %s... Done!' % (weightfile)) 70 | 71 | if use_cuda: 72 | m.cuda() 73 | 74 | cap = cv2.VideoCapture(0) 75 | # cap = cv2.VideoCapture("./test.mp4") 76 | cap.set(3, 1280) 77 | cap.set(4, 720) 78 | print("Starting the YOLO loop...") 79 | 80 | num_classes = m.num_classes 81 | if num_classes == 20: 82 | namesfile = 'data/voc.names' 83 | elif num_classes == 80: 84 | namesfile = 'data/coco.names' 85 | else: 86 | namesfile = 'data/x.names' 87 | class_names = load_class_names(namesfile) 88 | 89 | while True: 90 | ret, img = cap.read() 91 | sized = cv2.resize(img, (m.width, m.height)) 92 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 93 | 94 | start = time.time() 95 | boxes = do_detect(m, sized, 0.4, 0.6, use_cuda) 96 | finish = time.time() 97 | print('Predicted in %f seconds.' % (finish - start)) 98 | 99 | result_img = plot_boxes_cv2(img, boxes[0], savename=None, class_names=class_names) 100 | 101 | cv2.imshow('Yolo demo', result_img) 102 | cv2.waitKey(1) 103 | 104 | cap.release() 105 | 106 | 107 | def detect_skimage(cfgfile, weightfile, imgfile): 108 | from skimage import io 109 | from skimage.transform import resize 110 | m = Darknet(cfgfile) 111 | 112 | m.print_network() 113 | m.load_weights(weightfile) 114 | print('Loading weights from %s... Done!' % (weightfile)) 115 | 116 | if use_cuda: 117 | m.cuda() 118 | 119 | num_classes = m.num_classes 120 | if num_classes == 20: 121 | namesfile = 'data/voc.names' 122 | elif num_classes == 80: 123 | namesfile = 'data/coco.names' 124 | else: 125 | namesfile = 'data/x.names' 126 | class_names = load_class_names(namesfile) 127 | 128 | img = io.imread(imgfile) 129 | sized = resize(img, (m.width, m.height)) * 255 130 | 131 | for i in range(2): 132 | start = time.time() 133 | boxes = do_detect(m, sized, 0.4, 0.4, use_cuda) 134 | finish = time.time() 135 | if i == 1: 136 | print('%s: Predicted in %f seconds.' 
% (imgfile, (finish - start))) 137 | 138 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 139 | 140 | 141 | def get_args(): 142 | parser = argparse.ArgumentParser('Test your image or video by trained model.') 143 | parser.add_argument('-cfgfile', type=str, default='./cfg/yolov4.cfg', 144 | help='path of cfg file', dest='cfgfile') 145 | parser.add_argument('-weightfile', type=str, 146 | default='./checkpoints/Yolov4_epoch1.pth', 147 | help='path of trained model.', dest='weightfile') 148 | parser.add_argument('-imgfile', type=str, 149 | default='./data/mscoco2017/train2017/190109_180343_00154162.jpg', 150 | help='path of your image file.', dest='imgfile') 151 | parser.add_argument('-torch', type=bool, default=false, 152 | help='use torch weights') 153 | args = parser.parse_args() 154 | 155 | return args 156 | 157 | 158 | if __name__ == '__main__': 159 | args = get_args() 160 | if args.imgfile: 161 | detect_cv2(args.cfgfile, args.weightfile, args.imgfile) 162 | # detect_imges(args.cfgfile, args.weightfile) 163 | # detect_cv2(args.cfgfile, args.weightfile, args.imgfile) 164 | # detect_skimage(args.cfgfile, args.weightfile, args.imgfile) 165 | else: 166 | detect_cv2_camera(args.cfgfile, args.weightfile) 167 | -------------------------------------------------------------------------------- /demo_darknet2onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | import os 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | import onnxruntime 8 | 9 | from tool.utils import * 10 | from tool.darknet2onnx import * 11 | 12 | 13 | def main(cfg_file, namesfile, weight_file, image_path, batch_size): 14 | 15 | if batch_size <= 0: 16 | onnx_path_demo = transform_to_onnx(cfg_file, weight_file, batch_size) 17 | else: 18 | # Transform to onnx as specified batch size 19 | transform_to_onnx(cfg_file, weight_file, batch_size) 20 | # Transform to onnx as demo 21 | onnx_path_demo = transform_to_onnx(cfg_file, weight_file, 1) 22 | 23 | session = onnxruntime.InferenceSession(onnx_path_demo) 24 | # session = onnx.load(onnx_path) 25 | print("The model expects input shape: ", session.get_inputs()[0].shape) 26 | 27 | image_src = cv2.imread(image_path) 28 | detect(session, image_src, namesfile) 29 | 30 | 31 | 32 | def detect(session, image_src, namesfile): 33 | IN_IMAGE_H = session.get_inputs()[0].shape[2] 34 | IN_IMAGE_W = session.get_inputs()[0].shape[3] 35 | 36 | # Input 37 | resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR) 38 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 39 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 40 | img_in = np.expand_dims(img_in, axis=0) 41 | img_in /= 255.0 42 | print("Shape of the network input: ", img_in.shape) 43 | 44 | # Compute 45 | input_name = session.get_inputs()[0].name 46 | 47 | outputs = session.run(None, {input_name: img_in}) 48 | 49 | boxes = post_processing(img_in, 0.4, 0.6, outputs) 50 | 51 | class_names = load_class_names(namesfile) 52 | plot_boxes_cv2(image_src, boxes[0], savename='predictions_onnx.jpg', class_names=class_names) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | print("Converting to onnx and running demo ...") 58 | if len(sys.argv) == 6: 59 | cfg_file = sys.argv[1] 60 | namesfile = sys.argv[2] 61 | weight_file = sys.argv[3] 62 | image_path = sys.argv[4] 63 | batch_size = int(sys.argv[5]) 64 | main(cfg_file, namesfile, weight_file, image_path, batch_size) 65 | else: 66 | print('Please 
run this way:\n') 67 | print(' python demo_onnx.py ') 68 | -------------------------------------------------------------------------------- /demo_pytorch2onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | import os 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | import onnxruntime 8 | import torch 9 | 10 | from tool.utils import * 11 | from models import Yolov4 12 | from demo_darknet2onnx import detect 13 | 14 | 15 | def transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W): 16 | 17 | model = Yolov4(n_classes=n_classes, inference=True) 18 | 19 | pretrained_dict = torch.load(weight_file, map_location=torch.device('cuda')) 20 | model.load_state_dict(pretrained_dict) 21 | 22 | input_names = ["input"] 23 | output_names = ['boxes', 'confs'] 24 | 25 | dynamic = False 26 | if batch_size <= 0: 27 | dynamic = True 28 | 29 | if dynamic: 30 | x = torch.randn((1, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True) 31 | onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(IN_IMAGE_H, IN_IMAGE_W) 32 | dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}} 33 | # Export the model 34 | print('Export the onnx model ...') 35 | torch.onnx.export(model, 36 | x, 37 | onnx_file_name, 38 | export_params=True, 39 | opset_version=11, 40 | do_constant_folding=True, 41 | input_names=input_names, output_names=output_names, 42 | dynamic_axes=dynamic_axes) 43 | 44 | print('Onnx model exporting done') 45 | return onnx_file_name 46 | 47 | else: 48 | x = torch.randn((batch_size, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True) 49 | onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, IN_IMAGE_H, IN_IMAGE_W) 50 | # Export the model 51 | print('Export the onnx model ...') 52 | torch.onnx.export(model, 53 | x, 54 | onnx_file_name, 55 | export_params=True, 56 | opset_version=11, 57 | do_constant_folding=True, 58 | input_names=input_names, output_names=output_names, 59 | dynamic_axes=None) 60 | 61 | print('Onnx model exporting done') 62 | return onnx_file_name 63 | 64 | 65 | 66 | def main(weight_file, image_path, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W): 67 | 68 | if batch_size <= 0: 69 | onnx_path_demo = transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W) 70 | else: 71 | # Transform to onnx as specified batch size 72 | transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W) 73 | # Transform to onnx for demo 74 | onnx_path_demo = transform_to_onnx(weight_file, 1, n_classes, IN_IMAGE_H, IN_IMAGE_W) 75 | 76 | session = onnxruntime.InferenceSession(onnx_path_demo) 77 | # session = onnx.load(onnx_path) 78 | print("The model expects input shape: ", session.get_inputs()[0].shape) 79 | 80 | image_src = cv2.imread(image_path) 81 | detect(session, image_src) 82 | 83 | 84 | 85 | if __name__ == '__main__': 86 | print("Converting to onnx and running demo ...") 87 | if len(sys.argv) == 7: 88 | 89 | weight_file = sys.argv[1] 90 | image_path = sys.argv[2] 91 | batch_size = int(sys.argv[3]) 92 | n_classes = int(sys.argv[4]) 93 | IN_IMAGE_H = int(sys.argv[5]) 94 | IN_IMAGE_W = int(sys.argv[6]) 95 | 96 | main(weight_file, image_path, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W) 97 | else: 98 | print('Please run this way:\n') 99 | print(' python demo_onnx.py ') 100 | -------------------------------------------------------------------------------- /demo_tensorflow.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.python.platform import gfile 5 | 6 | import cv2 7 | from tool.utils import post_processing, load_class_names, plot_boxes_cv2 8 | 9 | 10 | def demo_tensorflow(tfpb_file="./weight/yolov4.pb", image_path=None, print_sensor_name=False): 11 | graph_name = 'yolov4' 12 | tf.compat.v1.disable_eager_execution() 13 | with tf.compat.v1.Session() as persisted_sess: 14 | print("loading graph...") 15 | with gfile.FastGFile(tfpb_file, 'rb') as f: 16 | graph_def = tf.compat.v1.GraphDef() 17 | graph_def.ParseFromString(f.read()) 18 | 19 | persisted_sess.graph.as_default() 20 | tf.import_graph_def(graph_def, name=graph_name) 21 | 22 | # print all sensor_name 23 | if print_sensor_name: 24 | tensor_name_list = [tensor.name for tensor in tf.compat.v1.get_default_graph().as_graph_def().node] 25 | for tensor_name in tensor_name_list: 26 | print(tensor_name) 27 | 28 | inp = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'input:0') 29 | print(inp.shape) 30 | out1 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_1:0') 31 | out2 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_2:0') 32 | out3 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_3:0') 33 | print(out1.shape, out2.shape, out3.shape) 34 | 35 | # image_src = np.random.rand(1, 3, 608, 608).astype(np.float32) # input image 36 | # Input 37 | image_src = cv2.imread(image_path) 38 | resized = cv2.resize(image_src, (inp.shape[2], inp.shape[3]), interpolation=cv2.INTER_LINEAR) 39 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 40 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 41 | img_in = np.expand_dims(img_in, axis=0) 42 | img_in /= 255.0 43 | print("Shape of the network input: ", img_in.shape) 44 | 45 | feed_dict = {inp: img_in} 46 | 47 | outputs = persisted_sess.run([out1, out2, out3], feed_dict) 48 | print(outputs[0].shape) 49 | print(outputs[1].shape) 50 | print(outputs[2].shape) 51 | 52 | boxes = post_processing(img_in, 0.4, outputs) 53 | 54 | num_classes = 80 55 | if num_classes == 20: 56 | namesfile = 'data/voc.names' 57 | elif num_classes == 80: 58 | namesfile = 'data/coco.names' 59 | else: 60 | namesfile = 'data/names' 61 | 62 | class_names = load_class_names(namesfile) 63 | result = plot_boxes_cv2(image_src, boxes, savename=None, class_names=class_names) 64 | cv2.imshow("tensorflow predicted", result) 65 | cv2.waitKey() 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) == 1: 70 | sys.argv.append('weight/yolov4.pb') 71 | sys.argv.append('data/dog.jpg') 72 | if len(sys.argv) == 3: 73 | tfpbfile = sys.argv[1] 74 | image_path = sys.argv[2] 75 | demo_tensorflow(tfpbfile, image_path) 76 | else: 77 | print('Please execute this script this way:\n') 78 | print(' python demo_tensorflow.py ') 79 | -------------------------------------------------------------------------------- /demo_trt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | # from PIL import Image 8 | import tensorrt as trt 9 | import pycuda.driver as cuda 10 | import pycuda.autoinit 11 | 12 | from tool.utils import * 13 | 14 | try: 15 | # Sometimes python2 does not understand FileNotFoundError 16 | FileNotFoundError 17 | except NameError: 18 | FileNotFoundError = IOError 19 | 20 | def GiB(val): 
21 | return val * 1 << 30 22 | 23 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]): 24 | ''' 25 | Parses sample arguments. 26 | Args: 27 | description (str): Description of the sample. 28 | subfolder (str): The subfolder containing data relevant to this sample 29 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. 30 | Returns: 31 | str: Path of data directory. 32 | Raises: 33 | FileNotFoundError 34 | ''' 35 | 36 | # Standard command-line arguments for all samples. 37 | kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data") 38 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 39 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.", default=kDEFAULT_DATA_ROOT) 40 | args, unknown_args = parser.parse_known_args() 41 | 42 | # If data directory is not specified, use the default. 43 | data_root = args.datadir 44 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 45 | subfolder_path = os.path.join(data_root, subfolder) 46 | data_path = subfolder_path 47 | if not os.path.exists(subfolder_path): 48 | print("WARNING: " + subfolder_path + " does not exist. Trying " + data_root + " instead.") 49 | data_path = data_root 50 | 51 | # Make sure data directory exists. 52 | if not (os.path.exists(data_path)): 53 | raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.") 54 | 55 | # Find all requested files. 56 | for index, f in enumerate(find_files): 57 | find_files[index] = os.path.abspath(os.path.join(data_path, f)) 58 | if not os.path.exists(find_files[index]): 59 | raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.") 60 | 61 | return data_path, find_files 62 | 63 | # Simple helper data class that's a little nicer to use than a 2-tuple. 64 | class HostDeviceMem(object): 65 | def __init__(self, host_mem, device_mem): 66 | self.host = host_mem 67 | self.device = device_mem 68 | 69 | def __str__(self): 70 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 71 | 72 | def __repr__(self): 73 | return self.__str__() 74 | 75 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 76 | def allocate_buffers(engine, batch_size): 77 | inputs = [] 78 | outputs = [] 79 | bindings = [] 80 | stream = cuda.Stream() 81 | for binding in engine: 82 | 83 | size = trt.volume(engine.get_binding_shape(binding)) * batch_size 84 | dims = engine.get_binding_shape(binding) 85 | 86 | # in case batch dimension is -1 (dynamic) 87 | if dims[0] < 0: 88 | size *= -1 89 | 90 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 91 | # Allocate host and device buffers 92 | host_mem = cuda.pagelocked_empty(size, dtype) 93 | device_mem = cuda.mem_alloc(host_mem.nbytes) 94 | # Append the device buffer to device bindings. 95 | bindings.append(int(device_mem)) 96 | # Append to the appropriate list. 97 | if engine.binding_is_input(binding): 98 | inputs.append(HostDeviceMem(host_mem, device_mem)) 99 | else: 100 | outputs.append(HostDeviceMem(host_mem, device_mem)) 101 | return inputs, outputs, bindings, stream 102 | 103 | # This function is generalized for multiple inputs/outputs. 104 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 
105 | def do_inference(context, bindings, inputs, outputs, stream): 106 | # Transfer input data to the GPU. 107 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 108 | # Run inference. 109 | context.execute_async(bindings=bindings, stream_handle=stream.handle) 110 | # Transfer predictions back from the GPU. 111 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 112 | # Synchronize the stream 113 | stream.synchronize() 114 | # Return only the host outputs. 115 | return [out.host for out in outputs] 116 | 117 | 118 | TRT_LOGGER = trt.Logger() 119 | 120 | def main(engine_path, image_path, image_size): 121 | with get_engine(engine_path) as engine, engine.create_execution_context() as context: 122 | buffers = allocate_buffers(engine, 1) 123 | IN_IMAGE_H, IN_IMAGE_W = image_size 124 | context.set_binding_shape(0, (1, 3, IN_IMAGE_H, IN_IMAGE_W)) 125 | 126 | image_src = cv2.imread(image_path) 127 | 128 | num_classes = 80 129 | 130 | for i in range(2): # This 'for' loop is for speed check 131 | # Because the first iteration is usually longer 132 | boxes = detect(context, buffers, image_src, image_size, num_classes) 133 | 134 | if num_classes == 20: 135 | namesfile = 'data/voc.names' 136 | elif num_classes == 80: 137 | namesfile = 'data/coco.names' 138 | else: 139 | namesfile = 'data/names' 140 | 141 | class_names = load_class_names(namesfile) 142 | plot_boxes_cv2(image_src, boxes[0], savename='predictions_trt.jpg', class_names=class_names) 143 | 144 | 145 | def get_engine(engine_path): 146 | # If a serialized engine exists, use it instead of building an engine. 147 | print("Reading engine from file {}".format(engine_path)) 148 | with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 149 | return runtime.deserialize_cuda_engine(f.read()) 150 | 151 | 152 | 153 | def detect(context, buffers, image_src, image_size, num_classes): 154 | IN_IMAGE_H, IN_IMAGE_W = image_size 155 | 156 | ta = time.time() 157 | # Input 158 | resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR) 159 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 160 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 161 | img_in = np.expand_dims(img_in, axis=0) 162 | img_in /= 255.0 163 | img_in = np.ascontiguousarray(img_in) 164 | print("Shape of the network input: ", img_in.shape) 165 | # print(img_in) 166 | 167 | inputs, outputs, bindings, stream = buffers 168 | print('Length of inputs: ', len(inputs)) 169 | inputs[0].host = img_in 170 | 171 | trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) 172 | 173 | print('Len of outputs: ', len(trt_outputs)) 174 | 175 | trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4) 176 | trt_outputs[1] = trt_outputs[1].reshape(1, -1, num_classes) 177 | 178 | tb = time.time() 179 | 180 | print('-----------------------------------') 181 | print(' TRT inference time: %f' % (tb - ta)) 182 | print('-----------------------------------') 183 | 184 | boxes = post_processing(img_in, 0.4, 0.6, trt_outputs) 185 | 186 | return boxes 187 | 188 | 189 | 190 | if __name__ == '__main__': 191 | engine_path = sys.argv[1] 192 | image_path = sys.argv[2] 193 | 194 | if len(sys.argv) < 4: 195 | image_size = (416, 416) 196 | elif len(sys.argv) < 5: 197 | image_size = (int(sys.argv[3]), int(sys.argv[3])) 198 | else: 199 | image_size = (int(sys.argv[3]), int(sys.argv[4])) 200 | 201 | main(engine_path, image_path, image_size) 202 | 
-------------------------------------------------------------------------------- /evaluate_on_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to evaluate the model's performance using pre-trained weights using COCO API. 3 | Example usage: python evaluate_on_coco.py -dir D:\cocoDataset\val2017\val2017 -gta D:\cocoDataset\annotatio 4 | ns_trainval2017\annotations\instances_val2017.json -c cfg/yolov4-smaller-input.cfg -g 0 5 | Explanation: set where your images can be found using -dir, then use -gta to point to the ground truth annotations file 6 | and finally -c to point to the config file you want to use to load the network using. 7 | """ 8 | 9 | import argparse 10 | import datetime 11 | import json 12 | import logging 13 | import os 14 | import sys 15 | import time 16 | from collections import defaultdict 17 | 18 | import numpy as np 19 | import torch 20 | from PIL import Image, ImageDraw 21 | from easydict import EasyDict as edict 22 | from pycocotools.coco import COCO 23 | from pycocotools.cocoeval import COCOeval 24 | 25 | from cfg import Cfg 26 | from tool.darknet2pytorch import Darknet 27 | from tool.utils import load_class_names 28 | from tool.torch_utils import do_detect 29 | 30 | 31 | def get_class_name(cat): 32 | class_names = load_class_names("./data/coco.names") 33 | if cat >= 1 and cat <= 11: 34 | cat = cat - 1 35 | elif cat >= 13 and cat <= 25: 36 | cat = cat - 2 37 | elif cat >= 27 and cat <= 28: 38 | cat = cat - 3 39 | elif cat >= 31 and cat <= 44: 40 | cat = cat - 5 41 | elif cat >= 46 and cat <= 65: 42 | cat = cat - 6 43 | elif cat == 67: 44 | cat = cat - 7 45 | elif cat == 70: 46 | cat = cat - 9 47 | elif cat >= 72 and cat <= 82: 48 | cat = cat - 10 49 | elif cat >= 84 and cat <= 90: 50 | cat = cat - 11 51 | return class_names[cat] 52 | 53 | def convert_cat_id_and_reorientate_bbox(single_annotation): 54 | cat = single_annotation['category_id'] 55 | bbox = single_annotation['bbox'] 56 | x, y, w, h = bbox 57 | x1, y1, x2, y2 = x - w / 2, y - h / 2, x + w / 2, y + h / 2 58 | if 0 <= cat <= 10: 59 | cat = cat + 1 60 | elif 11 <= cat <= 23: 61 | cat = cat + 2 62 | elif 24 <= cat <= 25: 63 | cat = cat + 3 64 | elif 26 <= cat <= 39: 65 | cat = cat + 5 66 | elif 40 <= cat <= 59: 67 | cat = cat + 6 68 | elif cat == 60: 69 | cat = cat + 7 70 | elif cat == 61: 71 | cat = cat + 9 72 | elif 62 <= cat <= 72: 73 | cat = cat + 10 74 | elif 73 <= cat <= 79: 75 | cat = cat + 11 76 | single_annotation['category_id'] = cat 77 | single_annotation['bbox'] = [x1, y1, w, h] 78 | return single_annotation 79 | 80 | 81 | 82 | def myconverter(obj): 83 | if isinstance(obj, np.integer): 84 | return int(obj) 85 | elif isinstance(obj, np.floating): 86 | return float(obj) 87 | elif isinstance(obj, np.ndarray): 88 | return obj.tolist() 89 | elif isinstance(obj, datetime.datetime): 90 | return obj.__str__() 91 | else: 92 | return obj 93 | 94 | def evaluate_on_coco(cfg, resFile): 95 | annType = "bbox" # specify type here 96 | with open(resFile, 'r') as f: 97 | unsorted_annotations = json.load(f) 98 | sorted_annotations = list(sorted(unsorted_annotations, key=lambda single_annotation: single_annotation["image_id"])) 99 | sorted_annotations = list(map(convert_cat_id_and_reorientate_bbox, sorted_annotations)) 100 | reshaped_annotations = defaultdict(list) 101 | for annotation in sorted_annotations: 102 | reshaped_annotations[annotation['image_id']].append(annotation) 103 | 104 | with open('temp.json', 'w') as f: 105 | json.dump(sorted_annotations, 
f) 106 | 107 | cocoGt = COCO(cfg.gt_annotations_path) 108 | cocoDt = cocoGt.loadRes('temp.json') 109 | 110 | with open(cfg.gt_annotations_path, 'r') as f: 111 | gt_annotation_raw = json.load(f) 112 | gt_annotation_raw_images = gt_annotation_raw["images"] 113 | gt_annotation_raw_labels = gt_annotation_raw["annotations"] 114 | 115 | rgb_label = (255, 0, 0) 116 | rgb_pred = (0, 255, 0) 117 | 118 | for i, image_id in enumerate(reshaped_annotations): 119 | image_annotations = reshaped_annotations[image_id] 120 | gt_annotation_image_raw = list(filter( 121 | lambda image_json: image_json['id'] == image_id, gt_annotation_raw_images 122 | )) 123 | gt_annotation_labels_raw = list(filter( 124 | lambda label_json: label_json['image_id'] == image_id, gt_annotation_raw_labels 125 | )) 126 | if len(gt_annotation_image_raw) == 1: 127 | image_path = os.path.join(cfg.dataset_dir, gt_annotation_image_raw[0]["file_name"]) 128 | actual_image = Image.open(image_path).convert('RGB') 129 | draw = ImageDraw.Draw(actual_image) 130 | 131 | for annotation in image_annotations: 132 | x1_pred, y1_pred, w, h = annotation['bbox'] 133 | x2_pred, y2_pred = x1_pred + w, y1_pred + h 134 | cls_id = annotation['category_id'] 135 | label = get_class_name(cls_id) 136 | draw.text((x1_pred, y1_pred), label, fill=rgb_pred) 137 | draw.rectangle([x1_pred, y1_pred, x2_pred, y2_pred], outline=rgb_pred) 138 | for annotation in gt_annotation_labels_raw: 139 | x1_truth, y1_truth, w, h = annotation['bbox'] 140 | x2_truth, y2_truth = x1_truth + w, y1_truth + h 141 | cls_id = annotation['category_id'] 142 | label = get_class_name(cls_id) 143 | draw.text((x1_truth, y1_truth), label, fill=rgb_label) 144 | draw.rectangle([x1_truth, y1_truth, x2_truth, y2_truth], outline=rgb_label) 145 | actual_image.save("./data/outcome/predictions_{}".format(gt_annotation_image_raw[0]["file_name"])) 146 | else: 147 | print('please check') 148 | break 149 | if (i + 1) % 100 == 0: # just see first 100 150 | break 151 | 152 | imgIds = sorted(cocoGt.getImgIds()) 153 | cocoEval = COCOeval(cocoGt, cocoDt, annType) 154 | cocoEval.params.imgIds = imgIds 155 | cocoEval.evaluate() 156 | cocoEval.accumulate() 157 | cocoEval.summarize() 158 | 159 | 160 | def test(model, annotations, cfg): 161 | if not annotations["images"]: 162 | print("Annotations do not have 'images' key") 163 | return 164 | images = annotations["images"] 165 | # images = images[:10] 166 | resFile = 'data/coco_val_outputs.json' 167 | 168 | if torch.cuda.is_available(): 169 | use_cuda = 1 170 | else: 171 | use_cuda = 0 172 | 173 | # do one forward pass first to circumvent cold start 174 | throwaway_image = Image.open('data/dog.jpg').convert('RGB').resize((model.width, model.height)) 175 | do_detect(model, throwaway_image, 0.5, 80, 0.4, use_cuda) 176 | boxes_json = [] 177 | 178 | for i, image_annotation in enumerate(images): 179 | logging.info("currently on image: {}/{}".format(i + 1, len(images))) 180 | image_file_name = image_annotation["file_name"] 181 | image_id = image_annotation["id"] 182 | image_height = image_annotation["height"] 183 | image_width = image_annotation["width"] 184 | 185 | # open and resize each image first 186 | img = Image.open(os.path.join(cfg.dataset_dir, image_file_name)).convert('RGB') 187 | sized = img.resize((model.width, model.height)) 188 | 189 | if use_cuda: 190 | model.cuda() 191 | 192 | start = time.time() 193 | boxes = do_detect(model, sized, 0.0, 80, 0.4, use_cuda) 194 | finish = time.time() 195 | if type(boxes) == list: 196 | for box in boxes: 197 | box_json = {} 198 
| category_id = box[-1] 199 | score = box[-2] 200 | bbox_normalized = box[:4] 201 | box_json["category_id"] = int(category_id) 202 | box_json["image_id"] = int(image_id) 203 | bbox = [] 204 | for i, bbox_coord in enumerate(bbox_normalized): 205 | modified_bbox_coord = float(bbox_coord) 206 | if i % 2: 207 | modified_bbox_coord *= image_height 208 | else: 209 | modified_bbox_coord *= image_width 210 | modified_bbox_coord = round(modified_bbox_coord, 2) 211 | bbox.append(modified_bbox_coord) 212 | box_json["bbox_normalized"] = list(map(lambda x: round(float(x), 2), bbox_normalized)) 213 | box_json["bbox"] = bbox 214 | box_json["score"] = round(float(score), 2) 215 | box_json["timing"] = float(finish - start) 216 | boxes_json.append(box_json) 217 | # print("see box_json: ", box_json) 218 | with open(resFile, 'w') as outfile: 219 | json.dump(boxes_json, outfile, default=myconverter) 220 | else: 221 | print("warning: output from model after postprocessing is not a list, ignoring") 222 | return 223 | 224 | # namesfile = 'data/coco.names' 225 | # class_names = load_class_names(namesfile) 226 | # plot_boxes(img, boxes, 'data/outcome/predictions_{}.jpg'.format(image_id), class_names) 227 | 228 | with open(resFile, 'w') as outfile: 229 | json.dump(boxes_json, outfile, default=myconverter) 230 | 231 | evaluate_on_coco(cfg, resFile) 232 | 233 | 234 | def get_args(**kwargs): 235 | cfg = kwargs 236 | parser = argparse.ArgumentParser(description='Test model on test dataset', 237 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 238 | parser.add_argument('-f', '--load', dest='load', type=str, default=None, 239 | help='Load model from a .pth file') 240 | parser.add_argument('-g', '--gpu', metavar='G', type=str, default='-1', 241 | help='GPU', dest='gpu') 242 | parser.add_argument('-dir', '--data-dir', type=str, default=None, 243 | help='dataset dir', dest='dataset_dir') 244 | parser.add_argument('-gta', '--ground_truth_annotations', type=str, default='instances_val2017.json', 245 | help='ground truth annotations file', dest='gt_annotations_path') 246 | parser.add_argument('-w', '--weights_file', type=str, default='weights/yolov4.weights', 247 | help='weights file to load', dest='weights_file') 248 | parser.add_argument('-c', '--model_config', type=str, default='cfg/yolov4.cfg', 249 | help='model config file to load', dest='model_config') 250 | args = vars(parser.parse_args()) 251 | 252 | for k in args.keys(): 253 | cfg[k] = args.get(k) 254 | return edict(cfg) 255 | 256 | 257 | def init_logger(log_file=None, log_dir=None, log_level=logging.INFO, mode='w', stdout=True): 258 | """ 259 | log_dir: 日志文件的文件夹路径 260 | mode: 'a', append; 'w', 覆盖原文件写入. 
261 | """ 262 | import datetime 263 | def get_date_str(): 264 | now = datetime.datetime.now() 265 | return now.strftime('%Y-%m-%d_%H-%M-%S') 266 | 267 | fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s: %(message)s' 268 | if log_dir is None: 269 | log_dir = '~/temp/log/' 270 | if log_file is None: 271 | log_file = 'log_' + get_date_str() + '.txt' 272 | if not os.path.exists(log_dir): 273 | os.makedirs(log_dir) 274 | log_file = os.path.join(log_dir, log_file) 275 | # 此处不能使用logging输出 276 | print('log file path:' + log_file) 277 | 278 | logging.basicConfig(level=logging.DEBUG, 279 | format=fmt, 280 | filename=log_file, 281 | filemode=mode) 282 | 283 | if stdout: 284 | console = logging.StreamHandler(stream=sys.stdout) 285 | console.setLevel(log_level) 286 | formatter = logging.Formatter(fmt) 287 | console.setFormatter(formatter) 288 | logging.getLogger('').addHandler(console) 289 | 290 | return logging 291 | 292 | 293 | if __name__ == "__main__": 294 | logging = init_logger(log_dir='log') 295 | cfg = get_args(**Cfg) 296 | os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu 297 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 298 | logging.info(f'Using device {device}') 299 | 300 | model = Darknet(cfg.model_config) 301 | 302 | model.print_network() 303 | model.load_weights(cfg.weights_file) 304 | model.eval() # set model away from training 305 | 306 | if torch.cuda.device_count() > 1: 307 | model = torch.nn.DataParallel(model) 308 | 309 | model.to(device=device) 310 | 311 | annotations_file_path = cfg.gt_annotations_path 312 | with open(annotations_file_path) as annotations_file: 313 | try: 314 | annotations = json.load(annotations_file) 315 | except: 316 | print("annotations file not a json") 317 | exit() 318 | test(model=model, 319 | annotations=annotations, 320 | cfg=cfg, ) 321 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.2 2 | torch==1.4.0 3 | tensorboardX==2.0 4 | scikit_image==0.16.2 5 | matplotlib==2.2.3 6 | tqdm==4.43.0 7 | easydict==1.9 8 | Pillow==7.1.2 9 | opencv_python 10 | pycocotools 11 | -------------------------------------------------------------------------------- /tool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tianxiaomo/pytorch-YOLOv4/a65d219f9066bae4e12003bd7cdc04531860c672/tool/__init__.py -------------------------------------------------------------------------------- /tool/camera.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/04/26 15:48 4 | @Author : Tianxiaomo 5 | @File : camera.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | from __future__ import division 14 | import cv2 15 | from tool.darknet2pytorch import Darknet 16 | import argparse 17 | from tool.utils import * 18 | from tool.torch_utils import * 19 | 20 | 21 | def arg_parse(): 22 | """ 23 | Parse arguements to the detect module 24 | 25 | """ 26 | 27 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 28 | parser.add_argument("--confidence", dest="confidence", help="Object Confidence to filter predictions", default=0.25) 29 | parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS Threshhold", default=0.4) 30 | parser.add_argument("--reso", dest='reso', help= 31 | "Input resolution of 
the network. Increase to increase accuracy. Decrease to increase speed", 32 | default="160", type=str) 33 | return parser.parse_args() 34 | 35 | 36 | if __name__ == '__main__': 37 | cfgfile = "cfg/yolov4.cfg" 38 | weightsfile = "weight/yolov4.weights" 39 | 40 | args = arg_parse() 41 | confidence = float(args.confidence) 42 | nms_thesh = float(args.nms_thresh) 43 | CUDA = torch.cuda.is_available() 44 | num_classes = 80 45 | bbox_attrs = 5 + num_classes 46 | class_names = load_class_names("data/coco.names") 47 | 48 | model = Darknet(cfgfile) 49 | model.load_weights(weightsfile) 50 | 51 | if CUDA: 52 | model.cuda() 53 | 54 | model.eval() 55 | cap = cv2.VideoCapture(0) 56 | 57 | assert cap.isOpened(), 'Cannot capture source' 58 | 59 | frames = 0 60 | start = time.time() 61 | while cap.isOpened(): 62 | ret, frame = cap.read() 63 | if ret: 64 | sized = cv2.resize(frame, (model.width, model.height)) 65 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 66 | boxes = do_detect(model, sized, 0.5, 0.4, CUDA) 67 | 68 | orig_im = plot_boxes_cv2(frame, boxes, class_names=class_names) 69 | 70 | cv2.imshow("frame", orig_im) 71 | key = cv2.waitKey(1) 72 | if key & 0xFF == ord('q'): 73 | break 74 | frames += 1 75 | print("FPS of the video is {:5.2f}".format(frames / (time.time() - start))) 76 | else: 77 | break 78 | -------------------------------------------------------------------------------- /tool/coco_annotation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/08 11:45 4 | @Author : Tianxiaomo 5 | @File : coco_annotatin.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | import json 14 | from collections import defaultdict 15 | from tqdm import tqdm 16 | import os 17 | 18 | """hyper parameters""" 19 | json_file_path = 'E:/Dataset/mscoco2017/annotations/instances_train2017.json' 20 | images_dir_path = 'mscoco2017/train2017/' 21 | output_path = '../data/val.txt' 22 | 23 | """load json file""" 24 | name_box_id = defaultdict(list) 25 | id_name = dict() 26 | with open(json_file_path, encoding='utf-8') as f: 27 | data = json.load(f) 28 | 29 | """generate labels""" 30 | images = data['images'] 31 | annotations = data['annotations'] 32 | for ant in tqdm(annotations): 33 | id = ant['image_id'] 34 | # name = os.path.join(images_dir_path, images[id]['file_name']) 35 | name = os.path.join(images_dir_path, '{:012d}.jpg'.format(id)) 36 | cat = ant['category_id'] 37 | 38 | if cat >= 1 and cat <= 11: 39 | cat = cat - 1 40 | elif cat >= 13 and cat <= 25: 41 | cat = cat - 2 42 | elif cat >= 27 and cat <= 28: 43 | cat = cat - 3 44 | elif cat >= 31 and cat <= 44: 45 | cat = cat - 5 46 | elif cat >= 46 and cat <= 65: 47 | cat = cat - 6 48 | elif cat == 67: 49 | cat = cat - 7 50 | elif cat == 70: 51 | cat = cat - 9 52 | elif cat >= 72 and cat <= 82: 53 | cat = cat - 10 54 | elif cat >= 84 and cat <= 90: 55 | cat = cat - 11 56 | 57 | name_box_id[name].append([ant['bbox'], cat]) 58 | 59 | """write to txt""" 60 | with open(output_path, 'w') as f: 61 | for key in tqdm(name_box_id.keys()): 62 | f.write(key) 63 | box_infos = name_box_id[key] 64 | for info in box_infos: 65 | x_min = int(info[0][0]) 66 | y_min = int(info[0][1]) 67 | x_max = x_min + int(info[0][2]) 68 | y_max = y_min + int(info[0][3]) 69 | 70 | box_info = " %d,%d,%d,%d,%d" % ( 71 | x_min, y_min, x_max, y_max, int(info[1])) 72 | f.write(box_info) 73 | f.write('\n') 74 | 
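
Note: the piecewise `if/elif` chain above (and the matching `get_class_name` / `convert_cat_id_and_reorientate_bbox` helpers in `evaluate_on_coco.py`) exists because COCO's 80 object classes are numbered 1–90 with gaps, while the model works with contiguous indices 0–79; the offsets simply skip the unused IDs. As a minimal sketch — not part of this repository, and assuming a standard `instances_*.json` with a `categories` list — the same mapping can be built once from the annotation file instead of being hard-coded:

```python
import json

def build_category_maps(instances_json_path):
    """Derive contiguous-index <-> COCO-category-id maps from an
    instances_*.json file, equivalent to the hard-coded offset chain."""
    with open(instances_json_path, encoding='utf-8') as f:
        categories = json.load(f)['categories']
    coco_ids = sorted(c['id'] for c in categories)  # e.g. 1..90 with gaps
    contiguous_to_coco = {i: cid for i, cid in enumerate(coco_ids)}
    coco_to_contiguous = {cid: i for i, cid in enumerate(coco_ids)}
    return contiguous_to_coco, coco_to_contiguous

# usage sketch (file name illustrative):
# to_coco, to_contig = build_category_maps('instances_val2017.json')
# to_contig[13] == 11   # same result as the "cat >= 13 and cat <= 25: cat - 2" branch
```
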
-------------------------------------------------------------------------------- /tool/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tool.torch_utils import convert2cpu 3 | 4 | 5 | def parse_cfg(cfgfile): 6 | blocks = [] 7 | fp = open(cfgfile, 'r') 8 | block = None 9 | line = fp.readline() 10 | while line != '': 11 | line = line.rstrip() 12 | if line == '' or line[0] == '#': 13 | line = fp.readline() 14 | continue 15 | elif line[0] == '[': 16 | if block: 17 | blocks.append(block) 18 | block = dict() 19 | block['type'] = line.lstrip('[').rstrip(']') 20 | # set default value 21 | if block['type'] == 'convolutional': 22 | block['batch_normalize'] = 0 23 | else: 24 | key, value = line.split('=') 25 | key = key.strip() 26 | if key == 'type': 27 | key = '_type' 28 | value = value.strip() 29 | block[key] = value 30 | line = fp.readline() 31 | 32 | if block: 33 | blocks.append(block) 34 | fp.close() 35 | return blocks 36 | 37 | 38 | def print_cfg(blocks): 39 | print('layer filters size input output'); 40 | prev_width = 416 41 | prev_height = 416 42 | prev_filters = 3 43 | out_filters = [] 44 | out_widths = [] 45 | out_heights = [] 46 | ind = -2 47 | for block in blocks: 48 | ind = ind + 1 49 | if block['type'] == 'net': 50 | prev_width = int(block['width']) 51 | prev_height = int(block['height']) 52 | continue 53 | elif block['type'] == 'convolutional': 54 | filters = int(block['filters']) 55 | kernel_size = int(block['size']) 56 | stride = int(block['stride']) 57 | is_pad = int(block['pad']) 58 | pad = (kernel_size - 1) // 2 if is_pad else 0 59 | width = (prev_width + 2 * pad - kernel_size) // stride + 1 60 | height = (prev_height + 2 * pad - kernel_size) // stride + 1 61 | print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 62 | ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, 63 | height, filters)) 64 | prev_width = width 65 | prev_height = height 66 | prev_filters = filters 67 | out_widths.append(prev_width) 68 | out_heights.append(prev_height) 69 | out_filters.append(prev_filters) 70 | elif block['type'] == 'maxpool': 71 | pool_size = int(block['size']) 72 | stride = int(block['stride']) 73 | width = prev_width // stride 74 | height = prev_height // stride 75 | print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 76 | ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, 77 | filters)) 78 | prev_width = width 79 | prev_height = height 80 | prev_filters = filters 81 | out_widths.append(prev_width) 82 | out_heights.append(prev_height) 83 | out_filters.append(prev_filters) 84 | elif block['type'] == 'avgpool': 85 | width = 1 86 | height = 1 87 | print('%5d %-6s %3d x %3d x%4d -> %3d' % ( 88 | ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) 89 | prev_width = width 90 | prev_height = height 91 | prev_filters = filters 92 | out_widths.append(prev_width) 93 | out_heights.append(prev_height) 94 | out_filters.append(prev_filters) 95 | elif block['type'] == 'softmax': 96 | print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) 97 | out_widths.append(prev_width) 98 | out_heights.append(prev_height) 99 | out_filters.append(prev_filters) 100 | elif block['type'] == 'cost': 101 | print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) 102 | out_widths.append(prev_width) 103 | out_heights.append(prev_height) 104 | out_filters.append(prev_filters) 105 | elif block['type'] == 'reorg': 106 | 
stride = int(block['stride']) 107 | filters = stride * stride * prev_filters 108 | width = prev_width // stride 109 | height = prev_height // stride 110 | print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 111 | ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) 112 | prev_width = width 113 | prev_height = height 114 | prev_filters = filters 115 | out_widths.append(prev_width) 116 | out_heights.append(prev_height) 117 | out_filters.append(prev_filters) 118 | elif block['type'] == 'upsample': 119 | stride = int(block['stride']) 120 | filters = prev_filters 121 | width = prev_width * stride 122 | height = prev_height * stride 123 | print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 124 | ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) 125 | prev_width = width 126 | prev_height = height 127 | prev_filters = filters 128 | out_widths.append(prev_width) 129 | out_heights.append(prev_height) 130 | out_filters.append(prev_filters) 131 | elif block['type'] == 'route': 132 | layers = block['layers'].split(',') 133 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 134 | if len(layers) == 1: 135 | print('%5d %-6s %d' % (ind, 'route', layers[0])) 136 | prev_width = out_widths[layers[0]] 137 | prev_height = out_heights[layers[0]] 138 | prev_filters = out_filters[layers[0]] 139 | elif len(layers) == 2: 140 | print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) 141 | prev_width = out_widths[layers[0]] 142 | prev_height = out_heights[layers[0]] 143 | assert (prev_width == out_widths[layers[1]]) 144 | assert (prev_height == out_heights[layers[1]]) 145 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 146 | elif len(layers) == 4: 147 | print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) 148 | prev_width = out_widths[layers[0]] 149 | prev_height = out_heights[layers[0]] 150 | assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) 151 | assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) 152 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ 153 | layers[3]] 154 | else: 155 | print("route error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 156 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 157 | 158 | out_widths.append(prev_width) 159 | out_heights.append(prev_height) 160 | out_filters.append(prev_filters) 161 | elif block['type'] in ['region', 'yolo']: 162 | print('%5d %-6s' % (ind, 'detection')) 163 | out_widths.append(prev_width) 164 | out_heights.append(prev_height) 165 | out_filters.append(prev_filters) 166 | elif block['type'] == 'shortcut': 167 | from_id = int(block['from']) 168 | from_id = from_id if from_id > 0 else from_id + ind 169 | print('%5d %-6s %d' % (ind, 'shortcut', from_id)) 170 | prev_width = out_widths[from_id] 171 | prev_height = out_heights[from_id] 172 | prev_filters = out_filters[from_id] 173 | out_widths.append(prev_width) 174 | out_heights.append(prev_height) 175 | out_filters.append(prev_filters) 176 | elif block['type'] == 'sam': 177 | from_id = int(block['from']) 178 | from_id = from_id if from_id > 0 else from_id + ind 179 | print('%5d %-6s %d' % (ind, 'sam', from_id)) 180 | prev_width = out_widths[from_id] 181 | prev_height = out_heights[from_id] 182 | prev_filters = out_filters[from_id] 183 | out_widths.append(prev_width) 184 | out_heights.append(prev_height) 185 | out_filters.append(prev_filters) 186 | elif block['type'] == 'connected': 187 | filters = int(block['output']) 188 | print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) 189 | prev_filters = filters 190 | out_widths.append(1) 191 | out_heights.append(1) 192 | out_filters.append(prev_filters) 193 | else: 194 | print('unknown type %s' % (block['type'])) 195 | 196 | 197 | def load_conv(buf, start, conv_model): 198 | num_w = conv_model.weight.numel() 199 | num_b = conv_model.bias.numel() 200 | conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 201 | start = start + num_b 202 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 203 | start = start + num_w 204 | return start 205 | 206 | 207 | def save_conv(fp, conv_model): 208 | if conv_model.bias.is_cuda: 209 | convert2cpu(conv_model.bias.data).numpy().tofile(fp) 210 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 211 | else: 212 | conv_model.bias.data.numpy().tofile(fp) 213 | conv_model.weight.data.numpy().tofile(fp) 214 | 215 | 216 | def load_conv_bn(buf, start, conv_model, bn_model): 217 | num_w = conv_model.weight.numel() 218 | num_b = bn_model.bias.numel() 219 | bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 220 | start = start + num_b 221 | bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); 222 | start = start + num_b 223 | bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); 224 | start = start + num_b 225 | bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); 226 | start = start + num_b 227 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 228 | start = start + num_w 229 | return start 230 | 231 | 232 | def save_conv_bn(fp, conv_model, bn_model): 233 | if bn_model.bias.is_cuda: 234 | convert2cpu(bn_model.bias.data).numpy().tofile(fp) 235 | convert2cpu(bn_model.weight.data).numpy().tofile(fp) 236 | convert2cpu(bn_model.running_mean).numpy().tofile(fp) 237 | convert2cpu(bn_model.running_var).numpy().tofile(fp) 238 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 239 | else: 240 | bn_model.bias.data.numpy().tofile(fp) 241 | 
bn_model.weight.data.numpy().tofile(fp) 242 | bn_model.running_mean.numpy().tofile(fp) 243 | bn_model.running_var.numpy().tofile(fp) 244 | conv_model.weight.data.numpy().tofile(fp) 245 | 246 | 247 | def load_fc(buf, start, fc_model): 248 | num_w = fc_model.weight.numel() 249 | num_b = fc_model.bias.numel() 250 | fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 251 | start = start + num_b 252 | fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); 253 | start = start + num_w 254 | return start 255 | 256 | 257 | def save_fc(fp, fc_model): 258 | fc_model.bias.data.numpy().tofile(fp) 259 | fc_model.weight.data.numpy().tofile(fp) 260 | 261 | 262 | if __name__ == '__main__': 263 | import sys 264 | 265 | blocks = parse_cfg('cfg/yolo.cfg') 266 | if len(sys.argv) == 2: 267 | blocks = parse_cfg(sys.argv[1]) 268 | print_cfg(blocks) 269 | -------------------------------------------------------------------------------- /tool/darknet2onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from tool.darknet2pytorch import Darknet 4 | 5 | 6 | def transform_to_onnx(cfgfile, weightfile, batch_size=1, onnx_file_name=None): 7 | model = Darknet(cfgfile) 8 | 9 | model.print_network() 10 | model.load_weights(weightfile) 11 | print('Loading weights from %s... Done!' % (weightfile)) 12 | 13 | dynamic = False 14 | if batch_size <= 0: 15 | dynamic = True 16 | 17 | input_names = ["input"] 18 | output_names = ['boxes', 'confs'] 19 | 20 | if dynamic: 21 | x = torch.randn((1, 3, model.height, model.width), requires_grad=True) 22 | if not onnx_file_name: 23 | onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(model.height, model.width) 24 | dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}} 25 | # Export the model 26 | print('Export the onnx model ...') 27 | torch.onnx.export(model, 28 | x, 29 | onnx_file_name, 30 | export_params=True, 31 | opset_version=11, 32 | do_constant_folding=True, 33 | input_names=input_names, output_names=output_names, 34 | dynamic_axes=dynamic_axes) 35 | 36 | print('Onnx model exporting done') 37 | return onnx_file_name 38 | 39 | else: 40 | x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) 41 | onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, model.height, model.width) 42 | torch.onnx.export(model, 43 | x, 44 | onnx_file_name, 45 | export_params=True, 46 | opset_version=11, 47 | do_constant_folding=True, 48 | input_names=input_names, output_names=output_names, 49 | dynamic_axes=None) 50 | 51 | print('Onnx model exporting done') 52 | return onnx_file_name 53 | 54 | 55 | if __name__ == '__main__': 56 | from argparse import ArgumentParser 57 | parser = ArgumentParser() 58 | parser.add_argument('config') 59 | parser.add_argument('weightfile') 60 | parser.add_argument('--batch_size', type=int, help="Static Batchsize of the model. 
use batch_size<=0 for dynamic batch size") 61 | parser.add_argument('--onnx_file_path', help="Output onnx file path") 62 | args = parser.parse_args() 63 | transform_to_onnx(args.config, args.weightfile, args.batch_size, args.onnx_file_path) 64 | 65 | -------------------------------------------------------------------------------- /tool/onnx2tensorflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | from onnx_tf.backend import prepare 4 | 5 | 6 | # tensorflow >=2.0 7 | # 1: Thanks:github:https://github.com/onnx/onnx-tensorflow 8 | # 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow 9 | # Run pip install -e . 10 | # Note: 11 | # Errors will occur when using "pip install onnx-tf", at least for me, 12 | # it is recommended to use source code installation 13 | def transform_to_tensorflow(onnx_input_path, pb_output_path): 14 | onnx_model = onnx.load(onnx_input_path) # load onnx model 15 | tf_exp = prepare(onnx_model) # prepare tf representation 16 | tf_exp.export_graph(pb_output_path) # export the model 17 | 18 | 19 | if __name__ == '__main__': 20 | if len(sys.argv) == 1: 21 | sys.argv.append('../weight/yolov4_1_3_608_608.onnx') # use:darknet2onnx.py 22 | sys.argv.append('../weight/yolov4.pb') # use:onnx2tensorflow.py 23 | if len(sys.argv) == 3: 24 | onnxfile = sys.argv[1] 25 | tfpb_outfile = sys.argv[2] 26 | transform_to_tensorflow(onnxfile, tfpb_outfile) 27 | else: 28 | print('Please execute this script this way:\n') 29 | print(' python onnx2tensorflow.py ') 30 | -------------------------------------------------------------------------------- /tool/region_loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.torch_utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) / num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 
| th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class RegionLoss(nn.Module): 105 | def __init__(self, num_classes=0, anchors=[], num_anchors=1): 106 | super(RegionLoss, self).__init__() 107 | self.num_classes = num_classes 108 | self.anchors = anchors 109 | self.num_anchors = num_anchors 110 | self.anchor_step = len(anchors) / num_anchors 111 | self.coord_scale = 1 112 | self.noobject_scale = 1 113 | self.object_scale = 5 114 | self.class_scale = 1 115 | self.thresh = 0.6 116 | self.seen = 0 117 | 118 | def forward(self, output, target): 119 | # output : BxAs*(4+1+num_classes)*H*W 120 | t0 = time.time() 121 | nB = output.data.size(0) 122 | nA = self.num_anchors 123 | nC = self.num_classes 124 | nH = output.data.size(2) 125 | nW = output.data.size(3) 126 | 127 | output = output.view(nB, nA, (5 + nC), nH, nW) 128 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 129 | y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 130 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 131 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 132 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 133 | cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 134 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 135 | t1 = time.time() 136 | 137 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 138 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 139 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 
1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 140 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() 141 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() 142 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 143 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 144 | pred_boxes[0] = x.data + grid_x 145 | pred_boxes[1] = y.data + grid_y 146 | pred_boxes[2] = torch.exp(w.data) * anchor_w 147 | pred_boxes[3] = torch.exp(h.data) * anchor_h 148 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 149 | t2 = time.time() 150 | 151 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 152 | target.data, 153 | self.anchors, nA, 154 | nC, \ 155 | nH, nW, 156 | self.noobject_scale, 157 | self.object_scale, 158 | self.thresh, 159 | self.seen) 160 | cls_mask = (cls_mask == 1) 161 | nProposals = int((conf > 0.25).sum().data[0]) 162 | 163 | tx = Variable(tx.cuda()) 164 | ty = Variable(ty.cuda()) 165 | tw = Variable(tw.cuda()) 166 | th = Variable(th.cuda()) 167 | tconf = Variable(tconf.cuda()) 168 | tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) 169 | 170 | coord_mask = Variable(coord_mask.cuda()) 171 | conf_mask = Variable(conf_mask.cuda().sqrt()) 172 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 173 | cls = cls[cls_mask].view(-1, nC) 174 | 175 | t3 = time.time() 176 | 177 | loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0 178 | loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0 179 | loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0 180 | loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0 181 | loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0 182 | loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls) 183 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 184 | t4 = time.time() 185 | if False: 186 | print('-----------------------------------') 187 | print(' activation : %f' % (t1 - t0)) 188 | print(' create pred_boxes : %f' % (t2 - t1)) 189 | print(' build targets : %f' % (t3 - t2)) 190 | print(' create loss : %f' % (t4 - t3)) 191 | print(' total : %f' % (t4 - t0)) 192 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 193 | self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], 194 | loss_conf.data[0], loss_cls.data[0], loss.data[0])) 195 | return loss 196 | -------------------------------------------------------------------------------- /tool/torch_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | import torch 6 | import numpy as np 7 | from torch.autograd import Variable 8 | 9 | import itertools 10 | import struct # get_image_size 11 | import imghdr # get_image_size 12 | 13 | from tool import utils 14 | 15 | 16 | def bbox_ious(boxes1, boxes2, x1y1x2y2=True): 17 | if x1y1x2y2: 18 | mx = torch.min(boxes1[0], boxes2[0]) 19 | Mx = torch.max(boxes1[2], boxes2[2]) 20 | my = torch.min(boxes1[1], boxes2[1]) 21 | My = torch.max(boxes1[3], boxes2[3]) 22 
| w1 = boxes1[2] - boxes1[0] 23 | h1 = boxes1[3] - boxes1[1] 24 | w2 = boxes2[2] - boxes2[0] 25 | h2 = boxes2[3] - boxes2[1] 26 | else: 27 | mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) 28 | Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) 29 | my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) 30 | My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) 31 | w1 = boxes1[2] 32 | h1 = boxes1[3] 33 | w2 = boxes2[2] 34 | h2 = boxes2[3] 35 | uw = Mx - mx 36 | uh = My - my 37 | cw = w1 + w2 - uw 38 | ch = h1 + h2 - uh 39 | mask = ((cw <= 0) + (ch <= 0) > 0) 40 | area1 = w1 * h1 41 | area2 = w2 * h2 42 | carea = cw * ch 43 | carea[mask] = 0 44 | uarea = area1 + area2 - carea 45 | return carea / uarea 46 | 47 | 48 | def get_region_boxes(boxes_and_confs): 49 | 50 | # print('Getting boxes from boxes and confs ...') 51 | 52 | boxes_list = [] 53 | confs_list = [] 54 | 55 | for item in boxes_and_confs: 56 | boxes_list.append(item[0]) 57 | confs_list.append(item[1]) 58 | 59 | # boxes: [batch, num1 + num2 + num3, 1, 4] 60 | # confs: [batch, num1 + num2 + num3, num_classes] 61 | boxes = torch.cat(boxes_list, dim=1) 62 | confs = torch.cat(confs_list, dim=1) 63 | 64 | return [boxes, confs] 65 | 66 | 67 | def convert2cpu(gpu_matrix): 68 | return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) 69 | 70 | 71 | def convert2cpu_long(gpu_matrix): 72 | return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) 73 | 74 | 75 | 76 | def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1): 77 | model.eval() 78 | with torch.no_grad(): 79 | t0 = time.time() 80 | 81 | if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image 82 | img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) 83 | elif type(img) == np.ndarray and len(img.shape) == 4: 84 | img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) 85 | else: 86 | print("unknow image type") 87 | exit(-1) 88 | 89 | if use_cuda: 90 | img = img.cuda() 91 | img = torch.autograd.Variable(img) 92 | 93 | t1 = time.time() 94 | 95 | output = model(img) 96 | 97 | t2 = time.time() 98 | 99 | print('-----------------------------------') 100 | print(' Preprocess : %f' % (t1 - t0)) 101 | print(' Model Inference : %f' % (t2 - t1)) 102 | print('-----------------------------------') 103 | 104 | return utils.post_processing(img, conf_thresh, nms_thresh, output) 105 | 106 | -------------------------------------------------------------------------------- /tool/tv_reference/README.md: -------------------------------------------------------------------------------- 1 | # Object detection reference training scripts 2 | 3 | This folder contains reference training scripts for object detection. 4 | They serve as a log of how to train specific models, to provide baseline 5 | training and evaluation scripts to quickly bootstrap research. 6 | 7 | To execute the example commands below you must install the following: 8 | 9 | ``` 10 | cython 11 | pycocotools 12 | matplotlib 13 | ``` 14 | 15 | You must modify the following flags: 16 | 17 | `--data-path=/path/to/coco/dataset` 18 | 19 | `--nproc_per_node=` 20 | 21 | Except otherwise noted, all models have been trained on 8x V100 GPUs. 
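
For concreteness, a filled-in launch command might look like the following; the dataset path and GPU count are placeholders for your own setup, and per the note in `train.py` the default learning rate is tuned for 8 GPUs, so scale it as 0.02/8 × NGPU when changing `--nproc_per_node` (0.01 for 4 GPUs):

```
python -m torch.distributed.launch --nproc_per_node=4 --use_env train.py\
    --data-path=/datasets/coco --dataset coco --model fasterrcnn_resnet50_fpn\
    --epochs 26 --lr-steps 16 22 --aspect-ratio-group-factor 3 --lr 0.01
```
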
22 | 23 | ### Faster R-CNN 24 | ``` 25 | python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ 26 | --dataset coco --model fasterrcnn_resnet50_fpn --epochs 26\ 27 | --lr-steps 16 22 --aspect-ratio-group-factor 3 28 | ``` 29 | 30 | 31 | ### Mask R-CNN 32 | ``` 33 | python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ 34 | --dataset coco --model maskrcnn_resnet50_fpn --epochs 26\ 35 | --lr-steps 16 22 --aspect-ratio-group-factor 3 36 | ``` 37 | 38 | 39 | ### Keypoint R-CNN 40 | ``` 41 | python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ 42 | --dataset coco_kp --model keypointrcnn_resnet50_fpn --epochs 46\ 43 | --lr-steps 36 43 --aspect-ratio-group-factor 3 44 | ``` 45 | 46 | -------------------------------------------------------------------------------- /tool/tv_reference/coco_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from PIL import Image 4 | 5 | import torch 6 | import torch.utils.data 7 | import torchvision 8 | 9 | from pycocotools import mask as coco_mask 10 | from pycocotools.coco import COCO 11 | 12 | from . import transforms as T 13 | 14 | 15 | class FilterAndRemapCocoCategories(object): 16 | def __init__(self, categories, remap=True): 17 | self.categories = categories 18 | self.remap = remap 19 | 20 | def __call__(self, image, target): 21 | anno = target["annotations"] 22 | anno = [obj for obj in anno if obj["category_id"] in self.categories] 23 | if not self.remap: 24 | target["annotations"] = anno 25 | return image, target 26 | anno = copy.deepcopy(anno) 27 | for obj in anno: 28 | obj["category_id"] = self.categories.index(obj["category_id"]) 29 | target["annotations"] = anno 30 | return image, target 31 | 32 | 33 | def convert_coco_poly_to_mask(segmentations, height, width): 34 | masks = [] 35 | for polygons in segmentations: 36 | rles = coco_mask.frPyObjects(polygons, height, width) 37 | mask = coco_mask.decode(rles) 38 | if len(mask.shape) < 3: 39 | mask = mask[..., None] 40 | mask = torch.as_tensor(mask, dtype=torch.uint8) 41 | mask = mask.any(dim=2) 42 | masks.append(mask) 43 | if masks: 44 | masks = torch.stack(masks, dim=0) 45 | else: 46 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 47 | return masks 48 | 49 | 50 | class ConvertCocoPolysToMask(object): 51 | def __call__(self, image, target): 52 | w, h = image.size 53 | 54 | image_id = target["image_id"] 55 | image_id = torch.tensor([image_id]) 56 | 57 | anno = target["annotations"] 58 | 59 | anno = [obj for obj in anno if obj['iscrowd'] == 0] 60 | 61 | boxes = [obj["bbox"] for obj in anno] 62 | # guard against no boxes via resizing 63 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 64 | boxes[:, 2:] += boxes[:, :2] 65 | boxes[:, 0::2].clamp_(min=0, max=w) 66 | boxes[:, 1::2].clamp_(min=0, max=h) 67 | 68 | classes = [obj["category_id"] for obj in anno] 69 | classes = torch.tensor(classes, dtype=torch.int64) 70 | 71 | segmentations = [obj["segmentation"] for obj in anno] 72 | masks = convert_coco_poly_to_mask(segmentations, h, w) 73 | 74 | keypoints = None 75 | if anno and "keypoints" in anno[0]: 76 | keypoints = [obj["keypoints"] for obj in anno] 77 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 78 | num_keypoints = keypoints.shape[0] 79 | if num_keypoints: 80 | keypoints = keypoints.view(num_keypoints, -1, 3) 81 | 82 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 83 | boxes = boxes[keep] 84 | classes = 
classes[keep] 85 | masks = masks[keep] 86 | if keypoints is not None: 87 | keypoints = keypoints[keep] 88 | 89 | target = {} 90 | target["boxes"] = boxes 91 | target["labels"] = classes 92 | target["masks"] = masks 93 | target["image_id"] = image_id 94 | if keypoints is not None: 95 | target["keypoints"] = keypoints 96 | 97 | # for conversion to coco api 98 | area = torch.tensor([obj["area"] for obj in anno]) 99 | iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) 100 | target["area"] = area 101 | target["iscrowd"] = iscrowd 102 | 103 | return image, target 104 | 105 | 106 | def _coco_remove_images_without_annotations(dataset, cat_list=None): 107 | def _has_only_empty_bbox(anno): 108 | return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) 109 | 110 | def _count_visible_keypoints(anno): 111 | return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) 112 | 113 | min_keypoints_per_image = 10 114 | 115 | def _has_valid_annotation(anno): 116 | # if it's empty, there is no annotation 117 | if len(anno) == 0: 118 | return False 119 | # if all boxes have close to zero area, there is no annotation 120 | if _has_only_empty_bbox(anno): 121 | return False 122 | # keypoints task have a slight different critera for considering 123 | # if an annotation is valid 124 | if "keypoints" not in anno[0]: 125 | return True 126 | # for keypoint detection tasks, only consider valid images those 127 | # containing at least min_keypoints_per_image 128 | if _count_visible_keypoints(anno) >= min_keypoints_per_image: 129 | return True 130 | return False 131 | 132 | assert isinstance(dataset, torchvision.datasets.CocoDetection) 133 | ids = [] 134 | for ds_idx, img_id in enumerate(dataset.ids): 135 | ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) 136 | anno = dataset.coco.loadAnns(ann_ids) 137 | if cat_list: 138 | anno = [obj for obj in anno if obj["category_id"] in cat_list] 139 | if _has_valid_annotation(anno): 140 | ids.append(ds_idx) 141 | 142 | dataset = torch.utils.data.Subset(dataset, ids) 143 | return dataset 144 | 145 | 146 | def convert_to_coco_api(ds, bbox_fmt='voc'): 147 | """ 148 | """ 149 | print("in function convert_to_coco_api...") 150 | coco_ds = COCO() 151 | # annotation IDs need to start at 1, not 0, see torchvision issue #1530 152 | ann_id = 1 153 | dataset = {'images': [], 'categories': [], 'annotations': []} 154 | categories = set() 155 | for img_idx in range(len(ds)): 156 | # find better way to get target 157 | # targets = ds.get_annotations(img_idx) 158 | img, targets = ds[img_idx] 159 | image_id = targets["image_id"].item() 160 | img_dict = {} 161 | img_dict['id'] = image_id 162 | img_dict['height'] = img.shape[-2] 163 | img_dict['width'] = img.shape[-1] 164 | dataset['images'].append(img_dict) 165 | bboxes = targets["boxes"] 166 | # to coco format: xmin, ymin, w, h 167 | if bbox_fmt.lower() == "voc": # xmin, ymin, xmax, ymax 168 | bboxes[:, 2:] -= bboxes[:, :2] 169 | elif bbox_fmt.lower() == "yolo": # xcen, ycen, w, h 170 | bboxes[:, :2] = bboxes[:, :2] - bboxes[:, 2:]/2 171 | elif bbox_fmt.lower() == "coco": 172 | pass 173 | else: 174 | raise ValueError(f"bounding box format {bbox_fmt} not supported!") 175 | bboxes = bboxes.tolist() 176 | labels = targets['labels'].tolist() 177 | areas = targets['area'].tolist() 178 | iscrowd = targets['iscrowd'].tolist() 179 | if 'masks' in targets: 180 | masks = targets['masks'] 181 | # make masks Fortran contiguous for coco_mask 182 | masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) 183 | if 
'keypoints' in targets: 184 | keypoints = targets['keypoints'] 185 | keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() 186 | num_objs = len(bboxes) 187 | for i in range(num_objs): 188 | ann = {} 189 | ann['image_id'] = image_id 190 | ann['bbox'] = bboxes[i] 191 | ann['category_id'] = labels[i] 192 | categories.add(labels[i]) 193 | ann['area'] = areas[i] 194 | ann['iscrowd'] = iscrowd[i] 195 | ann['id'] = ann_id 196 | if 'masks' in targets: 197 | ann["segmentation"] = coco_mask.encode(masks[i].numpy()) 198 | if 'keypoints' in targets: 199 | ann['keypoints'] = keypoints[i] 200 | ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3]) 201 | dataset['annotations'].append(ann) 202 | ann_id += 1 203 | dataset['categories'] = [{'id': i} for i in sorted(categories)] 204 | coco_ds.dataset = dataset 205 | coco_ds.createIndex() 206 | return coco_ds 207 | 208 | 209 | def get_coco_api_from_dataset(dataset): 210 | for _ in range(10): 211 | if isinstance(dataset, torchvision.datasets.CocoDetection): 212 | break 213 | if isinstance(dataset, torch.utils.data.Subset): 214 | dataset = dataset.dataset 215 | if isinstance(dataset, torchvision.datasets.CocoDetection): 216 | return dataset.coco 217 | return convert_to_coco_api(dataset) 218 | 219 | 220 | class CocoDetection(torchvision.datasets.CocoDetection): 221 | def __init__(self, img_folder, ann_file, transforms): 222 | super(CocoDetection, self).__init__(img_folder, ann_file) 223 | self._transforms = transforms 224 | 225 | def __getitem__(self, idx): 226 | img, target = super(CocoDetection, self).__getitem__(idx) 227 | image_id = self.ids[idx] 228 | target = dict(image_id=image_id, annotations=target) 229 | if self._transforms is not None: 230 | img, target = self._transforms(img, target) 231 | return img, target 232 | 233 | 234 | def get_coco(root, image_set, transforms, mode='instances'): 235 | anno_file_template = "{}_{}2017.json" 236 | PATHS = { 237 | "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), 238 | "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), 239 | # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) 240 | } 241 | 242 | t = [ConvertCocoPolysToMask()] 243 | 244 | if transforms is not None: 245 | t.append(transforms) 246 | transforms = T.Compose(t) 247 | 248 | img_folder, ann_file = PATHS[image_set] 249 | img_folder = os.path.join(root, img_folder) 250 | ann_file = os.path.join(root, ann_file) 251 | 252 | dataset = CocoDetection(img_folder, ann_file, transforms=transforms) 253 | 254 | if image_set == "train": 255 | dataset = _coco_remove_images_without_annotations(dataset) 256 | 257 | # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) 258 | 259 | return dataset 260 | 261 | 262 | def get_coco_kp(root, image_set, transforms): 263 | return get_coco(root, image_set, transforms, mode="person_keypoints") 264 | -------------------------------------------------------------------------------- /tool/tv_reference/engine.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | import time 4 | import torch 5 | 6 | import torchvision.models.detection.mask_rcnn 7 | 8 | from .coco_utils import get_coco_api_from_dataset 9 | from .coco_eval import CocoEvaluator 10 | from . 
import utils 11 | 12 | 13 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): 14 | model.train() 15 | metric_logger = utils.MetricLogger(delimiter=" ") 16 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 17 | header = 'Epoch: [{}]'.format(epoch) 18 | 19 | lr_scheduler = None 20 | if epoch == 0: 21 | warmup_factor = 1. / 1000 22 | warmup_iters = min(1000, len(data_loader) - 1) 23 | 24 | lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) 25 | 26 | for images, targets in metric_logger.log_every(data_loader, print_freq, header): 27 | images = list(image.to(device) for image in images) 28 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 29 | 30 | loss_dict = model(images, targets) 31 | 32 | losses = sum(loss for loss in loss_dict.values()) 33 | 34 | # reduce losses over all GPUs for logging purposes 35 | loss_dict_reduced = utils.reduce_dict(loss_dict) 36 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 37 | 38 | loss_value = losses_reduced.item() 39 | 40 | if not math.isfinite(loss_value): 41 | print("Loss is {}, stopping training".format(loss_value)) 42 | print(loss_dict_reduced) 43 | sys.exit(1) 44 | 45 | optimizer.zero_grad() 46 | losses.backward() 47 | optimizer.step() 48 | 49 | if lr_scheduler is not None: 50 | lr_scheduler.step() 51 | 52 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced) 53 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 54 | 55 | return metric_logger 56 | 57 | 58 | def _get_iou_types(model): 59 | model_without_ddp = model 60 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 61 | model_without_ddp = model.module 62 | iou_types = ["bbox"] 63 | if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): 64 | iou_types.append("segm") 65 | if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): 66 | iou_types.append("keypoints") 67 | return iou_types 68 | 69 | 70 | @torch.no_grad() 71 | def evaluate(model, data_loader, device): 72 | n_threads = torch.get_num_threads() 73 | # FIXME remove this and make paste_masks_in_image run on the GPU 74 | torch.set_num_threads(1) 75 | cpu_device = torch.device("cpu") 76 | model.eval() 77 | metric_logger = utils.MetricLogger(delimiter=" ") 78 | header = 'Test:' 79 | 80 | coco = get_coco_api_from_dataset(data_loader.dataset) 81 | iou_types = _get_iou_types(model) 82 | coco_evaluator = CocoEvaluator(coco, iou_types) 83 | 84 | for images, targets in metric_logger.log_every(data_loader, 100, header): 85 | images = list(img.to(device) for img in images) 86 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 87 | 88 | torch.cuda.synchronize() 89 | model_time = time.time() 90 | outputs = model(images) 91 | 92 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] 93 | model_time = time.time() - model_time 94 | 95 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} 96 | evaluator_time = time.time() 97 | coco_evaluator.update(res) 98 | evaluator_time = time.time() - evaluator_time 99 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) 100 | 101 | # gather the stats from all processes 102 | metric_logger.synchronize_between_processes() 103 | print("Averaged stats:", metric_logger) 104 | coco_evaluator.synchronize_between_processes() 105 | 106 | # accumulate predictions from all images 107 | coco_evaluator.accumulate() 108 | 
coco_evaluator.summarize() 109 | torch.set_num_threads(n_threads) 110 | return coco_evaluator 111 | -------------------------------------------------------------------------------- /tool/tv_reference/group_by_aspect_ratio.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | from collections import defaultdict 3 | import copy 4 | from itertools import repeat, chain 5 | import math 6 | import numpy as np 7 | 8 | import torch 9 | import torch.utils.data 10 | from torch.utils.data.sampler import BatchSampler, Sampler 11 | from torch.utils.model_zoo import tqdm 12 | import torchvision 13 | 14 | from PIL import Image 15 | 16 | 17 | def _repeat_to_at_least(iterable, n): 18 | repeat_times = math.ceil(n / len(iterable)) 19 | repeated = chain.from_iterable(repeat(iterable, repeat_times)) 20 | return list(repeated) 21 | 22 | 23 | class GroupedBatchSampler(BatchSampler): 24 | """ 25 | Wraps another sampler to yield a mini-batch of indices. 26 | It enforces that the batch only contain elements from the same group. 27 | It also tries to provide mini-batches which follows an ordering which is 28 | as close as possible to the ordering from the original sampler. 29 | Arguments: 30 | sampler (Sampler): Base sampler. 31 | group_ids (list[int]): If the sampler produces indices in range [0, N), 32 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 33 | The group ids must be a continuous set of integers starting from 34 | 0, i.e. they must be in the range [0, num_groups). 35 | batch_size (int): Size of mini-batch. 36 | """ 37 | def __init__(self, sampler, group_ids, batch_size): 38 | if not isinstance(sampler, Sampler): 39 | raise ValueError( 40 | "sampler should be an instance of " 41 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 42 | ) 43 | self.sampler = sampler 44 | self.group_ids = group_ids 45 | self.batch_size = batch_size 46 | 47 | def __iter__(self): 48 | buffer_per_group = defaultdict(list) 49 | samples_per_group = defaultdict(list) 50 | 51 | num_batches = 0 52 | for idx in self.sampler: 53 | group_id = self.group_ids[idx] 54 | buffer_per_group[group_id].append(idx) 55 | samples_per_group[group_id].append(idx) 56 | if len(buffer_per_group[group_id]) == self.batch_size: 57 | yield buffer_per_group[group_id] 58 | num_batches += 1 59 | del buffer_per_group[group_id] 60 | assert len(buffer_per_group[group_id]) < self.batch_size 61 | 62 | # now we have run out of elements that satisfy 63 | # the group criteria, let's return the remaining 64 | # elements so that the size of the sampler is 65 | # deterministic 66 | expected_num_batches = len(self) 67 | num_remaining = expected_num_batches - num_batches 68 | if num_remaining > 0: 69 | # for the remaining batches, take first the buffers with largest number 70 | # of elements 71 | for group_id, _ in sorted(buffer_per_group.items(), 72 | key=lambda x: len(x[1]), reverse=True): 73 | remaining = self.batch_size - len(buffer_per_group[group_id]) 74 | samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining) 75 | buffer_per_group[group_id].extend(samples_from_group_id[:remaining]) 76 | assert len(buffer_per_group[group_id]) == self.batch_size 77 | yield buffer_per_group[group_id] 78 | num_remaining -= 1 79 | if num_remaining == 0: 80 | break 81 | assert num_remaining == 0 82 | 83 | def __len__(self): 84 | return len(self.sampler) // self.batch_size 85 | 86 | 87 | def _compute_aspect_ratios_slow(dataset, indices=None): 88 | print("Your 
dataset doesn't support the fast path for " 89 | "computing the aspect ratios, so will iterate over " 90 | "the full dataset and load every image instead. " 91 | "This might take some time...") 92 | if indices is None: 93 | indices = range(len(dataset)) 94 | 95 | class SubsetSampler(Sampler): 96 | def __init__(self, indices): 97 | self.indices = indices 98 | 99 | def __iter__(self): 100 | return iter(self.indices) 101 | 102 | def __len__(self): 103 | return len(self.indices) 104 | 105 | sampler = SubsetSampler(indices) 106 | data_loader = torch.utils.data.DataLoader( 107 | dataset, batch_size=1, sampler=sampler, 108 | num_workers=14, # you might want to increase it for faster processing 109 | collate_fn=lambda x: x[0]) 110 | aspect_ratios = [] 111 | with tqdm(total=len(dataset)) as pbar: 112 | for _i, (img, _) in enumerate(data_loader): 113 | pbar.update(1) 114 | height, width = img.shape[-2:] 115 | aspect_ratio = float(width) / float(height) 116 | aspect_ratios.append(aspect_ratio) 117 | return aspect_ratios 118 | 119 | 120 | def _compute_aspect_ratios_custom_dataset(dataset, indices=None): 121 | if indices is None: 122 | indices = range(len(dataset)) 123 | aspect_ratios = [] 124 | for i in indices: 125 | height, width = dataset.get_height_and_width(i) 126 | aspect_ratio = float(width) / float(height) 127 | aspect_ratios.append(aspect_ratio) 128 | return aspect_ratios 129 | 130 | 131 | def _compute_aspect_ratios_coco_dataset(dataset, indices=None): 132 | if indices is None: 133 | indices = range(len(dataset)) 134 | aspect_ratios = [] 135 | for i in indices: 136 | img_info = dataset.coco.imgs[dataset.ids[i]] 137 | aspect_ratio = float(img_info["width"]) / float(img_info["height"]) 138 | aspect_ratios.append(aspect_ratio) 139 | return aspect_ratios 140 | 141 | 142 | def _compute_aspect_ratios_voc_dataset(dataset, indices=None): 143 | if indices is None: 144 | indices = range(len(dataset)) 145 | aspect_ratios = [] 146 | for i in indices: 147 | # this doesn't load the data into memory, because PIL loads it lazily 148 | width, height = Image.open(dataset.images[i]).size 149 | aspect_ratio = float(width) / float(height) 150 | aspect_ratios.append(aspect_ratio) 151 | return aspect_ratios 152 | 153 | 154 | def _compute_aspect_ratios_subset_dataset(dataset, indices=None): 155 | if indices is None: 156 | indices = range(len(dataset)) 157 | 158 | ds_indices = [dataset.indices[i] for i in indices] 159 | return compute_aspect_ratios(dataset.dataset, ds_indices) 160 | 161 | 162 | def compute_aspect_ratios(dataset, indices=None): 163 | if hasattr(dataset, "get_height_and_width"): 164 | return _compute_aspect_ratios_custom_dataset(dataset, indices) 165 | 166 | if isinstance(dataset, torchvision.datasets.CocoDetection): 167 | return _compute_aspect_ratios_coco_dataset(dataset, indices) 168 | 169 | if isinstance(dataset, torchvision.datasets.VOCDetection): 170 | return _compute_aspect_ratios_voc_dataset(dataset, indices) 171 | 172 | if isinstance(dataset, torch.utils.data.Subset): 173 | return _compute_aspect_ratios_subset_dataset(dataset, indices) 174 | 175 | # slow path 176 | return _compute_aspect_ratios_slow(dataset, indices) 177 | 178 | 179 | def _quantize(x, bins): 180 | bins = copy.deepcopy(bins) 181 | bins = sorted(bins) 182 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 183 | return quantized 184 | 185 | 186 | def create_aspect_ratio_groups(dataset, k=0): 187 | aspect_ratios = compute_aspect_ratios(dataset) 188 | bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 
else [1.0] 189 | groups = _quantize(aspect_ratios, bins) 190 | # count number of elements per group 191 | counts = np.unique(groups, return_counts=True)[1] 192 | fbins = [0] + bins + [np.inf] 193 | print("Using {} as bins for aspect ratio quantization".format(fbins)) 194 | print("Count of instances per bin: {}".format(counts)) 195 | return groups 196 | -------------------------------------------------------------------------------- /tool/tv_reference/train.py: -------------------------------------------------------------------------------- 1 | r"""PyTorch Detection Training. 2 | 3 | To run in a multi-gpu environment, use the distributed launcher:: 4 | 5 | python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \ 6 | train.py ... --world-size $NGPU 7 | 8 | The default hyperparameters are tuned for training on 8 gpus and 2 images per gpu. 9 | --lr 0.02 --batch-size 2 --world-size 8 10 | If you use different number of gpus, the learning rate should be changed to 0.02/8*$NGPU. 11 | 12 | On top of that, for training Faster/Mask R-CNN, the default hyperparameters are 13 | --epochs 26 --lr-steps 16 22 --aspect-ratio-group-factor 3 14 | 15 | Also, if you train Keypoint R-CNN, the default hyperparameters are 16 | --epochs 46 --lr-steps 36 43 --aspect-ratio-group-factor 3 17 | Because the number of images is smaller in the person keypoint subset of COCO, 18 | the number of epochs should be adapted so that we have the same number of iterations. 19 | """ 20 | import datetime 21 | import os 22 | import time 23 | 24 | import torch 25 | import torch.utils.data 26 | from torch import nn 27 | import torchvision 28 | import torchvision.models.detection 29 | import torchvision.models.detection.mask_rcnn 30 | 31 | from .coco_utils import get_coco, get_coco_kp 32 | 33 | from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups 34 | from .engine import train_one_epoch, evaluate 35 | 36 | from . import utils 37 | from . 
import transforms as T 38 | 39 | 40 | def get_dataset(name, image_set, transform, data_path): 41 | paths = { 42 | "coco": (data_path, get_coco, 91), 43 | "coco_kp": (data_path, get_coco_kp, 2) 44 | } 45 | p, ds_fn, num_classes = paths[name] 46 | 47 | ds = ds_fn(p, image_set=image_set, transforms=transform) 48 | return ds, num_classes 49 | 50 | 51 | def get_transform(train): 52 | transforms = [] 53 | transforms.append(T.ToTensor()) 54 | if train: 55 | transforms.append(T.RandomHorizontalFlip(0.5)) 56 | return T.Compose(transforms) 57 | 58 | 59 | def main(args): 60 | utils.init_distributed_mode(args) 61 | print(args) 62 | 63 | device = torch.device(args.device) 64 | 65 | # Data loading code 66 | print("Loading data") 67 | 68 | dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path) 69 | dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path) 70 | 71 | print("Creating data loaders") 72 | if args.distributed: 73 | train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) 74 | test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test) 75 | else: 76 | train_sampler = torch.utils.data.RandomSampler(dataset) 77 | test_sampler = torch.utils.data.SequentialSampler(dataset_test) 78 | 79 | if args.aspect_ratio_group_factor >= 0: 80 | group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor) 81 | train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) 82 | else: 83 | train_batch_sampler = torch.utils.data.BatchSampler( 84 | train_sampler, args.batch_size, drop_last=True) 85 | 86 | data_loader = torch.utils.data.DataLoader( 87 | dataset, batch_sampler=train_batch_sampler, num_workers=args.workers, 88 | collate_fn=utils.collate_fn) 89 | 90 | data_loader_test = torch.utils.data.DataLoader( 91 | dataset_test, batch_size=1, 92 | sampler=test_sampler, num_workers=args.workers, 93 | collate_fn=utils.collate_fn) 94 | 95 | print("Creating model") 96 | model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes, 97 | pretrained=args.pretrained) 98 | model.to(device) 99 | 100 | model_without_ddp = model 101 | if args.distributed: 102 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 103 | model_without_ddp = model.module 104 | 105 | params = [p for p in model.parameters() if p.requires_grad] 106 | optimizer = torch.optim.SGD( 107 | params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 108 | 109 | # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) 110 | lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) 111 | 112 | if args.resume: 113 | checkpoint = torch.load(args.resume, map_location='cpu') 114 | model_without_ddp.load_state_dict(checkpoint['model']) 115 | optimizer.load_state_dict(checkpoint['optimizer']) 116 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 117 | args.start_epoch = checkpoint['epoch'] + 1 118 | 119 | if args.test_only: 120 | evaluate(model, data_loader_test, device=device) 121 | return 122 | 123 | print("Start training") 124 | start_time = time.time() 125 | for epoch in range(args.start_epoch, args.epochs): 126 | if args.distributed: 127 | train_sampler.set_epoch(epoch) 128 | train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) 129 | lr_scheduler.step() 130 | if args.output_dir: 131 | 
utils.save_on_master({ 132 | 'model': model_without_ddp.state_dict(), 133 | 'optimizer': optimizer.state_dict(), 134 | 'lr_scheduler': lr_scheduler.state_dict(), 135 | 'args': args, 136 | 'epoch': epoch}, 137 | os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) 138 | 139 | # evaluate after every epoch 140 | evaluate(model, data_loader_test, device=device) 141 | 142 | total_time = time.time() - start_time 143 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 144 | print('Training time {}'.format(total_time_str)) 145 | 146 | 147 | if __name__ == "__main__": 148 | import argparse 149 | parser = argparse.ArgumentParser( 150 | description=__doc__) 151 | 152 | parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset') 153 | parser.add_argument('--dataset', default='coco', help='dataset') 154 | parser.add_argument('--model', default='maskrcnn_resnet50_fpn', help='model') 155 | parser.add_argument('--device', default='cuda', help='device') 156 | parser.add_argument('-b', '--batch-size', default=2, type=int, 157 | help='images per gpu, the total batch size is $NGPU x batch_size') 158 | parser.add_argument('--epochs', default=26, type=int, metavar='N', 159 | help='number of total epochs to run') 160 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 161 | help='number of data loading workers (default: 4)') 162 | parser.add_argument('--lr', default=0.02, type=float, 163 | help='initial learning rate, 0.02 is the default value for training ' 164 | 'on 8 gpus and 2 images_per_gpu') 165 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 166 | help='momentum') 167 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 168 | metavar='W', help='weight decay (default: 1e-4)', 169 | dest='weight_decay') 170 | parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs') 171 | parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='decrease lr every step-size epochs') 172 | parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma') 173 | parser.add_argument('--print-freq', default=20, type=int, help='print frequency') 174 | parser.add_argument('--output-dir', default='.', help='path where to save') 175 | parser.add_argument('--resume', default='', help='resume from checkpoint') 176 | parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') 177 | parser.add_argument('--aspect-ratio-group-factor', default=3, type=int) 178 | parser.add_argument( 179 | "--test-only", 180 | dest="test_only", 181 | help="Only test the model", 182 | action="store_true", 183 | ) 184 | parser.add_argument( 185 | "--pretrained", 186 | dest="pretrained", 187 | help="Use pre-trained models from the modelzoo", 188 | action="store_true", 189 | ) 190 | 191 | # distributed training parameters 192 | parser.add_argument('--world-size', default=1, type=int, 193 | help='number of distributed processes') 194 | parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') 195 | 196 | args = parser.parse_args() 197 | 198 | if args.output_dir: 199 | utils.mkdir(args.output_dir) 200 | 201 | main(args) 202 | -------------------------------------------------------------------------------- /tool/tv_reference/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | from 
torchvision.transforms import functional as F 5 | 6 | 7 | def _flip_coco_person_keypoints(kps, width): 8 | flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] 9 | flipped_data = kps[:, flip_inds] 10 | flipped_data[..., 0] = width - flipped_data[..., 0] 11 | # Maintain COCO convention that if visibility == 0, then x, y = 0 12 | inds = flipped_data[..., 2] == 0 13 | flipped_data[inds] = 0 14 | return flipped_data 15 | 16 | 17 | class Compose(object): 18 | def __init__(self, transforms): 19 | self.transforms = transforms 20 | 21 | def __call__(self, image, target): 22 | for t in self.transforms: 23 | image, target = t(image, target) 24 | return image, target 25 | 26 | 27 | class RandomHorizontalFlip(object): 28 | def __init__(self, prob): 29 | self.prob = prob 30 | 31 | def __call__(self, image, target): 32 | if random.random() < self.prob: 33 | height, width = image.shape[-2:] 34 | image = image.flip(-1) 35 | bbox = target["boxes"] 36 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]] 37 | target["boxes"] = bbox 38 | if "masks" in target: 39 | target["masks"] = target["masks"].flip(-1) 40 | if "keypoints" in target: 41 | keypoints = target["keypoints"] 42 | keypoints = _flip_coco_person_keypoints(keypoints, width) 43 | target["keypoints"] = keypoints 44 | return image, target 45 | 46 | 47 | class ToTensor(object): 48 | def __call__(self, image, target): 49 | image = F.to_tensor(image) 50 | return image, target 51 | -------------------------------------------------------------------------------- /tool/tv_reference/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, deque 2 | import datetime 3 | import pickle 4 | import time 5 | 6 | import torch 7 | import torch.distributed as dist 8 | 9 | import errno 10 | import os 11 | 12 | 13 | class SmoothedValue(object): 14 | """Track a series of values and provide access to smoothed values over a 15 | window or the global series average. 16 | """ 17 | 18 | def __init__(self, window_size=20, fmt=None): 19 | if fmt is None: 20 | fmt = "{median:.4f} ({global_avg:.4f})" 21 | self.deque = deque(maxlen=window_size) 22 | self.total = 0.0 23 | self.count = 0 24 | self.fmt = fmt 25 | 26 | def update(self, value, n=1): 27 | self.deque.append(value) 28 | self.count += n 29 | self.total += value * n 30 | 31 | def synchronize_between_processes(self): 32 | """ 33 | Warning: does not synchronize the deque! 
34 | """ 35 | if not is_dist_avail_and_initialized(): 36 | return 37 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 38 | dist.barrier() 39 | dist.all_reduce(t) 40 | t = t.tolist() 41 | self.count = int(t[0]) 42 | self.total = t[1] 43 | 44 | @property 45 | def median(self): 46 | d = torch.tensor(list(self.deque)) 47 | return d.median().item() 48 | 49 | @property 50 | def avg(self): 51 | d = torch.tensor(list(self.deque), dtype=torch.float32) 52 | return d.mean().item() 53 | 54 | @property 55 | def global_avg(self): 56 | return self.total / self.count 57 | 58 | @property 59 | def max(self): 60 | return max(self.deque) 61 | 62 | @property 63 | def value(self): 64 | return self.deque[-1] 65 | 66 | def __str__(self): 67 | return self.fmt.format( 68 | median=self.median, 69 | avg=self.avg, 70 | global_avg=self.global_avg, 71 | max=self.max, 72 | value=self.value) 73 | 74 | 75 | def all_gather(data): 76 | """ 77 | Run all_gather on arbitrary picklable data (not necessarily tensors) 78 | Args: 79 | data: any picklable object 80 | Returns: 81 | list[data]: list of data gathered from each rank 82 | """ 83 | world_size = get_world_size() 84 | if world_size == 1: 85 | return [data] 86 | 87 | # serialized to a Tensor 88 | buffer = pickle.dumps(data) 89 | storage = torch.ByteStorage.from_buffer(buffer) 90 | tensor = torch.ByteTensor(storage).to("cuda") 91 | 92 | # obtain Tensor size of each rank 93 | local_size = torch.tensor([tensor.numel()], device="cuda") 94 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 95 | dist.all_gather(size_list, local_size) 96 | size_list = [int(size.item()) for size in size_list] 97 | max_size = max(size_list) 98 | 99 | # receiving Tensor from all ranks 100 | # we pad the tensor because torch all_gather does not support 101 | # gathering tensors of different shapes 102 | tensor_list = [] 103 | for _ in size_list: 104 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 105 | if local_size != max_size: 106 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 107 | tensor = torch.cat((tensor, padding), dim=0) 108 | dist.all_gather(tensor_list, tensor) 109 | 110 | data_list = [] 111 | for size, tensor in zip(size_list, tensor_list): 112 | buffer = tensor.cpu().numpy().tobytes()[:size] 113 | data_list.append(pickle.loads(buffer)) 114 | 115 | return data_list 116 | 117 | 118 | def reduce_dict(input_dict, average=True): 119 | """ 120 | Args: 121 | input_dict (dict): all the values will be reduced 122 | average (bool): whether to do average or sum 123 | Reduce the values in the dictionary from all processes so that all processes 124 | have the averaged results. Returns a dict with the same fields as 125 | input_dict, after reduction. 
126 | """ 127 | world_size = get_world_size() 128 | if world_size < 2: 129 | return input_dict 130 | with torch.no_grad(): 131 | names = [] 132 | values = [] 133 | # sort the keys so that they are consistent across processes 134 | for k in sorted(input_dict.keys()): 135 | names.append(k) 136 | values.append(input_dict[k]) 137 | values = torch.stack(values, dim=0) 138 | dist.all_reduce(values) 139 | if average: 140 | values /= world_size 141 | reduced_dict = {k: v for k, v in zip(names, values)} 142 | return reduced_dict 143 | 144 | 145 | class MetricLogger(object): 146 | def __init__(self, delimiter="\t"): 147 | self.meters = defaultdict(SmoothedValue) 148 | self.delimiter = delimiter 149 | 150 | def update(self, **kwargs): 151 | for k, v in kwargs.items(): 152 | if isinstance(v, torch.Tensor): 153 | v = v.item() 154 | assert isinstance(v, (float, int)) 155 | self.meters[k].update(v) 156 | 157 | def __getattr__(self, attr): 158 | if attr in self.meters: 159 | return self.meters[attr] 160 | if attr in self.__dict__: 161 | return self.__dict__[attr] 162 | raise AttributeError("'{}' object has no attribute '{}'".format( 163 | type(self).__name__, attr)) 164 | 165 | def __str__(self): 166 | loss_str = [] 167 | for name, meter in self.meters.items(): 168 | loss_str.append( 169 | "{}: {}".format(name, str(meter)) 170 | ) 171 | return self.delimiter.join(loss_str) 172 | 173 | def synchronize_between_processes(self): 174 | for meter in self.meters.values(): 175 | meter.synchronize_between_processes() 176 | 177 | def add_meter(self, name, meter): 178 | self.meters[name] = meter 179 | 180 | def log_every(self, iterable, print_freq, header=None): 181 | i = 0 182 | if not header: 183 | header = '' 184 | start_time = time.time() 185 | end = time.time() 186 | iter_time = SmoothedValue(fmt='{avg:.4f}') 187 | data_time = SmoothedValue(fmt='{avg:.4f}') 188 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 189 | if torch.cuda.is_available(): 190 | log_msg = self.delimiter.join([ 191 | header, 192 | '[{0' + space_fmt + '}/{1}]', 193 | 'eta: {eta}', 194 | '{meters}', 195 | 'time: {time}', 196 | 'data: {data}', 197 | 'max mem: {memory:.0f}' 198 | ]) 199 | else: 200 | log_msg = self.delimiter.join([ 201 | header, 202 | '[{0' + space_fmt + '}/{1}]', 203 | 'eta: {eta}', 204 | '{meters}', 205 | 'time: {time}', 206 | 'data: {data}' 207 | ]) 208 | MB = 1024.0 * 1024.0 209 | for obj in iterable: 210 | data_time.update(time.time() - end) 211 | yield obj 212 | iter_time.update(time.time() - end) 213 | if i % print_freq == 0 or i == len(iterable) - 1: 214 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 215 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 216 | if torch.cuda.is_available(): 217 | print(log_msg.format( 218 | i, len(iterable), eta=eta_string, 219 | meters=str(self), 220 | time=str(iter_time), data=str(data_time), 221 | memory=torch.cuda.max_memory_allocated() / MB)) 222 | else: 223 | print(log_msg.format( 224 | i, len(iterable), eta=eta_string, 225 | meters=str(self), 226 | time=str(iter_time), data=str(data_time))) 227 | i += 1 228 | end = time.time() 229 | total_time = time.time() - start_time 230 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 231 | print('{} Total time: {} ({:.4f} s / it)'.format( 232 | header, total_time_str, total_time / len(iterable))) 233 | 234 | 235 | def collate_fn(batch): 236 | return tuple(zip(*batch)) 237 | 238 | 239 | def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor): 240 | 241 | def f(x): 242 | if x >= 
warmup_iters: 243 | return 1 244 | alpha = float(x) / warmup_iters 245 | return warmup_factor * (1 - alpha) + alpha 246 | 247 | return torch.optim.lr_scheduler.LambdaLR(optimizer, f) 248 | 249 | 250 | def mkdir(path): 251 | try: 252 | os.makedirs(path) 253 | except OSError as e: 254 | if e.errno != errno.EEXIST: 255 | raise 256 | 257 | 258 | def setup_for_distributed(is_master): 259 | """ 260 | This function disables printing when not in master process 261 | """ 262 | import builtins as __builtin__ 263 | builtin_print = __builtin__.print 264 | 265 | def print(*args, **kwargs): 266 | force = kwargs.pop('force', False) 267 | if is_master or force: 268 | builtin_print(*args, **kwargs) 269 | 270 | __builtin__.print = print 271 | 272 | 273 | def is_dist_avail_and_initialized(): 274 | if not dist.is_available(): 275 | return False 276 | if not dist.is_initialized(): 277 | return False 278 | return True 279 | 280 | 281 | def get_world_size(): 282 | if not is_dist_avail_and_initialized(): 283 | return 1 284 | return dist.get_world_size() 285 | 286 | 287 | def get_rank(): 288 | if not is_dist_avail_and_initialized(): 289 | return 0 290 | return dist.get_rank() 291 | 292 | 293 | def is_main_process(): 294 | return get_rank() == 0 295 | 296 | 297 | def save_on_master(*args, **kwargs): 298 | if is_main_process(): 299 | torch.save(*args, **kwargs) 300 | 301 | 302 | def init_distributed_mode(args): 303 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 304 | args.rank = int(os.environ["RANK"]) 305 | args.world_size = int(os.environ['WORLD_SIZE']) 306 | args.gpu = int(os.environ['LOCAL_RANK']) 307 | elif 'SLURM_PROCID' in os.environ: 308 | args.rank = int(os.environ['SLURM_PROCID']) 309 | args.gpu = args.rank % torch.cuda.device_count() 310 | else: 311 | print('Not using distributed mode') 312 | args.distributed = False 313 | return 314 | 315 | args.distributed = True 316 | 317 | torch.cuda.set_device(args.gpu) 318 | args.dist_backend = 'nccl' 319 | print('| distributed init (rank {}): {}'.format( 320 | args.rank, args.dist_url), flush=True) 321 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 322 | world_size=args.world_size, rank=args.rank) 323 | torch.distributed.barrier() 324 | setup_for_distributed(args.rank == 0) 325 | -------------------------------------------------------------------------------- /tool/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | import numpy as np 6 | 7 | import itertools 8 | import struct # get_image_size 9 | import imghdr # get_image_size 10 | 11 | 12 | def sigmoid(x): 13 | return 1.0 / (np.exp(-x) + 1.) 
14 | 15 | 16 | def softmax(x): 17 | x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) 18 | x = x / np.expand_dims(x.sum(axis=1), axis=1) 19 | return x 20 | 21 | 22 | def bbox_iou(box1, box2, x1y1x2y2=True): 23 | 24 | # print('iou box1:', box1) 25 | # print('iou box2:', box2) 26 | 27 | if x1y1x2y2: 28 | mx = min(box1[0], box2[0]) 29 | Mx = max(box1[2], box2[2]) 30 | my = min(box1[1], box2[1]) 31 | My = max(box1[3], box2[3]) 32 | w1 = box1[2] - box1[0] 33 | h1 = box1[3] - box1[1] 34 | w2 = box2[2] - box2[0] 35 | h2 = box2[3] - box2[1] 36 | else: 37 | w1 = box1[2] 38 | h1 = box1[3] 39 | w2 = box2[2] 40 | h2 = box2[3] 41 | 42 | mx = min(box1[0], box2[0]) 43 | Mx = max(box1[0] + w1, box2[0] + w2) 44 | my = min(box1[1], box2[1]) 45 | My = max(box1[1] + h1, box2[1] + h2) 46 | uw = Mx - mx 47 | uh = My - my 48 | cw = w1 + w2 - uw 49 | ch = h1 + h2 - uh 50 | carea = 0 51 | if cw <= 0 or ch <= 0: 52 | return 0.0 53 | 54 | area1 = w1 * h1 55 | area2 = w2 * h2 56 | carea = cw * ch 57 | uarea = area1 + area2 - carea 58 | return carea / uarea 59 | 60 | 61 | def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): 62 | # print(boxes.shape) 63 | x1 = boxes[:, 0] 64 | y1 = boxes[:, 1] 65 | x2 = boxes[:, 2] 66 | y2 = boxes[:, 3] 67 | 68 | areas = (x2 - x1) * (y2 - y1) 69 | order = confs.argsort()[::-1] 70 | 71 | keep = [] 72 | while order.size > 0: 73 | idx_self = order[0] 74 | idx_other = order[1:] 75 | 76 | keep.append(idx_self) 77 | 78 | xx1 = np.maximum(x1[idx_self], x1[idx_other]) 79 | yy1 = np.maximum(y1[idx_self], y1[idx_other]) 80 | xx2 = np.minimum(x2[idx_self], x2[idx_other]) 81 | yy2 = np.minimum(y2[idx_self], y2[idx_other]) 82 | 83 | w = np.maximum(0.0, xx2 - xx1) 84 | h = np.maximum(0.0, yy2 - yy1) 85 | inter = w * h 86 | 87 | if min_mode: 88 | over = inter / np.minimum(areas[order[0]], areas[order[1:]]) 89 | else: 90 | over = inter / (areas[order[0]] + areas[order[1:]] - inter) 91 | 92 | inds = np.where(over <= nms_thresh)[0] 93 | order = order[inds + 1] 94 | 95 | return np.array(keep) 96 | 97 | 98 | 99 | def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): 100 | import cv2 101 | img = np.copy(img) 102 | colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) 103 | 104 | def get_color(c, x, max_val): 105 | ratio = float(x) / max_val * 5 106 | i = int(math.floor(ratio)) 107 | j = int(math.ceil(ratio)) 108 | ratio = ratio - i 109 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 110 | return int(r * 255) 111 | 112 | width = img.shape[1] 113 | height = img.shape[0] 114 | for i in range(len(boxes)): 115 | box = boxes[i] 116 | x1 = int(box[0] * width) 117 | y1 = int(box[1] * height) 118 | x2 = int(box[2] * width) 119 | y2 = int(box[3] * height) 120 | bbox_thick = int(0.6 * (height + width) / 600) 121 | if color: 122 | rgb = color 123 | else: 124 | rgb = (255, 0, 0) 125 | if len(box) >= 7 and class_names: 126 | cls_conf = box[5] 127 | cls_id = box[6] 128 | print('%s: %f' % (class_names[cls_id], cls_conf)) 129 | classes = len(class_names) 130 | offset = cls_id * 123457 % classes 131 | red = get_color(2, offset, classes) 132 | green = get_color(1, offset, classes) 133 | blue = get_color(0, offset, classes) 134 | if color is None: 135 | rgb = (red, green, blue) 136 | msg = str(class_names[cls_id])+" "+str(round(cls_conf,3)) 137 | t_size = cv2.getTextSize(msg, 0, 0.7, thickness=bbox_thick // 2)[0] 138 | c1, c2 = (x1,y1), (x2, y2) 139 | c3 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3) 140 | cv2.rectangle(img, (x1,y1), 
(np.float32(c3[0]), np.float32(c3[1])), rgb, -1) 141 | img = cv2.putText(img, msg, (c1[0], np.float32(c1[1] - 2)), cv2.FONT_HERSHEY_SIMPLEX,0.7, (0,0,0), bbox_thick//2,lineType=cv2.LINE_AA) 142 | 143 | img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, bbox_thick) 144 | if savename: 145 | print("save plot results to %s" % savename) 146 | cv2.imwrite(savename, img) 147 | return img 148 | 149 | 150 | def read_truths(lab_path): 151 | if not os.path.exists(lab_path): 152 | return np.array([]) 153 | if os.path.getsize(lab_path): 154 | truths = np.loadtxt(lab_path) 155 | truths = truths.reshape(truths.size // 5, 5) # to avoid single truth problem; integer division so reshape gets an int 156 | return truths 157 | else: 158 | return np.array([]) 159 | 160 | 161 | def load_class_names(namesfile): 162 | class_names = [] 163 | with open(namesfile, 'r') as fp: 164 | lines = fp.readlines() 165 | for line in lines: 166 | line = line.rstrip() 167 | class_names.append(line) 168 | return class_names 169 | 170 | 171 | 172 | def post_processing(img, conf_thresh, nms_thresh, output): 173 | 174 | # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] 175 | # num_anchors = 9 176 | # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] 177 | # strides = [8, 16, 32] 178 | # anchor_step = len(anchors) // num_anchors 179 | 180 | # [batch, num, 1, 4] 181 | box_array = output[0] 182 | # [batch, num, num_classes] 183 | confs = output[1] 184 | 185 | t1 = time.time() 186 | 187 | if type(box_array).__name__ != 'ndarray': 188 | box_array = box_array.cpu().detach().numpy() 189 | confs = confs.cpu().detach().numpy() 190 | 191 | num_classes = confs.shape[2] 192 | 193 | # [batch, num, 4] 194 | box_array = box_array[:, :, 0] 195 | 196 | # [batch, num, num_classes] --> [batch, num] 197 | max_conf = np.max(confs, axis=2) 198 | max_id = np.argmax(confs, axis=2) 199 | 200 | t2 = time.time() 201 | 202 | bboxes_batch = [] 203 | for i in range(box_array.shape[0]): 204 | 205 | argwhere = max_conf[i] > conf_thresh 206 | l_box_array = box_array[i, argwhere, :] 207 | l_max_conf = max_conf[i, argwhere] 208 | l_max_id = max_id[i, argwhere] 209 | 210 | bboxes = [] 211 | # nms for each class 212 | for j in range(num_classes): 213 | 214 | cls_argwhere = l_max_id == j 215 | ll_box_array = l_box_array[cls_argwhere, :] 216 | ll_max_conf = l_max_conf[cls_argwhere] 217 | ll_max_id = l_max_id[cls_argwhere] 218 | 219 | keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) 220 | 221 | if (keep.size > 0): 222 | ll_box_array = ll_box_array[keep, :] 223 | ll_max_conf = ll_max_conf[keep] 224 | ll_max_id = ll_max_id[keep] 225 | 226 | for k in range(ll_box_array.shape[0]): 227 | bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) 228 | 229 | bboxes_batch.append(bboxes) 230 | 231 | t3 = time.time() 232 | 233 | print('-----------------------------------') 234 | print(' max and argmax : %f' % (t2 - t1)) 235 | print(' nms : %f' % (t3 - t2)) 236 | print('Post processing total : %f' % (t3 - t1)) 237 | print('-----------------------------------') 238 | 239 | return bboxes_batch 240 | -------------------------------------------------------------------------------- /tool/utils_iou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | 4 | ''' 5 | import torch 6 | import os, sys 7 | from torch.nn import functional as F 8 | 9 | import numpy as np 10 | from packaging import version 11 | 12 | 13 | __all__ = [ 14 |
"bboxes_iou", 15 | "bboxes_giou", 16 | "bboxes_diou", 17 | "bboxes_ciou", 18 | ] 19 | 20 | 21 | if version.parse(torch.__version__) >= version.parse('1.5.0'): 22 | def _true_divide(dividend, divisor): 23 | return torch.true_divide(dividend, divisor) 24 | else: 25 | def _true_divide(dividend, divisor): 26 | return dividend / divisor 27 | 28 | def bboxes_iou(bboxes_a, bboxes_b, fmt='voc', iou_type='iou'): 29 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 30 | IoU is calculated as a ratio of area of the intersection 31 | and area of the union. 32 | 33 | Args: 34 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 35 | :math:`N` is the number of bounding boxes. 36 | The dtype should be :obj:`numpy.float32`. 37 | bbox_b (array): An array similar to :obj:`bbox_a`, 38 | whose shape is :math:`(K, 4)`. 39 | The dtype should be :obj:`numpy.float32`. 40 | Returns: 41 | array: 42 | An array whose shape is :math:`(N, K)`. \ 43 | An element at index :math:`(n, k)` contains IoUs between \ 44 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 45 | box in :obj:`bbox_b`. 46 | 47 | from: https://github.com/chainer/chainercv 48 | """ 49 | if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: 50 | raise IndexError 51 | 52 | N, K = bboxes_a.shape[0], bboxes_b.shape[0] 53 | 54 | if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax 55 | # top left 56 | tl_intersect = torch.max( 57 | bboxes_a[:, np.newaxis, :2], 58 | bboxes_b[:, :2] 59 | ) # of shape `(N,K,2)` 60 | # bottom right 61 | br_intersect = torch.min( 62 | bboxes_a[:, np.newaxis, 2:], 63 | bboxes_b[:, 2:] 64 | ) 65 | bb_a = bboxes_a[:, 2:] - bboxes_a[:, :2] 66 | bb_b = bboxes_b[:, 2:] - bboxes_b[:, :2] 67 | # bb_* can also be seen vectors representing box_width, box_height 68 | elif fmt.lower() == 'yolo': # xcen, ycen, w, h 69 | # top left 70 | tl_intersect = torch.max( 71 | bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2, 72 | bboxes_b[:, :2] - bboxes_b[:, 2:] / 2 73 | ) 74 | # bottom right 75 | br_intersect = torch.min( 76 | bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2, 77 | bboxes_b[:, :2] + bboxes_b[:, 2:] / 2 78 | ) 79 | bb_a = bboxes_a[:, 2:] 80 | bb_b = bboxes_b[:, 2:] 81 | elif fmt.lower() == 'coco': # xmin, ymin, w, h 82 | # top left 83 | tl_intersect = torch.max( 84 | bboxes_a[:, np.newaxis, :2], 85 | bboxes_b[:, :2] 86 | ) 87 | # bottom right 88 | br_intersect = torch.min( 89 | bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:], 90 | bboxes_b[:, :2] + bboxes_b[:, 2:] 91 | ) 92 | bb_a = bboxes_a[:, 2:] 93 | bb_b = bboxes_b[:, 2:] 94 | 95 | area_a = torch.prod(bb_a, 1) 96 | area_b = torch.prod(bb_b, 1) 97 | 98 | # torch.prod(input, dim, keepdim=False, dtype=None) → Tensor 99 | # Returns the product of each row of the input tensor in the given dimension dim 100 | # if tl, br does not form a nondegenerate squre, then the corr. 
element in the `prod` would be 0 101 | en = (tl_intersect < br_intersect).type(tl_intersect.type()).prod(dim=2) # shape `(N,K,2)` ---> shape `(N,K)` 102 | 103 | area_intersect = torch.prod(br_intersect - tl_intersect, 2) * en # * ((tl < br).all()) 104 | area_union = (area_a[:, np.newaxis] + area_b - area_intersect) 105 | 106 | iou = _true_divide(area_intersect, area_union) 107 | 108 | if iou_type.lower() == 'iou': 109 | return iou 110 | 111 | if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax 112 | # top left 113 | tl_union = torch.min( 114 | bboxes_a[:, np.newaxis, :2], 115 | bboxes_b[:, :2] 116 | ) # of shape `(N,K,2)` 117 | # bottom right 118 | br_union = torch.max( 119 | bboxes_a[:, np.newaxis, 2:], 120 | bboxes_b[:, 2:] 121 | ) 122 | elif fmt.lower() == 'yolo': # xcen, ycen, w, h 123 | # top left 124 | tl_union = torch.min( 125 | bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2, 126 | bboxes_b[:, :2] - bboxes_b[:, 2:] / 2 127 | ) 128 | # bottom right 129 | br_union = torch.max( 130 | bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2, 131 | bboxes_b[:, :2] + bboxes_b[:, 2:] / 2 132 | ) 133 | elif fmt.lower() == 'coco': # xmin, ymin, w, h 134 | # top left 135 | tl_union = torch.min( 136 | bboxes_a[:, np.newaxis, :2], 137 | bboxes_b[:, :2] 138 | ) 139 | # bottom right 140 | br_union = torch.max( 141 | bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:], 142 | bboxes_b[:, :2] + bboxes_b[:, 2:] 143 | ) 144 | 145 | # c for covering, of shape `(N,K,2)` 146 | # the last dim is box width, box hight 147 | bboxes_c = br_union - tl_union 148 | 149 | area_covering = torch.prod(bboxes_c, 2) # shape `(N,K)` 150 | 151 | giou = iou - _true_divide(area_covering - area_union, area_covering) 152 | 153 | if iou_type.lower() == 'giou': 154 | return giou 155 | 156 | if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax 157 | centre_a = (bboxes_a[..., 2 :] + bboxes_a[..., : 2]) / 2 158 | centre_b = (bboxes_b[..., 2 :] + bboxes_b[..., : 2]) / 2 159 | elif fmt.lower() == 'yolo': # xcen, ycen, w, h 160 | centre_a = bboxes_a[..., : 2] 161 | centre_b = bboxes_b[..., : 2] 162 | elif fmt.lower() == 'coco': # xmin, ymin, w, h 163 | centre_a = bboxes_a[..., 2 :] + bboxes_a[..., : 2]/2 164 | centre_b = bboxes_b[..., 2 :] + bboxes_b[..., : 2]/2 165 | 166 | centre_dist = torch.norm(centre_a[:, np.newaxis] - centre_b, p='fro', dim=2) 167 | diag_len = torch.norm(bboxes_c, p='fro', dim=2) 168 | 169 | diou = iou - _true_divide(centre_dist.pow(2), diag_len.pow(2)) 170 | 171 | if iou_type.lower() == 'diou': 172 | return diou 173 | 174 | """ the legacy custom cosine similarity: 175 | 176 | # bb_a of shape `(N,2)`, bb_b of shape `(K,2)` 177 | v = torch.einsum('nm,km->nk', bb_a, bb_b) 178 | v = _true_divide(v, (torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1))) 179 | # avoid nan for torch.acos near \pm 1 180 | # https://github.com/pytorch/pytorch/issues/8069 181 | eps = 1e-7 182 | v = torch.clamp(v, -1+eps, 1-eps) 183 | """ 184 | v = F.cosine_similarity(bb_a[:,np.newaxis,:], bb_b, dim=-1) 185 | v = (_true_divide(2*torch.acos(v), np.pi)).pow(2) 186 | with torch.no_grad(): 187 | alpha = (_true_divide(v, 1-iou+v)) * ((iou>=0.5).type(iou.type())) 188 | 189 | ciou = diou - alpha * v 190 | 191 | if iou_type.lower() == 'ciou': 192 | return ciou 193 | 194 | 195 | def bboxes_giou(bboxes_a, bboxes_b, fmt='voc'): 196 | return bboxes_iou(bboxes_a, bboxes_b, fmt, 'giou') 197 | 198 | 199 | def bboxes_diou(bboxes_a, bboxes_b, fmt='voc'): 200 | return bboxes_iou(bboxes_a, bboxes_b, fmt, 
'diou') 201 | 202 | 203 | def bboxes_ciou(bboxes_a, bboxes_b, fmt='voc'): 204 | return bboxes_iou(bboxes_a, bboxes_b, fmt, 'ciou') 205 | --------------------------------------------------------------------------------
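
A quick way to exercise the IoU helpers defined in tool/utils_iou.py is sketched below. This is illustrative only and not part of the repository: the box coordinates are made-up values, and it assumes the repository root is on PYTHONPATH so the tool package is importable.

import torch
from tool.utils_iou import bboxes_iou, bboxes_giou, bboxes_diou, bboxes_ciou

# Two sets of boxes in VOC format (xmin, ymin, xmax, ymax); values are arbitrary.
boxes_a = torch.tensor([[10., 10., 50., 50.],
                        [20., 20., 60., 80.]])
boxes_b = torch.tensor([[15., 15., 55., 55.]])

# Each call returns an (N, K) matrix of pairwise scores, here 2 x 1.
print(bboxes_iou(boxes_a, boxes_b, fmt='voc'))    # plain IoU
print(bboxes_giou(boxes_a, boxes_b, fmt='voc'))   # generalized IoU
print(bboxes_diou(boxes_a, boxes_b, fmt='voc'))   # distance IoU
print(bboxes_ciou(boxes_a, boxes_b, fmt='voc'))   # complete IoU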