├── ACTIVITY_BOX.md ├── COMMANDS.md ├── LICENSE ├── README.md ├── SPEED.md ├── TRAINING.md ├── application_util ├── __init__.py ├── image_viewer.py ├── preprocessing.py └── visualization.py ├── check_and_vis_global_tracks.py ├── class_ids.py ├── datasets.py ├── deep_sort ├── __init__.py ├── detection.py ├── iou_matching.py ├── kalman_filter.py ├── linear_assignment.py ├── nn_matching.py ├── track.py ├── tracker.py └── utils.py ├── deformable_helper.py ├── diva_io ├── LICENSE ├── README.md ├── __init__.py ├── annotation │ ├── __init__.py │ ├── converter.py │ └── kf1.py ├── docs │ └── speed.md ├── environment.yml ├── utils │ ├── __init__.py │ └── log.py └── video │ ├── __init__.py │ ├── frame.py │ ├── reader.py │ ├── speed_test.sh │ └── test.py ├── efficientdet ├── __init__.py ├── anchors.py ├── backbone │ ├── __init__.py │ ├── backbone_factory.py │ ├── efficientnet_builder.py │ ├── efficientnet_builder_test.py │ ├── efficientnet_lite_builder.py │ ├── efficientnet_lite_builder_test.py │ ├── efficientnet_model.py │ └── efficientnet_model_test.py ├── dataloader.py ├── efficientdet_arch.py ├── hparams_config.py ├── object_detection │ ├── __init__.py │ ├── argmax_matcher.py │ ├── box_coder.py │ ├── box_list.py │ ├── faster_rcnn_box_coder.py │ ├── matcher.py │ ├── preprocessor.py │ ├── region_similarity_calculator.py │ ├── shape_utils.py │ ├── target_assigner.py │ └── tf_example_decoder.py └── utils.py ├── efficientdet_wrapper.py ├── enqueuer.py ├── enqueuer_thread.py ├── eval.py ├── generate_anchors.py ├── generate_util_graph.py ├── get_frames_resize.py ├── images ├── Person_vis_video.gif ├── Vehicle_vis_video.gif ├── actev-prizechallenge-06-2019.png ├── inf_actev_0.49audc_02-2020.png ├── multi-camera-reid.gif ├── person_multi_reid.gif ├── person_multi_reid2.gif ├── util_log_b1partial.png ├── util_log_b8multithread.png └── vehicle_multi_reid.gif ├── main.py ├── models.py ├── multi_video_reid.py ├── nn.py ├── obj_detect_imgs.py ├── obj_detect_imgs_multi.py ├── obj_detect_imgs_multi_queuer.py ├── obj_detect_tracking.py ├── obj_detect_tracking_multi.py ├── obj_detect_tracking_multi_queuer.py ├── obj_detect_tracking_multi_queuer_tmot.py ├── single_video_reid.py ├── tensorrt_optimize.py ├── tensorrt_optimize_tf1.15.py ├── test_reid.py ├── tester.py ├── tmot ├── __init__.py ├── basetrack.py ├── kalman_filter.py ├── matching.py └── multitracker.py ├── torchreid ├── distance.py ├── feature_extractor.py └── models │ ├── __init__.py │ ├── densenet.py │ ├── hacnn.py │ ├── inceptionresnetv2.py │ ├── inceptionv4.py │ ├── mlfn.py │ ├── mobilenetv2.py │ ├── mudeep.py │ ├── nasnet.py │ ├── osnet.py │ ├── osnet_ain.py │ ├── pcb.py │ ├── resnet.py │ ├── resnet_ibn_a.py │ ├── resnet_ibn_b.py │ ├── resnetmid.py │ ├── senet.py │ ├── shufflenet.py │ ├── shufflenetv2.py │ ├── squeezenet.py │ └── xception.py ├── track_to_json.py ├── tracks_to_json.py ├── trainer.py ├── utils.py ├── vis_json.py ├── vis_tracks.py └── viz.py /ACTIVITY_BOX.md: -------------------------------------------------------------------------------- 1 | # Frame-level Activity Detection Experiments 2 | 3 | ## Training 4 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to train on and extract all the frames into the following format: `training_frames/${videoname}/${videoname}_F_%08d.jpg`. 5 | 6 | - Put annotations into a single folder. one npz file for one frame: `training_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. 
You can download my processed annotations and check the data format. Here `actlabels` and `actboxes` are used during training: 7 | ``` 8 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_012019_actgt_allsingle_npz.tar 9 | ``` 10 | 11 | - Prepare the file list for training set and validation set. We split a small subset of the ActEV training set as the validation set and the ActEV validation set will be used for testing. You can download my file lst. Training set: 12 | ``` 13 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minusminival_frames.lst 14 | ``` 15 | Validation set: 16 | ``` 17 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minival_frames.lst 18 | ``` 19 | These file lists are in absolute path. You will need to replace the path with the correct ones. 20 | 21 | - Download MSCOCO pretrained model from Tensorpack: 22 | ``` 23 | wget http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN1x.npz 24 | ``` 25 | 26 | - Train the Actbox v1 model with 1 GPU: 27 | ``` 28 | $python main.py nothing training_frames --mode train --annopath v1-training_012019_actgt_allsingle_npz \ 29 | --trainlst v1-training_minusminival_frames.lst --train_skip 5 --valframepath \ 30 | v1-training_frames --vallst v1-training_minival_frames.lst --valannopath \ 31 | v1-training_012019_actgt_allsingle_npz --outbasepath bupt_actboxexp_resnet101_dilation_classagnostic --modelname mrcnn101 --num_epochs 20 \ 32 | --save_period 2500 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 10 \ 33 | --bupt_exp --max_size 1920 --short_edge_size 1080 --init_lr 0.003 --use_cosine_schedule \ 34 | --warm_up_steps 5000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 \ 35 | --gpu 1 --is_fpn --im_batch_size 1 --flip_image --val_skip 20 --load_from \ 36 | COCO-R101FPN-MaskRCNN-Standard.npz --skip_first_eval --best_first -1 --show_loss_period \ 37 | 1000 --loss_me_step 50 --ignore_vars fastrcnn/outputs --wd 0.0001 --use_dilation \ 38 | --use_frcnn_class_agnostic 39 | ``` 40 | You can change `--gpu 4` and `--im_batch_size 4` (and maybe `--gpuid_start`) if you have a multi-GPU machine. Note that it is a [known bug](https://github.com/tensorflow/tensorflow/issues/23458) in tf 1.13 that you would see all 4 gpu memory allocated even if you set gpu to 2. This is fixed in tf 1.14.0 (but still takes some GPU0's memory). But multi-GPU training with a subset of the GPUs (`--gpuid_start` larger than 0) will fail since tf v1.13 according to [this](https://github.com/tensorflow/tensorflow/issues/27259). 41 | 42 | ## Testing 43 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to test on and extract all the frames into the following format: `validation_frames/${videoname}/${videoname}_F_%08d.jpg`. 44 | 45 | - Put annotations into a single folder. one npz file for one frame: `testing_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format. Here `actlabels` and `actboxes` are used: 46 | ``` 47 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_012019_actgt_allsingle_npz.tar 48 | ``` 49 | 50 | - Prepare the file list for testing. We use the official validation set as testing set. 
You can download my file lst: 51 | ``` 52 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_frames.lst 53 | ``` 54 | Again, you will need to replace them with the correct absolute path. 55 | 56 | - Test the model by generating COCO-format JSON files: 57 | ``` 58 | $ python main.py nothing v1-validate_frames.lst --mode forward --outbasepath \ 59 | actbox_v1_test --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 10 --bupt_exp \ 60 | --max_size 1920 --short_edge_size 1080 --rpn_test_post_nms_topk 1000 --gpu 1 --is_fpn \ 61 | --im_batch_size 1 --load_from bupt_actboxexp_resnet101_dilation_classagnostic/ \ 62 | mrcnn101/01/save-best/ --use_dilation --use_frcnn_class_agnostic --log 63 | ``` 64 | 65 | - Evaluate: 66 | ``` 67 | $ python eval.py v1-validate_frames.lst v1-validate_012019_actgt_allsingle_npz \ 68 | actbox_v1_test --bupt_exp 69 | ``` 70 | 71 | - Visualize: 72 | ``` 73 | $ python vis_json.py v1-validate_videos.lst v1-validate_frames/ actbox_v1_test/ \ 74 | actbox_v1_test_visbox --score_thres 0.7 75 | ``` 76 | 77 | - Tracking 78 | ``` 79 | $ python obj_detect_tracking.py --model_path bupt_actboxexp_resnet101_dilation_classagnostic/mrcnn101/01/save-best/ --version 5 \ 80 | --video_dir v1-validate_videos/ --video_lst_file v1-validate_videos.names.lst --out_dir \ 81 | act_box_out --frame_gap 1 --get_tracking --tracking_dir act_track_out --min_confidence \ 82 | 0.8 --tracking_objs Person-Vehicle,Pull,Riding,Talking,Transport_HeavyCarry,Vehicle-Turning,activity_carrying \ 83 | --bupt_exp --num_class 10 --gpuid_start 0 84 | ``` 85 | -------------------------------------------------------------------------------- /COMMANDS.md: -------------------------------------------------------------------------------- 1 | # Example Commands 2 | 3 | ## 02-2020 0.49 pAUDC, 0.64 processing time 4 | ``` 5 | $ python obj_detect_tracking.py \ 6 | --model_path obj_coco_resnet50_partial_tfv1.14_1280x720_rpn300.pb \ 7 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \ 8 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \ 9 | --is_load_from_pb --get_tracking \ 10 | --tracking_objs Person,Vehicle --min_confidence 0.85 \ 11 | --resnet50 --rpn_test_post_nms_topk 300 --max_size 1280 --short_edge_size 720 \ 12 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \ 13 | --max_cosine_distance 0.5 --nn_budget 5 14 | ``` 15 | This is for processing AVI videos. For MP4 videos, run without `--use_lijun`. 16 | Add `--log_time_and_gpu` to get GPU utilization and time profile. 17 | 18 | 19 | ## 05-2020, added EfficientDet 20 | The [EfficientDet (CVPR 2020)](https://github.com/google/automl/tree/master/efficientdet) (D7) is reported to be more than 12 mAP better than the Resnet-50 FPN model we used on COCO. 21 | 22 | I have made the following changes based on the code from early May: 23 | + Added multi-level ROI align with the final detection boxes since we need the FPN box features for deep-SORT tracking. Basically since one-stage object detection models have box predictions at each feature level, I added a level index variable to keep track of each box's feature level so that in the end they can be efficiently backtracked to the original feature map and crop the features. 24 | + Similar to the MaskRCNN model, I modified the EfficientDet to allow NMS on only some of the COCO classes (currently we only care about person and vehicle) and save computations. 
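
For reference, the level-index bookkeeping from the first point above can be sketched like this. This is only an illustrative NumPy sketch of the idea, not the code in this repo; `roi_align_fn` is a hypothetical placeholder for whatever crop-and-resize op is actually used:
```
import numpy as np

def gather_fpn_box_features(boxes, box_levels, level_to_feature, roi_align_fn):
    """Backtrack each final detection box to the FPN level it came from.

    boxes:            (N, 4) float array of final (post-NMS) boxes.
    box_levels:       (N,) int array, the level index recorded for each box
                      at prediction time.
    level_to_feature: dict mapping level index -> feature map of that level.
    roi_align_fn:     hypothetical ROI-align/crop function (placeholder).
    """
    feats = [None] * len(boxes)
    for level, fmap in level_to_feature.items():
        idxs = np.where(box_levels == level)[0]
        if idxs.size == 0:
            continue
        # one crop call per feature level instead of one per box
        crops = roi_align_fn(fmap, boxes[idxs], level)
        for out_i, box_i in enumerate(idxs):
            feats[box_i] = crops[out_i]
    return feats
```
Grouping boxes by their recorded level keeps a single crop per feature map, which is what makes the backtracking cheap.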
25 | 26 | 27 | Example command \[[d0 model from early May](https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/models/efficientdet-d0.tar.gz)\]: 28 | ``` 29 | $ python obj_detect_tracking.py \ 30 | --model_path efficientdet-d0 \ 31 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \ 32 | --efficientdet_max_detection_topk 5000 \ 33 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \ 34 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \ 35 | --get_tracking --tracking_objs Person,Vehicle --min_confidence 0.6 \ 36 | --max_size 1280 --short_edge_size 720 \ 37 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \ 38 | --max_cosine_distance 0.5 --nn_budget 5 39 | ``` 40 | This is for processing AVI videos. I have tried it with pyav==6.2.0. Install it by 41 | ``` 42 | $ sudo apt-get install -y \ 43 | libavformat-dev libavcodec-dev libavdevice-dev \ 44 | libavutil-dev libswscale-dev libswresample-dev libavfilter-dev 45 | $ sudo pip install av==6.2.0 46 | ``` 47 | 48 | For MP4 videos, run without `--use_lijun`. 49 | Add `--log_time_and_gpu` to get GPU utilization and time profile. 50 | 51 | Example command with a partial frozen graph \[[d0-TFv1.15](https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/models/efficientd0_tfv1.15_1280x720.pb)\] (slightly faster): 52 | ``` 53 | $ python obj_detect_tracking.py \ 54 | --model_path efficientd0_tfv1.15_1280x720.pb --is_load_from_pb \ 55 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \ 56 | --efficientdet_max_detection_topk 5000 \ 57 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \ 58 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \ 59 | --get_tracking --tracking_objs Person,Vehicle --min_confidence 0.6 \ 60 | --max_size 1280 --short_edge_size 720 \ 61 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \ 62 | --max_cosine_distance 0.5 --nn_budget 5 63 | ``` 64 | 65 | [05/04/2020] Tried to optimize the frozen model with TensorRT by: 66 | ``` 67 | $ python tensorrt_optimize_tf1.15.py efficientd0_tfv1.15_1280x720.pb \ 68 | efficientd0_tfv1.15_1280x720_trt_fp16.pb --precision_mode FP16 69 | ``` 70 | But it does not work: 71 | ``` 72 | 2020-05-04 22:11:48.850233: F tensorflow/core/framework/op_kernel.cc:875] Check failed: mutable_output(index) == nullptr (0x7f82d4244ff0 vs. nullptr) 73 | Aborted (core dumped) 74 | ``` 75 | 76 | Run object detection and visualization on images. This could be used to reproduce the official repo's tutorial output: 77 | ``` 78 | $ python obj_detect_imgs.py --model_path efficientdet-d0 \ 79 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \ 80 | --img_lst imgs.lst --out_dir test_d0_json \ 81 | --visualize --vis_path test_d0_vis --vis_thres 0.4 \ 82 | --max_size 1920 --short_edge_size 1080 \ 83 | --efficientdet_max_detection_topk 5000 84 | ``` 85 | 86 | ## 10-2020, comparing EfficientDet with MaskRCNN on video datasets 87 | 88 | 1. VIRAT 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 |
| Models | COCO validation AP (80 classes) | VIRAT Person Val AP | VIRAT Vehicle Val AP | VIRAT Bike Val AP |
|---|---|---|---|---|
| MaskRCNN, R50-FPN | 0.389 | 0.374 | 0.943 | 0.367 |
| MaskRCNN, R101-FPN | 0.407 | 0.378 | 0.947 | 0.399 |
| EfficientDet-d2 | 0.425 | 0.371 | 0.949 | 0.293 |
| EfficientDet-d6 | 0.513 | 0.422 | 0.947 | 0.355 |

2. AVA-Kinetics

| Models | COCO validation AP (80 classes) | AVA-Kinetics Train Person AP | AVA-Kinetics Val Person AP |
|---|---|---|---|
| MaskRCNN, R101-FPN | 0.407 | 0.664 | 0.682 |
| EfficientDet-d2 | 0.425 | 0.650 | 0.680 |
| EfficientDet-d6 | 0.513 | 0.623 | 0.658 |
156 | 157 | VIRAT consists of mostly small person boxes, while AVA-Kineitcs has much bigger ones. So it seems EfficientDet is slightly better on detecting small person. However, EfficientDet-d6 is about 2.4x the inference time of MaskRCNN-R101-FPN. 158 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Junwei Liang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /TRAINING.md: -------------------------------------------------------------------------------- 1 | # CMU Object Detection & Tracking for Surveillance Video Activity Detection 2 | 3 | ## Training 4 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to train on and extract all the frames into the following format: `training_frames/${videoname}/${videoname}_F_%08d.jpg`. 5 | 6 | - Put annotations into a single folder. one npz file for one frame: `training_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format. Only `labels` and `boxes` are used during training: 7 | ``` 8 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_012019_actgt_allsingle_npz.tar 9 | ``` 10 | 11 | - Prepare the file list for training set and validation set. We split a small subset of the ActEV training set as the validation set and the ActEV validation set will be used for testing. You can download my file lst. Training set: 12 | ``` 13 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minusminival_frames.lst 14 | ``` 15 | Validation set: 16 | ``` 17 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minival_frames.lst 18 | ``` 19 | These file lists are in absolute path. You will need to replace the path with the correct ones. 
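For example, a quick way to swap in your own prefix (a small sketch; both prefix strings below are placeholders for whatever the downloaded lists contain and for where your extracted frames actually live):
```
old_prefix = "/original/prefix/in/the/lst/"    # placeholder
new_prefix = "/your/path/to/training_frames/"  # placeholder

for lst in ["v1-training_minusminival_frames.lst", "v1-training_minival_frames.lst"]:
    with open(lst) as f:
        lines = [line.strip().replace(old_prefix, new_prefix, 1) for line in f]
    with open(lst, "w") as f:
        f.write("\n".join(lines) + "\n")
```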
20 | 21 | - Download MSCOCO pretrained model from Tensorpack: 22 | ``` 23 | wget http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN1x.npz 24 | ``` 25 | 26 | - Train the obj_v3 model with 1 GPU: 27 | ``` 28 | $ python main.py nothing training_frames --mode train --annopath v1-training_012019_actgt_allsingle_npz \ 29 | --trainlst v1-training_minusminival_frames.lst --train_skip 30 --valframepath v1-training_frames --vallst \ 30 | v1-training_minival_frames.lst --valannopath v1-training_012019_actgt_allsingle_npz --outbasepath my_model \ 31 | --modelname obj_v3 --num_epochs 15 --save_period 5000 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class \ 32 | 15 --diva_class3 --max_size 1920 --short_edge_size 1080 --init_lr 0.006 --use_cosine_schedule --warm_up_steps \ 33 | 10000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 --gpu 1 --is_fpn --im_batch_size 1 \ 34 | --flip_image --load_from COCO-MaskRCNN-R101FPN1x.npz --skip_first_eval --best_first -1 --show_loss_period 1000 \ 35 | --loss_me_step 50 --ignore_vars fastrcnn/outputs --wd 0.0001 --use_dilation --use_frcnn_class_agnostic 36 | ``` 37 | You can change `--gpu 4` and `--im_batch_size 4` (and maybe `--gpuid_start`) if you have a multi-GPU machine. Note that it is a [known bug](https://github.com/tensorflow/tensorflow/issues/23458) in tf 1.13 that you would see all 4 gpu memory allocated even if you set gpu to 2. This is fixed in tf 1.14.0 (but still takes some GPU0's memory). But multi-GPU training with a subset of the GPUs (`--gpuid_start` larger than 0) will fail since tf v1.13 according to [this](https://github.com/tensorflow/tensorflow/issues/27259). 38 | 39 | - June 2020, finetune MaskRCNN person detection on AVA-Kinetics Dataset: 40 | ``` 41 | $ python main.py nothing pack_ava_kinetics_keyframes --mode train --annopath ava_kinetics_person_box_anno/ --trainlst person_train.lst --valframepath pack_ava_kinetics_keyframes --vallst person_val.lst --valannopath ava_kinetics_person_box_anno/ --outbasepath maskrcnn_finetune --modelname maskrcnn_r101fpn --num_epochs 15 --save_period 5000 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 81 --is_coco_model --one_level_framepath --max_size 560 --short_edge_size 320 --init_lr 0.001 --use_cosine_schedule --warm_up_steps 10000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 --gpu 4 --is_fpn --im_batch_size 4 --flip_image --load_from COCO-MaskRCNN-R101FPN1x.npz --show_loss_period 1000 --loss_me_step 100 --wd 0.0001 --val_skip 10 42 | ``` 43 | 44 | ## Testing 45 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to test on and extract all the frames into the following format: `validation_frames/${videoname}/${videoname}_F_%08d.jpg`. 46 | 47 | - Put annotations into a single folder. one npz file for one frame: `testing_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format. Only `labels` and `boxes` are used during training: 48 | ``` 49 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_012019_actgt_allsingle_npz.tar 50 | ``` 51 | 52 | - Prepare the file list for testing. We use the official validation set as testing set. You can download my file lst: 53 | ``` 54 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_frames.lst 55 | ``` 56 | Again, you will need to replace them with the correct absolute path. 
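Before running the test, you can sanity-check that the frame list and the npz annotations line up (a small sketch, assuming the tar above was extracted to `v1-validate_012019_actgt_allsingle_npz/`):
```
import os
import numpy as np

anno_dir = "v1-validate_012019_actgt_allsingle_npz"

with open("v1-validate_frames.lst") as f:
    frame_paths = [line.strip() for line in f]

# every ${videoname}_F_%08d.jpg frame should have a same-named .npz annotation
name = os.path.splitext(os.path.basename(frame_paths[0]))[0]
anno = np.load(os.path.join(anno_dir, name + ".npz"), allow_pickle=True)
print(anno.files)  # expect at least "labels" and "boxes"
```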
57 | 58 | - Test the model by generating COCO-format JSON files: 59 | ``` 60 | $ python main.py nothing v1-validate_frames.lst --mode forward --outbasepath test_jsons --rpn_batch_size 256 \ 61 | --frcnn_batch_size 512 --num_class 15 --diva_class3 --max_size 1920 --short_edge_size 1080 \ 62 | --rpn_test_post_nms_topk 1000 --gpu 1 --is_fpn --im_batch_size 1 --load_from my_model/obj_v3/01/save-best/ \ 63 | --use_frcnn_class_agnostic --use_dilation 64 | ``` 65 | 66 | - Evaluate: 67 | ``` 68 | $ python eval.py v1-validate_frames.lst v1-validate_012019_actgt_allsingle_npz test_jsons/ 69 | ``` 70 | 71 | -------------------------------------------------------------------------------- /application_util/__init__.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 -------------------------------------------------------------------------------- /application_util/preprocessing.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | def non_max_suppression(boxes, max_bbox_overlap, scores=None): 7 | """Suppress overlapping detections. 8 | 9 | Original code from [1]_ has been adapted to include confidence score. 10 | 11 | .. [1] http://www.pyimagesearch.com/2015/02/16/ 12 | faster-non-maximum-suppression-python/ 13 | 14 | Examples 15 | -------- 16 | 17 | >>> boxes = [d.roi for d in detections] 18 | >>> scores = [d.confidence for d in detections] 19 | >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores) 20 | >>> detections = [detections[i] for i in indices] 21 | 22 | Parameters 23 | ---------- 24 | boxes : ndarray 25 | Array of ROIs (x, y, width, height). 26 | max_bbox_overlap : float 27 | ROIs that overlap more than this values are suppressed. 28 | scores : Optional[array_like] 29 | Detector confidence score. 30 | 31 | Returns 32 | ------- 33 | List[int] 34 | Returns indices of detections that have survived non-maxima suppression. 35 | 36 | """ 37 | if len(boxes) == 0: 38 | return [] 39 | 40 | boxes = boxes.astype(np.float) 41 | pick = [] 42 | 43 | x1 = boxes[:, 0] 44 | y1 = boxes[:, 1] 45 | x2 = boxes[:, 2] + boxes[:, 0] 46 | y2 = boxes[:, 3] + boxes[:, 1] 47 | 48 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 49 | if scores is not None: 50 | idxs = np.argsort(scores) 51 | else: 52 | idxs = np.argsort(y2) 53 | 54 | while len(idxs) > 0: 55 | last = len(idxs) - 1 56 | i = idxs[last] 57 | pick.append(i) 58 | 59 | xx1 = np.maximum(x1[i], x1[idxs[:last]]) 60 | yy1 = np.maximum(y1[i], y1[idxs[:last]]) 61 | xx2 = np.minimum(x2[i], x2[idxs[:last]]) 62 | yy2 = np.minimum(y2[i], y2[idxs[:last]]) 63 | 64 | w = np.maximum(0, xx2 - xx1 + 1) 65 | h = np.maximum(0, yy2 - yy1 + 1) 66 | 67 | overlap = (w * h) / area[idxs[:last]] 68 | 69 | idxs = np.delete( 70 | idxs, np.concatenate( 71 | ([last], np.where(overlap > max_bbox_overlap)[0]))) 72 | 73 | return pick 74 | -------------------------------------------------------------------------------- /application_util/visualization.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import colorsys 4 | from .image_viewer import ImageViewer 5 | 6 | 7 | def create_unique_color_float(tag, hue_step=0.41): 8 | """Create a unique RGB color code for a given track id (tag). 9 | 10 | The color code is generated in HSV color space by moving along the 11 | hue angle and gradually changing the saturation. 
12 | 13 | Parameters 14 | ---------- 15 | tag : int 16 | The unique target identifying tag. 17 | hue_step : float 18 | Difference between two neighboring color codes in HSV space (more 19 | specifically, the distance in hue channel). 20 | 21 | Returns 22 | ------- 23 | (float, float, float) 24 | RGB color code in range [0, 1] 25 | 26 | """ 27 | h, v = (tag * hue_step) % 1, 1. - (int(tag * hue_step) % 4) / 5. 28 | r, g, b = colorsys.hsv_to_rgb(h, 1., v) 29 | return r, g, b 30 | 31 | 32 | def create_unique_color_uchar(tag, hue_step=0.41): 33 | """Create a unique RGB color code for a given track id (tag). 34 | 35 | The color code is generated in HSV color space by moving along the 36 | hue angle and gradually changing the saturation. 37 | 38 | Parameters 39 | ---------- 40 | tag : int 41 | The unique target identifying tag. 42 | hue_step : float 43 | Difference between two neighboring color codes in HSV space (more 44 | specifically, the distance in hue channel). 45 | 46 | Returns 47 | ------- 48 | (int, int, int) 49 | RGB color code in range [0, 255] 50 | 51 | """ 52 | r, g, b = create_unique_color_float(tag, hue_step) 53 | return int(255*r), int(255*g), int(255*b) 54 | 55 | 56 | class NoVisualization(object): 57 | """ 58 | A dummy visualization object that loops through all frames in a given 59 | sequence to update the tracker without performing any visualization. 60 | """ 61 | 62 | def __init__(self, seq_info): 63 | self.frame_idx = seq_info["min_frame_idx"] 64 | self.last_idx = seq_info["max_frame_idx"] 65 | 66 | def set_image(self, image): 67 | pass 68 | 69 | def draw_groundtruth(self, track_ids, boxes): 70 | pass 71 | 72 | def draw_detections(self, detections): 73 | pass 74 | 75 | def draw_trackers(self, trackers): 76 | pass 77 | 78 | def run(self, frame_callback): 79 | while self.frame_idx <= self.last_idx: 80 | frame_callback(self, self.frame_idx) 81 | self.frame_idx += 1 82 | 83 | 84 | class Visualization(object): 85 | """ 86 | This class shows tracking output in an OpenCV image viewer. 
87 | """ 88 | 89 | def __init__(self, seq_info, update_ms): 90 | image_shape = seq_info["image_size"][::-1] 91 | aspect_ratio = float(image_shape[1]) / image_shape[0] 92 | image_shape = 1024, int(aspect_ratio * 1024) 93 | self.viewer = ImageViewer( 94 | update_ms, image_shape, "Figure %s" % seq_info["sequence_name"]) 95 | self.viewer.thickness = 2 96 | self.frame_idx = seq_info["min_frame_idx"] 97 | self.last_idx = seq_info["max_frame_idx"] 98 | 99 | def run(self, frame_callback): 100 | self.viewer.run(lambda: self._update_fun(frame_callback)) 101 | 102 | def _update_fun(self, frame_callback): 103 | if self.frame_idx > self.last_idx: 104 | return False # Terminate 105 | frame_callback(self, self.frame_idx) 106 | self.frame_idx += 1 107 | return True 108 | 109 | def set_image(self, image): 110 | self.viewer.image = image 111 | 112 | def draw_groundtruth(self, track_ids, boxes): 113 | self.viewer.thickness = 2 114 | for track_id, box in zip(track_ids, boxes): 115 | self.viewer.color = create_unique_color_uchar(track_id) 116 | self.viewer.rectangle(*box.astype(np.int), label=str(track_id)) 117 | 118 | def draw_detections(self, detections): 119 | self.viewer.thickness = 2 120 | self.viewer.color = 0, 0, 255 121 | for i, detection in enumerate(detections): 122 | self.viewer.rectangle(*detection.tlwh) 123 | 124 | def draw_trackers(self, tracks): 125 | self.viewer.thickness = 2 126 | for track in tracks: 127 | if not track.is_confirmed() or track.time_since_update > 0: 128 | continue 129 | self.viewer.color = create_unique_color_uchar(track.track_id) 130 | self.viewer.rectangle( 131 | *track.to_tlwh().astype(np.int), label=str(track.track_id)) 132 | return self.viewer.image 133 | # self.viewer.gaussian(track.mean[:2], track.covariance[:2, :2], 134 | # label="%d" % track.track_id) 135 | # 136 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # dataset object definition 3 | import cv2 4 | import os 5 | import logging 6 | import math 7 | import numpy as np 8 | from nn import resizeImage 9 | 10 | # dataset object need to implement the following function 11 | # get_sample(self, idx) 12 | # collect_batch(self, datalist) 13 | class ImageDataset(object): 14 | def __init__(self, cfg, split, imglst, annotations=None): 15 | """ 16 | imglst: a file containing a list of absolute path to all the images 17 | """ 18 | self.cfg = cfg # this should include short_edge_size, max_size, etc. 
19 | self.split = split 20 | self.imglst = imglst 21 | self.annotations = annotations 22 | 23 | # machine-specific config 24 | self.num_gpu = cfg.gpu 25 | self.batch_size = cfg.im_batch_size 26 | self.batch_size_per_gpu = self.batch_size // cfg.gpu 27 | assert self.batch_size % cfg.gpu == 0, "bruh" 28 | 29 | 30 | if self.split == "train": 31 | self.num_epochs = cfg.num_epochs 32 | else: 33 | self.num_epochs = 1 34 | 35 | # load the img file list 36 | self.imgs = [line.strip() for line in open(self.imglst).readlines()] 37 | 38 | self.num_samples = len(self.imgs) # one epoch length 39 | 40 | self.num_batches_per_epoch = int( 41 | math.ceil(self.num_samples / float(self.batch_size))) 42 | self.num_batches = int(self.num_batches_per_epoch * self.num_epochs) 43 | self.valid_idxs = range(self.num_samples) 44 | 45 | logging.info("Loaded %s imgs", len(self.imgs)) 46 | 47 | def get_sample(self, idx): 48 | """ 49 | preprocess one sample from the list 50 | """ 51 | cfg = self.cfg 52 | img_file_path = self.imgs[idx] 53 | 54 | imgname = os.path.splitext(os.path.basename(img_file_path))[0] 55 | 56 | frame = cv2.imread(img_file_path) 57 | im = frame.astype("float32") 58 | 59 | resized_image = resizeImage(im, cfg.short_edge_size, cfg.max_size) 60 | 61 | scale = (resized_image.shape[0] * 1.0 / im.shape[0] + \ 62 | resized_image.shape[1] * 1.0 / im.shape[1]) / 2.0 63 | 64 | return resized_image, scale, imgname, (im.shape[0], im.shape[1]) 65 | 66 | def collect_batch(self, data, idxs=None): 67 | """ 68 | collect the idxs of the data list into a dictionary 69 | """ 70 | if idxs is None: 71 | idxs = range(len(data)) 72 | imgs, scales, imgnames, shapes = zip(*[data[idx] for idx in idxs]) 73 | 74 | return { 75 | "imgs": imgs, 76 | "scales": scales, 77 | "imgnames": imgnames, 78 | "ori_shapes": shapes 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /deep_sort/__init__.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | -------------------------------------------------------------------------------- /deep_sort/detection.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | class Detection(object): 6 | """ 7 | This class represents a bounding box detection in a single image. 8 | 9 | Parameters 10 | ---------- 11 | tlwh : array_like 12 | Bounding box in format `(x, y, w, h)`. 13 | confidence : float 14 | Detector confidence score. 15 | feature : array_like 16 | A feature vector that describes the object contained in this image. 17 | 18 | Attributes 19 | ---------- 20 | tlwh : ndarray 21 | Bounding box in format `(top left x, top left y, width, height)`. 22 | confidence : ndarray 23 | Detector confidence score. 24 | feature : ndarray | NoneType 25 | A feature vector that describes the object contained in this image. 26 | 27 | """ 28 | 29 | def __init__(self, tlwh, confidence, feature): 30 | self.tlwh = np.asarray(tlwh, dtype=np.float) 31 | self.confidence = float(confidence) 32 | self.feature = np.asarray(feature, dtype=np.float32) 33 | 34 | def to_tlbr(self): 35 | """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., 36 | `(top left, bottom right)`. 
37 | """ 38 | ret = self.tlwh.copy() 39 | ret[2:] += ret[:2] 40 | return ret 41 | 42 | def to_xyah(self): 43 | """Convert bounding box to format `(center x, center y, aspect ratio, 44 | height)`, where the aspect ratio is `width / height`. 45 | """ 46 | ret = self.tlwh.copy() 47 | ret[:2] += ret[2:] / 2 48 | ret[2] /= ret[3] 49 | return ret 50 | -------------------------------------------------------------------------------- /deep_sort/iou_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import linear_assignment 5 | 6 | 7 | def iou(bbox, candidates): 8 | """Computer intersection over union. 9 | 10 | Parameters 11 | ---------- 12 | bbox : ndarray 13 | A bounding box in format `(top left x, top left y, width, height)`. 14 | candidates : ndarray 15 | A matrix of candidate bounding boxes (one per row) in the same format 16 | as `bbox`. 17 | 18 | Returns 19 | ------- 20 | ndarray 21 | The intersection over union in [0, 1] between the `bbox` and each 22 | candidate. A higher score means a larger fraction of the `bbox` is 23 | occluded by the candidate. 24 | 25 | """ 26 | bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:] 27 | candidates_tl = candidates[:, :2] 28 | candidates_br = candidates[:, :2] + candidates[:, 2:] 29 | 30 | tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], 31 | np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] 32 | br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], 33 | np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] 34 | wh = np.maximum(0., br - tl) 35 | 36 | area_intersection = wh.prod(axis=1) 37 | area_bbox = bbox[2:].prod() 38 | area_candidates = candidates[:, 2:].prod(axis=1) 39 | return area_intersection / (area_bbox + area_candidates - area_intersection) 40 | 41 | 42 | def iou_cost(tracks, detections, track_indices=None, 43 | detection_indices=None): 44 | """An intersection over union distance metric. 45 | 46 | Parameters 47 | ---------- 48 | tracks : List[deep_sort.track.Track] 49 | A list of tracks. 50 | detections : List[deep_sort.detection.Detection] 51 | A list of detections. 52 | track_indices : Optional[List[int]] 53 | A list of indices to tracks that should be matched. Defaults to 54 | all `tracks`. 55 | detection_indices : Optional[List[int]] 56 | A list of indices to detections that should be matched. Defaults 57 | to all `detections`. 58 | 59 | Returns 60 | ------- 61 | ndarray 62 | Returns a cost matrix of shape 63 | len(track_indices), len(detection_indices) where entry (i, j) is 64 | `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. 65 | 66 | """ 67 | if track_indices is None: 68 | track_indices = np.arange(len(tracks)) 69 | if detection_indices is None: 70 | detection_indices = np.arange(len(detections)) 71 | 72 | cost_matrix = np.zeros((len(track_indices), len(detection_indices))) 73 | for row, track_idx in enumerate(track_indices): 74 | if tracks[track_idx].time_since_update > 1: 75 | cost_matrix[row, :] = linear_assignment.INFTY_COST 76 | continue 77 | 78 | bbox = tracks[track_idx].to_tlwh() 79 | candidates = np.asarray([detections[i].tlwh for i in detection_indices]) 80 | cost_matrix[row, :] = 1. 
- iou(bbox, candidates) 81 | return cost_matrix 82 | -------------------------------------------------------------------------------- /deep_sort/kalman_filter.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import scipy.linalg 4 | 5 | 6 | """ 7 | Table for the 0.95 quantile of the chi-square distribution with N degrees of 8 | freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv 9 | function and used as Mahalanobis gating threshold. 10 | """ 11 | chi2inv95 = { 12 | 1: 3.8415, 13 | 2: 5.9915, 14 | 3: 7.8147, 15 | 4: 9.4877, 16 | 5: 11.070, 17 | 6: 12.592, 18 | 7: 14.067, 19 | 8: 15.507, 20 | 9: 16.919} 21 | 22 | 23 | class KalmanFilter(object): 24 | """ 25 | A simple Kalman filter for tracking bounding boxes in image space. 26 | 27 | The 8-dimensional state space 28 | 29 | x, y, a, h, vx, vy, va, vh 30 | 31 | contains the bounding box center position (x, y), aspect ratio a, height h, 32 | and their respective velocities. 33 | 34 | Object motion follows a constant velocity model. The bounding box location 35 | (x, y, a, h) is taken as direct observation of the state space (linear 36 | observation model). 37 | 38 | """ 39 | 40 | def __init__(self): 41 | ndim, dt = 4, 1. 42 | 43 | # Create Kalman filter model matrices. 44 | self._motion_mat = np.eye(2 * ndim, 2 * ndim) 45 | for i in range(ndim): 46 | self._motion_mat[i, ndim + i] = dt 47 | self._update_mat = np.eye(ndim, 2 * ndim) 48 | 49 | # Motion and observation uncertainty are chosen relative to the current 50 | # state estimate. These weights control the amount of uncertainty in 51 | # the model. This is a bit hacky. 52 | self._std_weight_position = 1. / 20 53 | self._std_weight_velocity = 1. / 160 54 | 55 | def initiate(self, measurement): 56 | """Create track from unassociated measurement. 57 | 58 | Parameters 59 | ---------- 60 | measurement : ndarray 61 | Bounding box coordinates (x, y, a, h) with center position (x, y), 62 | aspect ratio a, and height h. 63 | 64 | Returns 65 | ------- 66 | (ndarray, ndarray) 67 | Returns the mean vector (8 dimensional) and covariance matrix (8x8 68 | dimensional) of the new track. Unobserved velocities are initialized 69 | to 0 mean. 70 | 71 | """ 72 | mean_pos = measurement 73 | mean_vel = np.zeros_like(mean_pos) 74 | mean = np.r_[mean_pos, mean_vel] 75 | 76 | std = [ 77 | 2 * self._std_weight_position * measurement[3], 78 | 2 * self._std_weight_position * measurement[3], 79 | 1e-2, 80 | 2 * self._std_weight_position * measurement[3], 81 | 10 * self._std_weight_velocity * measurement[3], 82 | 10 * self._std_weight_velocity * measurement[3], 83 | 1e-5, 84 | 10 * self._std_weight_velocity * measurement[3]] 85 | 86 | covariance = np.diag(np.square(std)) 87 | return mean, covariance 88 | 89 | def predict(self, mean, covariance): 90 | """Run Kalman filter prediction step. 91 | 92 | Parameters 93 | ---------- 94 | mean : ndarray 95 | The 8 dimensional mean vector of the object state at the previous 96 | time step. 97 | covariance : ndarray 98 | The 8x8 dimensional covariance matrix of the object state at the 99 | previous time step. 100 | 101 | Returns 102 | ------- 103 | (ndarray, ndarray) 104 | Returns the mean vector and covariance matrix of the predicted 105 | state. Unobserved velocities are initialized to 0 mean. 
106 | 107 | """ 108 | 109 | std_pos = [ 110 | self._std_weight_position * mean[3], 111 | self._std_weight_position * mean[3], 112 | 1e-2, 113 | self._std_weight_position * mean[3]] 114 | std_vel = [ 115 | self._std_weight_velocity * mean[3], 116 | self._std_weight_velocity * mean[3], 117 | 1e-5, 118 | self._std_weight_velocity * mean[3]] 119 | motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) 120 | 121 | mean = np.dot(self._motion_mat, mean) 122 | covariance = np.linalg.multi_dot(( 123 | self._motion_mat, covariance, self._motion_mat.T)) + motion_cov 124 | 125 | return mean, covariance 126 | 127 | def project(self, mean, covariance): 128 | """Project state distribution to measurement space. 129 | 130 | Parameters 131 | ---------- 132 | mean : ndarray 133 | The state's mean vector (8 dimensional array). 134 | covariance : ndarray 135 | The state's covariance matrix (8x8 dimensional). 136 | 137 | Returns 138 | ------- 139 | (ndarray, ndarray) 140 | Returns the projected mean and covariance matrix of the given state 141 | estimate. 142 | 143 | """ 144 | std = [ 145 | self._std_weight_position * mean[3], 146 | self._std_weight_position * mean[3], 147 | 1e-1, 148 | self._std_weight_position * mean[3]] 149 | 150 | innovation_cov = np.diag(np.square(std)) 151 | 152 | mean = np.dot(self._update_mat, mean) 153 | covariance = np.linalg.multi_dot(( 154 | self._update_mat, covariance, self._update_mat.T)) 155 | return mean, covariance + innovation_cov 156 | 157 | def update(self, mean, covariance, measurement): 158 | """Run Kalman filter correction step. 159 | 160 | Parameters 161 | ---------- 162 | mean : ndarray 163 | The predicted state's mean vector (8 dimensional). 164 | covariance : ndarray 165 | The state's covariance matrix (8x8 dimensional). 166 | measurement : ndarray 167 | The 4 dimensional measurement vector (x, y, a, h), where (x, y) 168 | is the center position, a the aspect ratio, and h the height of the 169 | bounding box. 170 | 171 | Returns 172 | ------- 173 | (ndarray, ndarray) 174 | Returns the measurement-corrected state distribution. 175 | 176 | """ 177 | projected_mean, projected_cov = self.project(mean, covariance) 178 | 179 | chol_factor, lower = scipy.linalg.cho_factor( 180 | projected_cov, lower=True, check_finite=False) 181 | kalman_gain = scipy.linalg.cho_solve( 182 | (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, 183 | check_finite=False).T 184 | innovation = measurement - projected_mean 185 | 186 | new_mean = mean + np.dot(innovation, kalman_gain.T) 187 | new_covariance = covariance - np.linalg.multi_dot(( 188 | kalman_gain, projected_cov, kalman_gain.T)) 189 | return new_mean, new_covariance 190 | 191 | def gating_distance(self, mean, covariance, measurements, 192 | only_position=False): 193 | """Compute gating distance between state distribution and measurements. 194 | 195 | A suitable distance threshold can be obtained from `chi2inv95`. If 196 | `only_position` is False, the chi-square distribution has 4 degrees of 197 | freedom, otherwise 2. 198 | 199 | Parameters 200 | ---------- 201 | mean : ndarray 202 | Mean vector over the state distribution (8 dimensional). 203 | covariance : ndarray 204 | Covariance of the state distribution (8x8 dimensional). 205 | measurements : ndarray 206 | An Nx4 dimensional matrix of N measurements, each in 207 | format (x, y, a, h) where (x, y) is the bounding box center 208 | position, a the aspect ratio, and h the height. 
209 | only_position : Optional[bool] 210 | If True, distance computation is done with respect to the bounding 211 | box center position only. 212 | 213 | Returns 214 | ------- 215 | ndarray 216 | Returns an array of length N, where the i-th element contains the 217 | squared Mahalanobis distance between (mean, covariance) and 218 | `measurements[i]`. 219 | 220 | """ 221 | mean, covariance = self.project(mean, covariance) 222 | if only_position: 223 | mean, covariance = mean[:2], covariance[:2, :2] 224 | measurements = measurements[:, :2] 225 | 226 | cholesky_factor = np.linalg.cholesky(covariance) 227 | d = measurements - mean 228 | z = scipy.linalg.solve_triangular( 229 | cholesky_factor, d.T, lower=True, check_finite=False, 230 | overwrite_b=True) 231 | squared_maha = np.sum(z * z, axis=0) 232 | return squared_maha 233 | -------------------------------------------------------------------------------- /deep_sort/linear_assignment.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | #from sklearn.utils.linear_assignment_ import linear_assignment 5 | from scipy.optimize import linear_sum_assignment 6 | from . import kalman_filter 7 | 8 | 9 | INFTY_COST = 1e+5 10 | 11 | 12 | def min_cost_matching( 13 | distance_metric, max_distance, tracks, detections, track_indices=None, 14 | detection_indices=None): 15 | """Solve linear assignment problem. 16 | 17 | Parameters 18 | ---------- 19 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray 20 | The distance metric is given a list of tracks and detections as well as 21 | a list of N track indices and M detection indices. The metric should 22 | return the NxM dimensional cost matrix, where element (i, j) is the 23 | association cost between the i-th track in the given track indices and 24 | the j-th detection in the given detection_indices. 25 | max_distance : float 26 | Gating threshold. Associations with cost larger than this value are 27 | disregarded. 28 | tracks : List[track.Track] 29 | A list of predicted tracks at the current time step. 30 | detections : List[detection.Detection] 31 | A list of detections at the current time step. 32 | track_indices : List[int] 33 | List of track indices that maps rows in `cost_matrix` to tracks in 34 | `tracks` (see description above). 35 | detection_indices : List[int] 36 | List of detection indices that maps columns in `cost_matrix` to 37 | detections in `detections` (see description above). 38 | 39 | Returns 40 | ------- 41 | (List[(int, int)], List[int], List[int]) 42 | Returns a tuple with the following three entries: 43 | * A list of matched track and detection indices. 44 | * A list of unmatched track indices. 45 | * A list of unmatched detection indices. 46 | 47 | """ 48 | if track_indices is None: 49 | track_indices = np.arange(len(tracks)) 50 | if detection_indices is None: 51 | detection_indices = np.arange(len(detections)) 52 | 53 | if len(detection_indices) == 0 or len(track_indices) == 0: 54 | return [], track_indices, detection_indices # Nothing to match. 
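    # Build the track-vs-detection cost matrix, clamp entries above the gating
    # threshold, then solve the assignment with scipy's linear_sum_assignment
    # (Hungarian algorithm).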
55 | 56 | cost_matrix = distance_metric( 57 | tracks, detections, track_indices, detection_indices) 58 | cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5 59 | #indices = linear_assignment(cost_matrix) 60 | indices = linear_sum_assignment(cost_matrix) 61 | indices = np.asarray(indices) 62 | indices = np.transpose(indices) 63 | 64 | matches, unmatched_tracks, unmatched_detections = [], [], [] 65 | for col, detection_idx in enumerate(detection_indices): 66 | if col not in indices[:, 1]: 67 | unmatched_detections.append(detection_idx) 68 | for row, track_idx in enumerate(track_indices): 69 | if row not in indices[:, 0]: 70 | unmatched_tracks.append(track_idx) 71 | for row, col in indices: 72 | track_idx = track_indices[row] 73 | detection_idx = detection_indices[col] 74 | if cost_matrix[row, col] > max_distance: 75 | unmatched_tracks.append(track_idx) 76 | unmatched_detections.append(detection_idx) 77 | else: 78 | matches.append((track_idx, detection_idx)) 79 | return matches, unmatched_tracks, unmatched_detections 80 | 81 | 82 | def matching_cascade( 83 | distance_metric, max_distance, cascade_depth, tracks, detections, 84 | track_indices=None, detection_indices=None): 85 | """Run matching cascade. 86 | 87 | Parameters 88 | ---------- 89 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray 90 | The distance metric is given a list of tracks and detections as well as 91 | a list of N track indices and M detection indices. The metric should 92 | return the NxM dimensional cost matrix, where element (i, j) is the 93 | association cost between the i-th track in the given track indices and 94 | the j-th detection in the given detection indices. 95 | max_distance : float 96 | Gating threshold. Associations with cost larger than this value are 97 | disregarded. 98 | cascade_depth: int 99 | The cascade depth, should be se to the maximum track age. 100 | tracks : List[track.Track] 101 | A list of predicted tracks at the current time step. 102 | detections : List[detection.Detection] 103 | A list of detections at the current time step. 104 | track_indices : Optional[List[int]] 105 | List of track indices that maps rows in `cost_matrix` to tracks in 106 | `tracks` (see description above). Defaults to all tracks. 107 | detection_indices : Optional[List[int]] 108 | List of detection indices that maps columns in `cost_matrix` to 109 | detections in `detections` (see description above). Defaults to all 110 | detections. 111 | 112 | Returns 113 | ------- 114 | (List[(int, int)], List[int], List[int]) 115 | Returns a tuple with the following three entries: 116 | * A list of matched track and detection indices. 117 | * A list of unmatched track indices. 118 | * A list of unmatched detection indices. 
119 | 120 | """ 121 | if track_indices is None: 122 | track_indices = list(range(len(tracks))) 123 | if detection_indices is None: 124 | detection_indices = list(range(len(detections))) 125 | 126 | unmatched_detections = detection_indices 127 | matches = [] 128 | for level in range(cascade_depth): 129 | if len(unmatched_detections) == 0: # No detections left 130 | break 131 | 132 | track_indices_l = [ 133 | k for k in track_indices 134 | if tracks[k].time_since_update == 1 + level 135 | ] 136 | if len(track_indices_l) == 0: # Nothing to match at this level 137 | continue 138 | 139 | matches_l, _, unmatched_detections = \ 140 | min_cost_matching( 141 | distance_metric, max_distance, tracks, detections, 142 | track_indices_l, unmatched_detections) 143 | matches += matches_l 144 | unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) 145 | return matches, unmatched_tracks, unmatched_detections 146 | 147 | 148 | def gate_cost_matrix( 149 | kf, cost_matrix, tracks, detections, track_indices, detection_indices, 150 | gated_cost=INFTY_COST, only_position=False): 151 | """Invalidate infeasible entries in cost matrix based on the state 152 | distributions obtained by Kalman filtering. 153 | 154 | Parameters 155 | ---------- 156 | kf : The Kalman filter. 157 | cost_matrix : ndarray 158 | The NxM dimensional cost matrix, where N is the number of track indices 159 | and M is the number of detection indices, such that entry (i, j) is the 160 | association cost between `tracks[track_indices[i]]` and 161 | `detections[detection_indices[j]]`. 162 | tracks : List[track.Track] 163 | A list of predicted tracks at the current time step. 164 | detections : List[detection.Detection] 165 | A list of detections at the current time step. 166 | track_indices : List[int] 167 | List of track indices that maps rows in `cost_matrix` to tracks in 168 | `tracks` (see description above). 169 | detection_indices : List[int] 170 | List of detection indices that maps columns in `cost_matrix` to 171 | detections in `detections` (see description above). 172 | gated_cost : Optional[float] 173 | Entries in the cost matrix corresponding to infeasible associations are 174 | set this value. Defaults to a very large value. 175 | only_position : Optional[bool] 176 | If True, only the x, y position of the state distribution is considered 177 | during gating. Defaults to False. 178 | 179 | Returns 180 | ------- 181 | ndarray 182 | Returns the modified cost matrix. 183 | 184 | """ 185 | gating_dim = 2 if only_position else 4 186 | gating_threshold = kalman_filter.chi2inv95[gating_dim] 187 | measurements = np.asarray( 188 | [detections[i].to_xyah() for i in detection_indices]) 189 | for row, track_idx in enumerate(track_indices): 190 | track = tracks[track_idx] 191 | gating_distance = kf.gating_distance( 192 | track.mean, track.covariance, measurements, only_position) 193 | cost_matrix[row, gating_distance > gating_threshold] = gated_cost 194 | return cost_matrix 195 | -------------------------------------------------------------------------------- /deep_sort/nn_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | def _pdist(a, b): 6 | """Compute pair-wise squared distance between points in `a` and `b`. 7 | 8 | Parameters 9 | ---------- 10 | a : array_like 11 | An NxM matrix of N samples of dimensionality M. 12 | b : array_like 13 | An LxM matrix of L samples of dimensionality M. 
14 | 15 | Returns 16 | ------- 17 | ndarray 18 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 19 | contains the squared distance between `a[i]` and `b[j]`. 20 | 21 | """ 22 | a, b = np.asarray(a), np.asarray(b) 23 | if len(a) == 0 or len(b) == 0: 24 | return np.zeros((len(a), len(b))) 25 | a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) 26 | r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :] 27 | r2 = np.clip(r2, 0., float(np.inf)) 28 | return r2 29 | 30 | 31 | def _cosine_distance(a, b, data_is_normalized=False): 32 | """Compute pair-wise cosine distance between points in `a` and `b`. 33 | 34 | Parameters 35 | ---------- 36 | a : array_like 37 | An NxM matrix of N samples of dimensionality M. 38 | b : array_like 39 | An LxM matrix of L samples of dimensionality M. 40 | data_is_normalized : Optional[bool] 41 | If True, assumes rows in a and b are unit length vectors. 42 | Otherwise, a and b are explicitly normalized to lenght 1. 43 | 44 | Returns 45 | ------- 46 | ndarray 47 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 48 | contains the squared distance between `a[i]` and `b[j]`. 49 | 50 | """ 51 | if not data_is_normalized: 52 | a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True) 53 | b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True) 54 | return 1. - np.dot(a, b.T) 55 | 56 | 57 | def _nn_euclidean_distance(x, y): 58 | """ Helper function for nearest neighbor distance metric (Euclidean). 59 | 60 | Parameters 61 | ---------- 62 | x : ndarray 63 | A matrix of N row-vectors (sample points). 64 | y : ndarray 65 | A matrix of M row-vectors (query points). 66 | 67 | Returns 68 | ------- 69 | ndarray 70 | A vector of length M that contains for each entry in `y` the 71 | smallest Euclidean distance to a sample in `x`. 72 | 73 | """ 74 | distances = _pdist(x, y) 75 | return np.maximum(0.0, distances.min(axis=0)) 76 | 77 | 78 | def _nn_cosine_distance(x, y): 79 | """ Helper function for nearest neighbor distance metric (cosine). 80 | 81 | Parameters 82 | ---------- 83 | x : ndarray 84 | A matrix of N row-vectors (sample points). 85 | y : ndarray 86 | A matrix of M row-vectors (query points). 87 | 88 | Returns 89 | ------- 90 | ndarray 91 | A vector of length M that contains for each entry in `y` the 92 | smallest cosine distance to a sample in `x`. 93 | 94 | """ 95 | distances = _cosine_distance(x, y) 96 | return distances.min(axis=0) 97 | 98 | 99 | class NearestNeighborDistanceMetric(object): 100 | """ 101 | A nearest neighbor distance metric that, for each target, returns 102 | the closest distance to any sample that has been observed so far. 103 | 104 | Parameters 105 | ---------- 106 | metric : str 107 | Either "euclidean" or "cosine". 108 | matching_threshold: float 109 | The matching threshold. Samples with larger distance are considered an 110 | invalid match. 111 | budget : Optional[int] 112 | If not None, fix samples per class to at most this number. Removes 113 | the oldest samples when the budget is reached. 114 | 115 | Attributes 116 | ---------- 117 | samples : Dict[int -> List[ndarray]] 118 | A dictionary that maps from target identities to the list of samples 119 | that have been observed so far. 
120 | 121 | """ 122 | 123 | def __init__(self, metric, matching_threshold, budget=None): 124 | 125 | 126 | if metric == "euclidean": 127 | self._metric = _nn_euclidean_distance 128 | elif metric == "cosine": 129 | self._metric = _nn_cosine_distance 130 | else: 131 | raise ValueError( 132 | "Invalid metric; must be either 'euclidean' or 'cosine'") 133 | self.matching_threshold = matching_threshold 134 | self.budget = budget 135 | self.samples = {} 136 | 137 | def partial_fit(self, features, targets, active_targets): 138 | """Update the distance metric with new data. 139 | 140 | Parameters 141 | ---------- 142 | features : ndarray 143 | An NxM matrix of N features of dimensionality M. 144 | targets : ndarray 145 | An integer array of associated target identities. 146 | active_targets : List[int] 147 | A list of targets that are currently present in the scene. 148 | 149 | """ 150 | for feature, target in zip(features, targets): 151 | self.samples.setdefault(target, []).append(feature) 152 | if self.budget is not None: 153 | self.samples[target] = self.samples[target][-self.budget:] 154 | self.samples = {k: self.samples[k] for k in active_targets} 155 | 156 | def distance(self, features, targets): 157 | """Compute distance between features and targets. 158 | 159 | Parameters 160 | ---------- 161 | features : ndarray 162 | An NxM matrix of N features of dimensionality M. 163 | targets : List[int] 164 | A list of targets to match the given `features` against. 165 | 166 | Returns 167 | ------- 168 | ndarray 169 | Returns a cost matrix of shape len(targets), len(features), where 170 | element (i, j) contains the closest squared distance between 171 | `targets[i]` and `features[j]`. 172 | 173 | """ 174 | cost_matrix = np.zeros((len(targets), len(features))) 175 | for i, target in enumerate(targets): 176 | cost_matrix[i, :] = self._metric(self.samples[target], features) 177 | return cost_matrix 178 | -------------------------------------------------------------------------------- /deep_sort/track.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | 3 | 4 | class TrackState: 5 | """ 6 | Enumeration type for the single target track state. Newly created tracks are 7 | classified as `tentative` until enough evidence has been collected. Then, 8 | the track state is changed to `confirmed`. Tracks that are no longer alive 9 | are classified as `deleted` to mark them for removal from the set of active 10 | tracks. 11 | 12 | """ 13 | 14 | Tentative = 1 15 | Confirmed = 2 16 | Deleted = 3 17 | 18 | 19 | class Track: 20 | """ 21 | A single target track with state space `(x, y, a, h)` and associated 22 | velocities, where `(x, y)` is the center of the bounding box, `a` is the 23 | aspect ratio and `h` is the height. 24 | 25 | Parameters 26 | ---------- 27 | mean : ndarray 28 | Mean vector of the initial state distribution. 29 | covariance : ndarray 30 | Covariance matrix of the initial state distribution. 31 | track_id : int 32 | A unique track identifier. 33 | n_init : int 34 | Number of consecutive detections before the track is confirmed. The 35 | track state is set to `Deleted` if a miss occurs within the first 36 | `n_init` frames. 37 | max_age : int 38 | The maximum number of consecutive misses before the track state is 39 | set to `Deleted`. 40 | feature : Optional[ndarray] 41 | Feature vector of the detection this track originates from. If not None, 42 | this feature is added to the `features` cache. 
43 | 44 | Attributes 45 | ---------- 46 | mean : ndarray 47 | Mean vector of the initial state distribution. 48 | covariance : ndarray 49 | Covariance matrix of the initial state distribution. 50 | track_id : int 51 | A unique track identifier. 52 | hits : int 53 | Total number of measurement updates. 54 | age : int 55 | Total number of frames since first occurance. 56 | time_since_update : int 57 | Total number of frames since last measurement update. 58 | state : TrackState 59 | The current track state. 60 | features : List[ndarray] 61 | A cache of features. On each measurement update, the associated feature 62 | vector is added to this list. 63 | 64 | """ 65 | 66 | def __init__(self, mean, covariance, track_id, n_init, max_age, 67 | feature=None): 68 | self.mean = mean 69 | self.covariance = covariance 70 | self.track_id = track_id 71 | self.hits = 1 72 | self.age = 1 73 | self.time_since_update = 0 74 | 75 | self.state = TrackState.Tentative 76 | self.features = [] 77 | if feature is not None: 78 | self.features.append(feature) 79 | 80 | self._n_init = n_init 81 | self._max_age = max_age 82 | 83 | def to_tlwh(self): 84 | """Get current position in bounding box format `(top left x, top left y, 85 | width, height)`. 86 | 87 | Returns 88 | ------- 89 | ndarray 90 | The bounding box. 91 | 92 | """ 93 | ret = self.mean[:4].copy() 94 | ret[2] *= ret[3] 95 | ret[:2] -= ret[2:] / 2 96 | return ret 97 | 98 | def to_tlbr(self): 99 | """Get current position in bounding box format `(min x, miny, max x, 100 | max y)`. 101 | 102 | Returns 103 | ------- 104 | ndarray 105 | The bounding box. 106 | 107 | """ 108 | ret = self.to_tlwh() 109 | ret[2:] = ret[:2] + ret[2:] 110 | return ret 111 | 112 | def predict(self, kf): 113 | """Propagate the state distribution to the current time step using a 114 | Kalman filter prediction step. 115 | 116 | Parameters 117 | ---------- 118 | kf : kalman_filter.KalmanFilter 119 | The Kalman filter. 120 | 121 | """ 122 | self.mean, self.covariance = kf.predict(self.mean, self.covariance) 123 | self.age += 1 124 | self.time_since_update += 1 125 | 126 | def update(self, kf, detection): 127 | """Perform Kalman filter measurement update step and update the feature 128 | cache. 129 | 130 | Parameters 131 | ---------- 132 | kf : kalman_filter.KalmanFilter 133 | The Kalman filter. 134 | detection : Detection 135 | The associated detection. 136 | 137 | """ 138 | self.mean, self.covariance = kf.update( 139 | self.mean, self.covariance, detection.to_xyah()) 140 | self.features.append(detection.feature) 141 | 142 | self.hits += 1 143 | self.time_since_update = 0 144 | if self.state == TrackState.Tentative and self.hits >= self._n_init: 145 | self.state = TrackState.Confirmed 146 | 147 | def mark_missed(self): 148 | """Mark this track as missed (no association at the current time step). 149 | """ 150 | if self.state == TrackState.Tentative: 151 | self.state = TrackState.Deleted 152 | elif self.time_since_update > self._max_age: 153 | self.state = TrackState.Deleted 154 | 155 | def is_tentative(self): 156 | """Returns True if this track is tentative (unconfirmed). 
157 | """ 158 | return self.state == TrackState.Tentative 159 | 160 | def is_confirmed(self): 161 | """Returns True if this track is confirmed.""" 162 | return self.state == TrackState.Confirmed 163 | 164 | def is_deleted(self): 165 | """Returns True if this track is dead and should be deleted.""" 166 | return self.state == TrackState.Deleted 167 | -------------------------------------------------------------------------------- /deep_sort/tracker.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import kalman_filter 5 | from . import linear_assignment 6 | from . import iou_matching 7 | from .track import Track 8 | 9 | 10 | class Tracker: 11 | """ 12 | This is the multi-target tracker. 13 | 14 | Parameters 15 | ---------- 16 | metric : nn_matching.NearestNeighborDistanceMetric 17 | A distance metric for measurement-to-track association. 18 | max_age : int 19 | Maximum number of missed misses before a track is deleted. 20 | n_init : int 21 | Number of consecutive detections before the track is confirmed. The 22 | track state is set to `Deleted` if a miss occurs within the first 23 | `n_init` frames. 24 | 25 | Attributes 26 | ---------- 27 | metric : nn_matching.NearestNeighborDistanceMetric 28 | The distance metric used for measurement to track association. 29 | max_age : int 30 | Maximum number of missed misses before a track is deleted. 31 | n_init : int 32 | Number of frames that a track remains in initialization phase. 33 | kf : kalman_filter.KalmanFilter 34 | A Kalman filter to filter target trajectories in image space. 35 | tracks : List[Track] 36 | The list of active tracks at the current time step. 37 | 38 | """ 39 | 40 | def __init__(self, metric, max_iou_distance=0.5, max_age=60, n_init=1): 41 | self.metric = metric 42 | self.max_iou_distance = max_iou_distance 43 | self.max_age = max_age 44 | self.n_init = n_init 45 | 46 | self.kf = kalman_filter.KalmanFilter() 47 | self.tracks = [] 48 | self._next_id = 1 49 | 50 | def predict(self): 51 | """Propagate track state distributions one time step forward. 52 | 53 | This function should be called once every time step, before `update`. 54 | """ 55 | for track in self.tracks: 56 | track.predict(self.kf) 57 | def update(self, detections): 58 | """Perform measurement update and track management. 59 | 60 | Parameters 61 | ---------- 62 | detections : List[deep_sort.detection.Detection] 63 | A list of detections at the current time step. 64 | 65 | """ 66 | # Run matching cascade. 67 | matches, unmatched_tracks, unmatched_detections = \ 68 | self._match(detections) 69 | 70 | # Update track set. 71 | for track_idx, detection_idx in matches: 72 | self.tracks[track_idx].update( 73 | self.kf, detections[detection_idx]) 74 | for track_idx in unmatched_tracks: 75 | self.tracks[track_idx].mark_missed() 76 | for detection_idx in unmatched_detections: 77 | self._initiate_track(detections[detection_idx]) 78 | self.tracks = [t for t in self.tracks if not t.is_deleted()] 79 | 80 | # Update distance metric. 
81 | active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] 82 | features, targets = [], [] 83 | for track in self.tracks: 84 | if not track.is_confirmed(): 85 | continue 86 | features += track.features 87 | targets += [track.track_id for _ in track.features] 88 | track.features = [] 89 | self.metric.partial_fit( 90 | np.asarray(features), np.asarray(targets), active_targets) 91 | 92 | def _match(self, detections): 93 | 94 | def gated_metric(tracks, dets, track_indices, detection_indices): 95 | features = np.array([dets[i].feature for i in detection_indices]) 96 | targets = np.array([tracks[i].track_id for i in track_indices]) 97 | 98 | cost_matrix = self.metric.distance(features, targets) 99 | cost_matrix = linear_assignment.gate_cost_matrix( 100 | self.kf, cost_matrix, tracks, dets, track_indices, 101 | detection_indices) 102 | 103 | return cost_matrix 104 | 105 | # Split track set into confirmed and unconfirmed tracks. 106 | confirmed_tracks = [ 107 | i for i, t in enumerate(self.tracks) if t.is_confirmed()] 108 | unconfirmed_tracks = [ 109 | i for i, t in enumerate(self.tracks) if not t.is_confirmed()] 110 | 111 | # Associate confirmed tracks using appearance features. 112 | matches_a, unmatched_tracks_a, unmatched_detections = \ 113 | linear_assignment.matching_cascade( 114 | gated_metric, self.metric.matching_threshold, self.max_age, 115 | self.tracks, detections, confirmed_tracks) 116 | 117 | # Associate remaining tracks together with unconfirmed tracks using IOU. 118 | iou_track_candidates = unconfirmed_tracks + [ 119 | k for k in unmatched_tracks_a if 120 | self.tracks[k].time_since_update == 1] 121 | unmatched_tracks_a = [ 122 | k for k in unmatched_tracks_a if 123 | self.tracks[k].time_since_update != 1] 124 | matches_b, unmatched_tracks_b, unmatched_detections = \ 125 | linear_assignment.min_cost_matching( 126 | iou_matching.iou_cost, self.max_iou_distance, self.tracks, 127 | detections, iou_track_candidates, unmatched_detections) 128 | 129 | matches = matches_a + matches_b 130 | unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) 131 | return matches, unmatched_tracks, unmatched_detections 132 | 133 | def _initiate_track(self, detection): 134 | mean, covariance = self.kf.initiate(detection.to_xyah()) 135 | self.tracks.append(Track( 136 | mean, covariance, self._next_id, self.n_init, self.max_age, 137 | detection.feature)) 138 | self._next_id += 1 139 | -------------------------------------------------------------------------------- /deep_sort/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from deep_sort.detection import Detection 3 | from bisect import bisect 4 | 5 | def create_obj_infos(cur_frame, final_boxes, final_probs, final_labels, 6 | box_feats, targetid2class, tracking_objs, min_confidence, 7 | min_detection_height, scale, is_coco_model=False, 8 | coco_to_actev_mapping=None): 9 | 10 | # tracking_objs is a single item 11 | obj_infos = [] 12 | tracking_boxes = final_boxes / scale 13 | for j, (box, prob, label) in enumerate(zip(tracking_boxes, final_probs, final_labels)): 14 | cat_name = targetid2class[label] 15 | if is_coco_model: 16 | if cat_name not in coco_to_actev_mapping: 17 | continue 18 | else: 19 | cat_name = coco_to_actev_mapping[cat_name] 20 | 21 | confidence_socre = float(round(prob, 7)) 22 | if cat_name not in tracking_objs or confidence_socre < min_confidence: 23 | continue 24 | box[2] -= box[0] 25 | box[3] -= box[1] # x, y, w, h 26 | avg_feat = 
box_feats[j] 27 | if len(avg_feat.shape) > 2: # [C, H, W] 28 | avg_feat = np.mean(box_feats[j], axis=(1, 2)) 29 | 30 | 31 | #norm_feat = avg_feat / np.linalg.norm(avg_feat) # will be normed later 32 | 33 | list_feat = avg_feat.tolist() 34 | # frameIdx, xywh, conf, feature 35 | bbox_data = [cur_frame, box[0], box[1], box[2], box[3], confidence_socre] + list_feat 36 | obj_infos.append(bbox_data) 37 | 38 | detections = [] 39 | for row in obj_infos: 40 | bbox, confidence, feature = row[1:5], row[5], row[6:] 41 | if bbox[3] < min_detection_height: 42 | continue 43 | detections.append(Detection(bbox, confidence, feature)) 44 | return detections 45 | 46 | 47 | # 1 48 | def linear_inter_bbox(tracking_data, frame_gap): 49 | # print tracking_data.shape 50 | if tracking_data.shape[0] == 0: 51 | return tracking_data 52 | obj_indices = tracking_data[:, 1].astype(np.int) 53 | obj_ids = set(obj_indices.tolist()) 54 | tracking_data_list = tracking_data.tolist() 55 | # if len(tracking_data_list) == 0: 56 | # return tracking_data 57 | 58 | # for each track 59 | for obj_index in obj_ids: 60 | mask = obj_indices == obj_index 61 | # all the frames for this track 62 | tracked_frames = tracking_data[mask][:, 0].tolist() 63 | 64 | min_frame_idx = int(min(tracked_frames)) 65 | max_frame_idx = int(max(tracked_frames)) 66 | whole_frames = range(min_frame_idx, max_frame_idx) 67 | missing_frames = list(set(whole_frames).difference(tracked_frames)) 68 | if not missing_frames: 69 | continue 70 | for missing_frame in missing_frames: 71 | insert_index = bisect(tracked_frames, missing_frame) 72 | if insert_index == 0 or insert_index == len(whole_frames): 73 | continue 74 | selected_data = tracking_data[mask] 75 | prev_frame = selected_data[insert_index-1, 0] 76 | next_frame = selected_data[insert_index, 0] 77 | # tolerate some occlusion? 
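# If the neighboring tracked frames are more than 10 frame gaps apart, leave the
# span un-interpolated rather than filling boxes through a long occlusion.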
78 | if next_frame - prev_frame > 10*frame_gap: 79 | continue 80 | prev_data = selected_data[insert_index-1, 2:] 81 | next_data = selected_data[insert_index, 2:] 82 | 83 | ratio = 1.0 * (missing_frame - prev_frame) / (next_frame - prev_frame) 84 | cur_data = prev_data + (next_data - prev_data) * ratio 85 | cur_data = np.around(cur_data, decimals=2) 86 | missing_data = [missing_frame, obj_index] + cur_data.tolist() 87 | tracking_data_list.append(missing_data) 88 | 89 | tracking_data_list = sorted(tracking_data_list, key=lambda x: (x[0], x[1])) 90 | tracking_data = np.asarray(tracking_data_list) 91 | return tracking_data 92 | 93 | 94 | # 3 95 | def filter_short_objs(tracking_data): 96 | # print tracking_data.shape 97 | if tracking_data.shape[0] == 0: 98 | return tracking_data 99 | obj_indices = tracking_data[:, 1].astype(np.int) 100 | obj_ids = set(obj_indices.tolist()) 101 | filter_objs = set() 102 | 103 | for obj_index in obj_ids: 104 | mask = obj_indices == obj_index 105 | num_frames = np.sum(mask) 106 | if num_frames < 2: 107 | filter_objs.add(obj_index) 108 | 109 | tracking_data_list = tracking_data.tolist() 110 | tracking_data_list = [tracklet for tracklet in tracking_data_list if int(tracklet[1]) not in filter_objs] 111 | tracking_data_list = sorted(tracking_data_list, key=lambda x: (x[0], x[1])) 112 | tracking_data = np.asarray(tracking_data_list) 113 | return tracking_data 114 | -------------------------------------------------------------------------------- /deformable_helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # helper function for deformable conv 3 | import tensorflow as tf 4 | 5 | def _to_bc_h_w(x, x_shape): 6 | """(b, h, w, c) -> (b*c, h, w)""" 7 | x = tf.transpose(x, [0, 3, 1, 2]) 8 | x = tf.reshape(x, (-1, x_shape[1], x_shape[2])) 9 | return x 10 | 11 | def _to_b_h_w_n_c(x, x_shape): 12 | """(b*c, h, w, n) -> (b, h, w, n, c)""" 13 | x = tf.reshape(x, (-1, x_shape[4], x_shape[1], x_shape[2], x_shape[3])) 14 | x = tf.transpose(x, [0, 2, 3, 4, 1]) 15 | return x 16 | 17 | def tf_flatten(a): 18 | """Flatten tensor""" 19 | return tf.reshape(a, [-1]) 20 | 21 | def _get_vals_by_coords(inputs, coords, idx, out_shape): 22 | indices = tf.stack( 23 | [idx, tf_flatten(coords[:, :, :, :, 0]), 24 | tf_flatten(coords[:, :, :, :, 1])], axis=-1 25 | ) 26 | vals = tf.gather_nd(inputs, indices) 27 | vals = tf.reshape(vals, out_shape) 28 | return vals 29 | 30 | def _tf_repeat(a, repeats): 31 | """Tensorflow version of np.repeat for 1D""" 32 | # https://github.com/tensorflow/tensorflow/issues/8521 33 | 34 | if len(a.get_shape()) != 1: 35 | raise AssertionError("This is not a 1D Tensor") 36 | 37 | a = tf.expand_dims(a, -1) 38 | a = tf.tile(a, [1, repeats]) 39 | a = tf_flatten(a) 40 | return a 41 | 42 | def _tf_batch_map_coordinates(inputs, coords): 43 | """Batch version of tf_map_coordinates 44 | 45 | Only supports 2D feature maps 46 | 47 | Parameters 48 | ---------- 49 | inputs : ``tf.Tensor`` 50 | shape = (b*c, h, w) 51 | coords : ``tf.Tensor`` 52 | shape = (b*c, h, w, n, 2) 53 | 54 | Returns 55 | ------- 56 | ``tf.Tensor`` 57 | A Tensor with the shape as (b*c, h, w, n) 58 | 59 | """ 60 | input_shape = inputs.get_shape() 61 | coords_shape = coords.get_shape() 62 | batch_channel = tf.shape(inputs)[0] 63 | input_h = tf.shape(inputs)[1] 64 | input_w = tf.shape(inputs)[2] 65 | kernel_n = int(coords_shape[3]) 66 | n_coords = input_h * input_w * kernel_n 67 | 68 | coords_lt = tf.cast(tf.floor(coords), 'int32') 69 | coords_rb = 
tf.cast(tf.ceil(coords), 'int32') 70 | coords_lb = tf.stack([coords_lt[:, :, :, :, 0], coords_rb[:, :, :, :, 1]], axis=-1) 71 | coords_rt = tf.stack([coords_rb[:, :, :, :, 0], coords_lt[:, :, :, :, 1]], axis=-1) 72 | 73 | idx = _tf_repeat(tf.range(batch_channel), n_coords) 74 | 75 | vals_lt = _get_vals_by_coords(inputs, coords_lt, idx, (batch_channel, input_h, input_w, kernel_n)) 76 | vals_rb = _get_vals_by_coords(inputs, coords_rb, idx, (batch_channel, input_h, input_w, kernel_n)) 77 | vals_lb = _get_vals_by_coords(inputs, coords_lb, idx, (batch_channel, input_h, input_w, kernel_n)) 78 | vals_rt = _get_vals_by_coords(inputs, coords_rt, idx, (batch_channel, input_h, input_w, kernel_n)) 79 | 80 | coords_offset_lt = coords - tf.cast(coords_lt, 'float32') 81 | 82 | vals_t = vals_lt + (vals_rt - vals_lt) * coords_offset_lt[:, :, :, :, 0] 83 | vals_b = vals_lb + (vals_rb - vals_lb) * coords_offset_lt[:, :, :, :, 0] 84 | mapped_vals = vals_t + (vals_b - vals_t) * coords_offset_lt[:, :, :, :, 1] 85 | 86 | return mapped_vals 87 | 88 | def _tf_batch_map_offsets(inputs, offsets, grid_offset): 89 | """Batch map offsets into input 90 | 91 | Parameters 92 | ------------ 93 | inputs : ``tf.Tensor`` 94 | shape = (b, h, w, c) 95 | offsets: ``tf.Tensor`` 96 | shape = (b, h, w, 2*n) 97 | grid_offset: `tf.Tensor`` 98 | Offset grids shape = (h, w, n, 2) 99 | 100 | Returns 101 | ------- 102 | ``tf.Tensor`` 103 | A Tensor with the shape as (b, h, w, c) 104 | 105 | """ 106 | input_shape = inputs.get_shape() 107 | batch_size = tf.shape(inputs)[0] 108 | kernel_n = int(int(offsets.get_shape()[3]) / 2) 109 | input_h = tf.shape(inputs)[1] 110 | input_w = tf.shape(inputs)[2] 111 | channel = input_shape[3] 112 | 113 | # inputs (b, h, w, c) --> (b*c, h, w) 114 | inputs = _to_bc_h_w(inputs, tf.shape(inputs)) 115 | 116 | # offsets (b, h, w, 2*n) --> (b, h, w, n, 2) 117 | offsets = tf.reshape(offsets, (batch_size, input_h, input_w, kernel_n, 2)) 118 | # offsets (b, h, w, n, 2) --> (b*c, h, w, n, 2) 119 | # offsets = tf.tile(offsets, [channel, 1, 1, 1, 1]) 120 | 121 | coords = tf.expand_dims(grid_offset, 0) # grid_offset --> (1, h, w, n, 2) 122 | coords = tf.tile(coords, [batch_size, 1, 1, 1, 1]) + offsets # grid_offset --> (b, h, w, n, 2) 123 | 124 | # clip out of bound 125 | coords = tf.stack( 126 | [ 127 | tf.clip_by_value(coords[:, :, :, :, 0], 0.0, tf.cast(input_h - 1, 'float32')), 128 | tf.clip_by_value(coords[:, :, :, :, 1], 0.0, tf.cast(input_w - 1, 'float32')) 129 | ], axis=-1 130 | ) 131 | coords = tf.tile(coords, [channel, 1, 1, 1, 1]) 132 | 133 | mapped_vals = _tf_batch_map_coordinates(inputs, coords) 134 | # (b*c, h, w, n) --> (b, h, w, n, c) 135 | mapped_vals = _to_b_h_w_n_c(mapped_vals, [batch_size, input_h, input_w, kernel_n, channel]) 136 | 137 | return mapped_vals -------------------------------------------------------------------------------- /diva_io/README.md: -------------------------------------------------------------------------------- 1 | # DIVA IO Package 2 | 3 | Version 0.3 4 | 5 | Author: Lijun Yu 6 | 7 | Email: lijun@lj-y.com 8 | 9 | IO interfaces for the [DIVA](https://www.iarpa.gov/index.php/research-programs/diva) project. 10 | 11 | ## Version History 12 | 13 | * 0.3 14 | * Optimized random access and fix missing. 15 | * Robustness improvement. 16 | * Speed test. 17 | * 0.2 (Deprecated) 18 | * Real random access in video loader. 19 | * Add annotation converter. 20 | * Warning control option. 21 | * 0.1 22 | * Initial release of video loader. 
23 | 24 | ## Installation 25 | 26 | ### Integration 27 | 28 | To use as a submodule in your git project, run 29 | 30 | ```sh 31 | git submodule add https://github.com/Lijun-Yu/diva_io.git 32 | ``` 33 | 34 | ### Requirements 35 | 36 | Environment requirements are listed in [environment.yml](environment.yml). 37 | For the `av` package, I recommend you install it via `conda` by 38 | 39 | ```sh 40 | conda install av -c conda-forge 41 | ``` 42 | 43 | as building from `pip` would require a lot of [dependencies](http://docs.mikeboers.com/pyav/7.0.0/overview/installation.html#dependencies). 44 | 45 | ## Video Loader 46 | 47 | A robust video loader that deals with missing frames in the [MEVA dataset](http://mevadata.org). 48 | 49 | This video loader is developed based on [`PyAV`](https://github.com/mikeboers/PyAV) package. 50 | The [`pims`](https://github.com/soft-matter/pims) package was also a good reference despite its compatibility issue with current `PyAV`. 51 | 52 | For the videos in the MEVA, using `cv2.VideoCapture` would result in wrong frame ids as it never counts the missing frames. 53 | If you are using MEVA, I suggest you change to this video loader ASAP. 54 | 55 | ### Replace `cv2.VideoCapture` 56 | 57 | According to my test, this video loader returns the exact same frame as `cv2.VideoCapture` unless missing frame or decoding error occured. 58 | To replace the `cv2.VideoCapture` objects in legacy codes, simply change from 59 | 60 | ```python 61 | import cv2 62 | cap = cv2.VideoCapture(video_path) 63 | ``` 64 | 65 | to 66 | 67 | ```python 68 | from diva_io.video import VideoReader 69 | cap = VideoReader(video_path) 70 | ``` 71 | 72 | `VideoReader.read` follows the schema of `cv2.VideoCapture.read` but automatically inserts the missing frames while reading the video. 73 | 74 | ### Iterator Interface 75 | 76 | ```python 77 | video = VideoReader(video_path) 78 | for frame in video: 79 | # frame is a diva_io.video.frame.Frame object 80 | image = frame.numpy() 81 | # image is an uint8 array in a shape of (height, width, channel[BGR]) 82 | # ... Do something with the image 83 | ``` 84 | 85 | ### Random Access 86 | 87 | Random access of a frame requires decoding from the nearest key frame (approximately every 60 frames for MEVA). 88 | Averagely, this introduces a constant overhead of 0.1 seconds, which is much faster than iterating from the beginning. 89 | 90 | ```python 91 | start_frame_id = 1500 92 | length = 100 93 | video.seek(start_frame_id) 94 | for frame in video.get_iter(length): 95 | image = frame.numpy() 96 | # ... Do something with the image 97 | ``` 98 | 99 | ### Video Properties 100 | 101 | ```python 102 | video.width # cap.get(cv2.CAP_PROP_FRAME_WIDTH) 103 | video.height # cap.get(cv2.CAP_PROP_FRAME_HEIGHT) 104 | video.fps # cap.get(cv2.CAP_PROP_FPS) 105 | video.length # cap.get(cv2.CAP_PROP_FRAME_COUNT) 106 | ``` 107 | 108 | ### Other Interfaces 109 | 110 | For other usages, please see the comments in [video/reader.py](video/reader.py). 111 | 112 | ### Speed 113 | 114 | See [speed.md](docs/speed.md). 115 | 116 | ## Annotation 117 | 118 | An annotation loader and converter for Kitware YML format in [meva-data-repo](https://gitlab.kitware.com/meva/meva-data-repo). 119 | 120 | Clone the meva-data-repo and set 121 | 122 | ```python 123 | annotation_dir = 'path/to/meva-data-repo/annotation/DIVA-phase-2/MEVA/meva-annotations' 124 | ``` 125 | 126 | ### Convert Annotation 127 | 128 | This is to convert the annotation from Kitware YML format to ActEV Scorer JSON format. 
129 | Run the following command in shell outside the repo's director, 130 | 131 | ```sh 132 | python -m diva_io.annotation.converter 133 | ``` 134 | 135 | ### Read Annotation 136 | 137 | ```python 138 | from diva_io.annotation import KitwareAnnotation 139 | video_name = '2018-03-11.11-15-04.11-20-04.school.G300' 140 | annotation = KitwareAnnotation(video_name, annotation_dir) 141 | # deal with annotation.raw_data 142 | ``` 143 | -------------------------------------------------------------------------------- /diva_io/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Lijun Yu' 2 | -------------------------------------------------------------------------------- /diva_io/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | from .kf1 import KitwareAnnotation 2 | -------------------------------------------------------------------------------- /diva_io/annotation/converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import os.path as osp 5 | from progressbar import progressbar 6 | from concurrent.futures import ProcessPoolExecutor 7 | from ..utils import get_logger 8 | from .kf1 import KitwareAnnotation 9 | 10 | 11 | def _get_video_list(annotation_dir): 12 | path = osp.join(annotation_dir, 'list-of-annotated-meva-clips.txt') 13 | with open(path) as f: 14 | video_list = [l.strip() for l in f][2:] 15 | return video_list 16 | 17 | 18 | def _worker(job): 19 | video_name, annotation_dir = job 20 | annotation = KitwareAnnotation(video_name, annotation_dir) 21 | return annotation.get_activities_official() 22 | 23 | 24 | def _get_official_format(video_list, annotation_dir): 25 | jobs = [(video_name, annotation_dir) for video_name in video_list] 26 | pool = ProcessPoolExecutor() 27 | activities = [] 28 | for result in progressbar(pool.map(_worker, jobs)): 29 | activities.extend(result) 30 | reference = {'filesProcessed': video_list, 'activities': activities} 31 | file_index = {video_name: {'framerate': 30.0, 'selected': {0: 1, 9000: 0}} 32 | for video_name in video_list} 33 | return reference, file_index 34 | 35 | 36 | def _write_files(data_dict, output_dir): 37 | os.makedirs(output_dir, exist_ok=True) 38 | logger = get_logger(__name__) 39 | for filename, data in data_dict.items(): 40 | path = osp.join(output_dir, filename + '.json') 41 | if osp.exists(path): 42 | logger.warning('Overwriting file %s', path) 43 | with open(path, 'w') as f: 44 | json.dump(data, f) 45 | 46 | 47 | def convert_annotation(annotation_dir, output_dir): 48 | video_list = _get_video_list(annotation_dir) 49 | reference, file_index = _get_official_format(video_list, annotation_dir) 50 | data_dict = {'reference': reference, 'file-index': file_index} 51 | _write_files(data_dict, output_dir) 52 | 53 | 54 | def main(): 55 | parser = argparse.ArgumentParser( 56 | 'Annotation Converter for KF1, from Kitware YML format to ' 57 | 'ActEV Scorer JSON format.') 58 | parser.add_argument('annotation_dir') 59 | parser.add_argument('output_dir') 60 | args = parser.parse_args() 61 | convert_annotation(args.annotation_dir, args.output_dir) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /diva_io/annotation/kf1.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os.path as osp 3 | from 
collections import defaultdict 4 | 5 | 6 | FIELDS = ['activities', 'geom', 'types'] 7 | 8 | 9 | class KitwareAnnotation(object): 10 | 11 | def __init__(self, video_name: str, annotation_dir: str): 12 | # Please explore the structure of raw_data yourself 13 | self.video_name = video_name 14 | self.raw_data = self._load_raw_data(video_name, annotation_dir) 15 | 16 | def _split_meta(self, contents, key): 17 | meta = [] 18 | i = 0 19 | while i < len(contents) and 'meta' in contents[i]: 20 | assert key not in contents[i] 21 | meta.append(contents[i]['meta']) 22 | i += 1 23 | data = [content[key] for content in contents[i:]] 24 | return meta, data 25 | 26 | def _load_file(self, video_name, annotation_dir, field): 27 | date, time_1, time_2 = video_name.split('.')[:3] 28 | for time in [time_1, time_2]: 29 | path = osp.join(annotation_dir, date, time[:2], '%s.%s.yml' % ( 30 | video_name, field)) 31 | if not osp.exists(path): 32 | continue 33 | with open(path) as f: 34 | contents = yaml.load(f, Loader=yaml.FullLoader) 35 | return contents 36 | path = osp.join(annotation_dir, date, time_1[:2], '%s.%s.yml' % ( 37 | video_name, field)) 38 | raise FileNotFoundError(path) 39 | 40 | def _load_raw_data(self, video_name, annotation_dir): 41 | raw_data = {'meta': {}} 42 | for field in FIELDS: 43 | contents = self._load_file(video_name, annotation_dir, field) 44 | key = field if field != 'activities' else 'act' 45 | raw_data['meta'][field], raw_data[field] = self._split_meta( 46 | contents, key) 47 | objs = defaultdict(dict) 48 | for obj in raw_data['geom']: 49 | obj['g0'] = [int(x) for x in obj['g0'].split()] 50 | objs[obj['id1']][obj['ts0']] = obj 51 | for obj in raw_data['types']: 52 | objs[obj['id1']]['type'] = [*obj['cset3'].keys()][0] 53 | for act in raw_data['activities']: 54 | for actor in act.get('actors', []): 55 | obj = objs[actor['id1']] 56 | geoms = [] 57 | for ts in actor['timespan']: 58 | start, end = ts['tsr0'] 59 | for time in range(start, end + 1): 60 | geoms.append(obj[time]) 61 | actor['geoms'] = geoms 62 | actor['type'] = obj['type'] 63 | return raw_data 64 | 65 | def get_activities_official(self): 66 | activities = [] 67 | for act in self.raw_data['activities']: 68 | act_id = act['id2'] 69 | act_type = [*act['act2'].keys()][0] 70 | if act_type.startswith('empty'): 71 | continue 72 | start, end = act['timespan'][0]['tsr0'] 73 | objects = [] 74 | for actor in act['actors']: 75 | actor_id = actor['id1'] 76 | bbox_history = {} 77 | for geom in actor['geoms']: 78 | frame_id = geom['ts0'] 79 | x1, y1, x2, y2 = geom['g0'] 80 | bbox_history[frame_id] = { 81 | 'presenceConf': 1, 82 | 'boundingBox': { 83 | 'x': min(x1, x2), 'y': min(y1, y2), 84 | 'w': abs(x2 - x1), 'h': abs(y2 - y1)}} 85 | for frame_id in range(start, end + 1): 86 | if frame_id not in bbox_history: 87 | bbox_history[frame_id] = {} 88 | obj = {'objectType': 'Vehicle', 'objectID': actor_id, 89 | 'localization': {self.video_name: bbox_history}} 90 | objects.append(obj) 91 | activity = { 92 | 'activity': act_type, 'activityID': act_id, 93 | 'presenceConf': 1, 'alertFrame': start, 94 | 'localization': {self.video_name: {start: 1, end + 1: 0}}, 95 | 'objects': objects} 96 | activities.append(activity) 97 | return activities 98 | -------------------------------------------------------------------------------- /diva_io/docs/speed.md: -------------------------------------------------------------------------------- 1 | # Speed of diva_io.video.VideoReader 2 | 3 | Test performed by [video/speed_test.sh](../video/speed_test.sh). 
4 | 5 | ```sh 6 | ./video/speed_test.sh 7 | ``` 8 | 9 | ## Overall Performance 10 | 11 | Loading all frames of 7 videos from the [MEVA dataset](http://mevadata.org). Each video is 5-min long and 1080p at 30 fps. 12 | 13 | | | `diva_io.video. VideoReader (fix_missing=True)` | `diva_io.video. VideoReader (fix_missing=False)` | `pymovie.editor .VideoFileClip` | `cv2.VideoCapture` | 14 | |:---------------:|:----------------------------------------------:|:-----------------------------------------------:|:-------------------------------:|:------------------:| 15 | | User Time | 338.12s | 329.00s | 904.09s | 844.35s | 16 | | System Time | 0.80s | 0.60s | 317.14s | 6.44s | 17 | | CPU Utilization | 99% | 99% | 293% | 264% | 18 | | Total Time | 338.98s | 329.60s | 416.31s | 321.06s | 19 | 20 | ## Detailed Results 21 | 22 | | Video Name | Video Description | `diva_io.video .VideoReader (fix_missing=True)` | `diva_io.video .VideoReader (fix_missing=False)` | `pymovie.editor .VideoFileClip` | `cv2.VideoCapture` | 23 | |:----------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------:|:------------------------------------------------:|:-------------------------------:|:------------------:| 24 | | 2018-03-11.16-30-08.16-35-08.hospital.G436.avi | No missing | 0:45 | 0:44 | 1:00 | 0:26 | 25 | | 2018-03-07.16-55-06.17-00-06.school.G336.avi | Missing 104-109, 2294 | 0:55 | 0:53 | 0:59 | 0:26 | 26 | | 2018-03-11.11-25-01.11-30-01.school.G424.avi | Missing 7391-7499 | 0:38 | 0:37 | 0:58 | 0:26 | 27 | | 2018-03-11.16-25-00.16-30-00.school.G639.avi | Bidirectional frames, missing 1, 4 | 0:55 | 0:53 | 0:59 | 0:27 | 28 | | 2018-03-11.11-35-00.11-40-00.school.G299.avi | Packet id and frame id unsychronized, missing 5789-5797 | 0:50 | 0:49 | 0:58 | 1:42 | 29 | | 2018-03-11.11-35-00.11-40-00.school.G330.avi | Packet id and frame id unsychronized, missing 5755-5761 | 0:50 | 0:49 | 0:58 | 0:39 | 30 | | 2018-03-12.10-05-00.10-10-00.hospital.G436.avi | First packet fail | 0:41 | 0:41 | 0:59 | 1:11 | 31 | -------------------------------------------------------------------------------- /diva_io/environment.yml: -------------------------------------------------------------------------------- 1 | name: diva_io 2 | channels: 3 | - pkgs/main 4 | - conda-forge 5 | dependencies: 6 | - python 7 | - numpy 8 | - av 9 | -------------------------------------------------------------------------------- /diva_io/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import get_logger 2 | -------------------------------------------------------------------------------- /diva_io/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name, level=logging.INFO, log_file=None): 5 | logger = logging.getLogger(name) 6 | logger.setLevel(level) 7 | formatter = logging.Formatter( 8 | '%(asctime)s %(name)s %(levelname)s %(message)s') 9 | handlers = [logging.StreamHandler()] 10 | if log_file is not None: 11 | handlers.append(logging.FileHandler(log_file)) 12 | for handler in handlers: 13 | handler.setLevel(level) 14 | handler.setFormatter(formatter) 15 | logger.addHandler(handler) 16 | return logger 17 | -------------------------------------------------------------------------------- /diva_io/video/__init__.py: -------------------------------------------------------------------------------- 1 | from .reader import 
VideoReader 2 | -------------------------------------------------------------------------------- /diva_io/video/frame.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from av import VideoFrame 3 | 4 | 5 | class Frame(object): 6 | 7 | def __init__(self, frame, fix_missing_offset= 0): 8 | """Frame wrapper of av.VideoFrame. 9 | 10 | Parameters 11 | ---------- 12 | frame : av.VideoFrame 13 | VideoFrame object to be wrapped. 14 | fix_missing_offset : int, optional 15 | Frame id offset to fix a missing frame, by default 0 16 | """ 17 | assert isinstance(frame, VideoFrame) 18 | self.frame = frame 19 | self.fix_missing_offset = fix_missing_offset 20 | 21 | @property 22 | def frame_id(self): 23 | """Frame id for external use, including fixing for a missing frame. 24 | """ 25 | return self.frame_index_display + self.fix_missing_offset 26 | 27 | def image(self): 28 | """Get PIL Image for visualization in jupyter. 29 | 30 | Returns 31 | ------- 32 | PIL.Image 33 | Image for visualization in jupyter. 34 | """ 35 | return self.frame.to_image() 36 | 37 | def numpy(self, format='bgr24', width= None, 38 | height= None): 39 | """Get numpy array of the frame in the specified format. 40 | 41 | Parameters 42 | ---------- 43 | format : str, optional 44 | Format parameter of av.VideoFrame.reformat(), by default 'bgr24'. 45 | width : int, optional 46 | Desired width of the frame, by default None 47 | height : int, optional 48 | Desired height of the frame, by default None 49 | 50 | Returns 51 | ------- 52 | np.ndarray 53 | Numpy array of the frame. 54 | """ 55 | return self.frame.to_ndarray(width=width, height=height, format=format) 56 | 57 | @property 58 | def frame_index_display(self): 59 | """The correct frame index for display, 0 based. 60 | 61 | Returns 62 | ------- 63 | int 64 | Frame index for display. 65 | """ 66 | return self.frame.pts - 1 67 | 68 | @property 69 | def frame_index_store(self): 70 | """The frame index as stored in the video, 0 based. 71 | If you used cv2.VideoCapture.read() to read a video sequentially, this 72 | is the index you would get. 73 | 74 | Returns 75 | ------- 76 | int 77 | Frame index as stored. 78 | """ 79 | return self.frame.index 80 | 81 | def __repr__(self): 82 | return '<%s contains %s>' % ( 83 | repr(self.__class__)[8:-2], repr(self.frame)) 84 | 85 | def __getattr__(self, name): 86 | return getattr(self.frame, name) 87 | -------------------------------------------------------------------------------- /diva_io/video/speed_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/zsh 2 | 3 | video_dir=$1 4 | cd $(pwd)/$(dirname $0)/../.. 
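# (the cd above moves to the directory that contains the diva_io package, so the
# `python -c "from diva_io.video.test import ..."` one-liners below can resolve their imports)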
5 | 6 | echo "diva_io.video.VideoReader(fix_missing=True)" 7 | time python -c "from diva_io.video.test import speed_test_divaio; speed_test_divaio(\"$video_dir\", True)" 8 | 9 | echo "diva_io.video.VideoReader(fix_missing=False)" 10 | time python -c "from diva_io.video.test import speed_test_divaio; speed_test_divaio(\"$video_dir\", False)" 11 | 12 | echo "moviepy.editor.VideoFileClip" 13 | time python -c "from diva_io.video.test import speed_test_moviepy; speed_test_moviepy(\"$video_dir\")" 14 | 15 | echo "cv2.VideoCapture" 16 | time python -c "from diva_io.video.test import speed_test_opencv; speed_test_opencv(\"$video_dir\")" 17 | -------------------------------------------------------------------------------- /diva_io/video/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import os.path as osp 4 | from progressbar import ProgressBar 5 | from .reader import VideoReader 6 | 7 | VIDEO_LIST = [ 8 | '2018-03-11.16-30-08.16-35-08.hospital.G436.avi', # no missing 9 | '2018-03-07.16-55-06.17-00-06.school.G336.avi', # have missing 10 | '2018-03-11.11-25-01.11-30-01.school.G424.avi', 11 | '2018-03-11.16-25-00.16-30-00.school.G639.avi', # bidirectional 12 | '2018-03-11.11-35-00.11-40-00.school.G299.avi', # frame id misorder 13 | '2018-03-11.11-35-00.11-40-00.school.G330.avi', 14 | '2018-03-12.10-05-00.10-10-00.hospital.G436.avi' # first frame fail 15 | ] 16 | 17 | 18 | def integrity_test(video_list, video_dir, random_access_point=(5790, 100)): 19 | print('No fix missing') 20 | for video_name in video_list: 21 | print('\t', video_name, flush=True) 22 | bar = ProgressBar().start() 23 | v = VideoReader(video_name, video_dir, fix_missing=False) 24 | for i, f in bar(enumerate(v)): 25 | pass 26 | 27 | print('Fix missing with random access') 28 | start_frame_id, length = random_access_point 29 | for video_name in video_list: 30 | print('\t', video_name, flush=True) 31 | bar = ProgressBar().start() 32 | v = VideoReader(video_name, video_dir) 33 | for i, f in bar(enumerate(v)): 34 | assert f.frame_id == i 35 | bar = ProgressBar().start() 36 | v = VideoReader(video_name, video_dir) 37 | v.seek(start_frame_id) 38 | for i, frame in bar(enumerate(v.get_iter(length))): 39 | assert frame.frame_id == start_frame_id + i 40 | 41 | 42 | def speed_test_opencv(video_dir, video_list=VIDEO_LIST): 43 | import cv2 44 | for video_name in video_list: 45 | print('\t', video_name, flush=True) 46 | bar = ProgressBar().start() 47 | cap = cv2.VideoCapture(osp.join(video_dir, video_name)) 48 | for _ in bar(range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))): 49 | cap.read() 50 | 51 | 52 | def speed_test_moviepy(video_dir, video_list=VIDEO_LIST): 53 | from moviepy.editor import VideoFileClip 54 | for video_name in video_list: 55 | print('\t', video_name, flush=True) 56 | bar = ProgressBar().start() 57 | clip = VideoFileClip(osp.join(video_dir, video_name)) 58 | for i in bar(range(int(clip.duration * clip.fps))): 59 | clip.get_frame(i / clip.fps) 60 | 61 | 62 | def speed_test_divaio(video_dir, fix_missing, video_list=VIDEO_LIST): 63 | for video_name in video_list: 64 | print('\t', video_name, flush=True) 65 | bar = ProgressBar().start() 66 | video = VideoReader(video_name, video_dir, fix_missing=fix_missing) 67 | for _ in bar(range(video.length)): 68 | video.read() 69 | 70 | 71 | if __name__ == "__main__": 72 | video_dir = sys.argv[1] 73 | integrity_test(VIDEO_LIST, video_dir) 74 | -------------------------------------------------------------------------------- 
/efficientdet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/efficientdet/__init__.py -------------------------------------------------------------------------------- /efficientdet/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /efficientdet/backbone/backbone_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Backbone network factory.""" 16 | 17 | from . import efficientnet_builder 18 | from . import efficientnet_lite_builder 19 | 20 | 21 | def get_model_builder(model_name): 22 | """Get the model_builder module for a given model name.""" 23 | if model_name.startswith('efficientnet-lite'): 24 | return efficientnet_lite_builder 25 | elif model_name.startswith('efficientnet-b'): 26 | return efficientnet_builder 27 | else: 28 | raise ValueError('Unknown model name {}'.format(model_name)) 29 | -------------------------------------------------------------------------------- /efficientdet/backbone/efficientnet_builder_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for efficientnet_builder.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import tensorflow.compat.v1 as tf 23 | 24 | from backbone import efficientnet_builder 25 | 26 | 27 | class EfficientnetBuilderTest(tf.test.TestCase): 28 | 29 | def _test_model_params(self, 30 | model_name, 31 | input_size, 32 | expected_params, 33 | override_params=None, 34 | features_only=False, 35 | pooled_features_only=False): 36 | images = tf.zeros((1, input_size, input_size, 3), dtype=tf.float32) 37 | efficientnet_builder.build_model( 38 | images, 39 | model_name=model_name, 40 | override_params=override_params, 41 | training=True, 42 | features_only=features_only, 43 | pooled_features_only=pooled_features_only) 44 | num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) 45 | self.assertEqual(num_params, expected_params) 46 | 47 | def test_efficientnet_b0(self): 48 | self._test_model_params('efficientnet-b0', 224, expected_params=5288548) 49 | 50 | def test_efficientnet_b1(self): 51 | self._test_model_params('efficientnet-b1', 240, expected_params=7794184) 52 | 53 | def test_efficientnet_b2(self): 54 | self._test_model_params('efficientnet-b2', 260, expected_params=9109994) 55 | 56 | def test_efficientnet_b3(self): 57 | self._test_model_params('efficientnet-b3', 300, expected_params=12233232) 58 | 59 | def test_efficientnet_b4(self): 60 | self._test_model_params('efficientnet-b4', 380, expected_params=19341616) 61 | 62 | def test_efficientnet_b5(self): 63 | self._test_model_params('efficientnet-b5', 456, expected_params=30389784) 64 | 65 | def test_efficientnet_b6(self): 66 | self._test_model_params('efficientnet-b6', 528, expected_params=43040704) 67 | 68 | def test_efficientnet_b7(self): 69 | self._test_model_params('efficientnet-b7', 600, expected_params=66347960) 70 | 71 | def test_efficientnet_b0_with_customized_num_classes(self): 72 | self._test_model_params( 73 | 'efficientnet-b0', 74 | 224, 75 | expected_params=4135648, 76 | override_params={'num_classes': 100}) 77 | 78 | def test_efficientnet_b0_with_features_only(self): 79 | self._test_model_params( 80 | 'efficientnet-b0', 224, features_only=True, expected_params=3595388) 81 | 82 | def test_efficientnet_b0_with_pooled_features_only(self): 83 | self._test_model_params( 84 | 'efficientnet-b0', 85 | 224, 86 | pooled_features_only=True, 87 | expected_params=4007548) 88 | 89 | def test_efficientnet_b0_fails_if_both_features_requested(self): 90 | with self.assertRaises(AssertionError): 91 | efficientnet_builder.build_model( 92 | None, 93 | model_name='efficientnet-b0', 94 | training=True, 95 | features_only=True, 96 | pooled_features_only=True) 97 | 98 | def test_efficientnet_b0_base(self): 99 | # Creates a base model using the model configuration. 100 | images = tf.zeros((1, 224, 224, 3), dtype=tf.float32) 101 | _, endpoints = efficientnet_builder.build_model_base( 102 | images, model_name='efficientnet-b0', training=True) 103 | 104 | # reduction_1 to reduction_5 should be in endpoints 105 | self.assertIn('reduction_1', endpoints) 106 | self.assertIn('reduction_5', endpoints) 107 | # reduction_5 should be the last one: no reduction_6. 108 | self.assertNotIn('reduction_6', endpoints) 109 | 110 | 111 | if __name__ == '__main__': 112 | # Disable eager to allow tf.profile works for #params/#flops. 
113 | tf.disable_eager_execution() 114 | tf.test.main() 115 | -------------------------------------------------------------------------------- /efficientdet/backbone/efficientnet_lite_builder_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for efficientnet_lite_builder.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import tensorflow.compat.v1 as tf 23 | 24 | from backbone import efficientnet_lite_builder 25 | 26 | 27 | class EfficientnetBuilderTest(tf.test.TestCase): 28 | 29 | def _test_model_params(self, 30 | model_name, 31 | input_size, 32 | expected_params, 33 | override_params=None, 34 | features_only=False, 35 | pooled_features_only=False): 36 | images = tf.zeros((1, input_size, input_size, 3), dtype=tf.float32) 37 | efficientnet_lite_builder.build_model( 38 | images, 39 | model_name=model_name, 40 | override_params=override_params, 41 | training=True, 42 | features_only=features_only, 43 | pooled_features_only=pooled_features_only) 44 | num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) 45 | 46 | self.assertEqual(num_params, expected_params) 47 | 48 | def test_efficientnet_b0(self): 49 | self._test_model_params( 50 | 'efficientnet-lite0', 224, expected_params=4652008) 51 | 52 | def test_efficientnet_b1(self): 53 | self._test_model_params( 54 | 'efficientnet-lite1', 240, expected_params=5416680) 55 | 56 | def test_efficientnet_b2(self): 57 | self._test_model_params( 58 | 'efficientnet-lite2', 260, expected_params=6092072) 59 | 60 | def test_efficientnet_b3(self): 61 | self._test_model_params( 62 | 'efficientnet-lite3', 280, expected_params=8197096) 63 | 64 | def test_efficientnet_b4(self): 65 | self._test_model_params( 66 | 'efficientnet-lite4', 300, expected_params=13006568) 67 | 68 | 69 | if __name__ == '__main__': 70 | # Disable eager to allow tf.profile works for #params/#flops. 71 | tf.disable_eager_execution() 72 | tf.test.main() 73 | -------------------------------------------------------------------------------- /efficientdet/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # Object detection data loaders and libraries are mostly based on RetinaNet: 16 | # https://github.com/tensorflow/tpu/tree/master/models/official/retinanet 17 | -------------------------------------------------------------------------------- /efficientdet/object_detection/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Base box coder. 16 | 17 | Box coders convert between coordinate frames, namely image-centric 18 | (with (0,0) on the top left of image) and anchor-centric (with (0,0) being 19 | defined by a specific anchor). 20 | 21 | Users of a BoxCoder can call two methods: 22 | encode: which encodes a box with respect to a given anchor 23 | (or rather, a tensor of boxes wrt a corresponding tensor of anchors) and 24 | decode: which inverts this encoding with a decode operation. 25 | In both cases, the arguments are assumed to be in 1-1 correspondence already; 26 | it is not the job of a BoxCoder to perform matching. 27 | """ 28 | from abc import ABCMeta 29 | from abc import abstractmethod 30 | from abc import abstractproperty 31 | 32 | import tensorflow.compat.v1 as tf 33 | 34 | 35 | # Box coder types. 36 | FASTER_RCNN = 'faster_rcnn' 37 | KEYPOINT = 'keypoint' 38 | MEAN_STDDEV = 'mean_stddev' 39 | SQUARE = 'square' 40 | 41 | 42 | class BoxCoder(object): 43 | """Abstract base class for box coder.""" 44 | __metaclass__ = ABCMeta 45 | 46 | @abstractproperty 47 | def code_size(self): 48 | """Return the size of each code. 49 | 50 | This number is a constant and should agree with the output of the `encode` 51 | op (e.g. if rel_codes is the output of self.encode(...), then it should have 52 | shape [N, code_size()]). This abstractproperty should be overridden by 53 | implementations. 54 | 55 | Returns: 56 | an integer constant 57 | """ 58 | pass 59 | 60 | def encode(self, boxes, anchors): 61 | """Encode a box list relative to an anchor collection. 
62 | 63 | Args: 64 | boxes: BoxList holding N boxes to be encoded 65 | anchors: BoxList of N anchors 66 | 67 | Returns: 68 | a tensor representing N relative-encoded boxes 69 | """ 70 | with tf.name_scope('Encode'): 71 | return self._encode(boxes, anchors) 72 | 73 | def decode(self, rel_codes, anchors): 74 | """Decode boxes that are encoded relative to an anchor collection. 75 | 76 | Args: 77 | rel_codes: a tensor representing N relative-encoded boxes 78 | anchors: BoxList of anchors 79 | 80 | Returns: 81 | boxlist: BoxList holding N boxes encoded in the ordinary way (i.e., 82 | with corners y_min, x_min, y_max, x_max) 83 | """ 84 | with tf.name_scope('Decode'): 85 | return self._decode(rel_codes, anchors) 86 | 87 | @abstractmethod 88 | def _encode(self, boxes, anchors): 89 | """Method to be overridden by implementations. 90 | 91 | Args: 92 | boxes: BoxList holding N boxes to be encoded 93 | anchors: BoxList of N anchors 94 | 95 | Returns: 96 | a tensor representing N relative-encoded boxes 97 | """ 98 | pass 99 | 100 | @abstractmethod 101 | def _decode(self, rel_codes, anchors): 102 | """Method to be overridden by implementations. 103 | 104 | Args: 105 | rel_codes: a tensor representing N relative-encoded boxes 106 | anchors: BoxList of anchors 107 | 108 | Returns: 109 | boxlist: BoxList holding N boxes encoded in the ordinary way (i.e., 110 | with corners y_min, x_min, y_max, x_max) 111 | """ 112 | pass 113 | 114 | 115 | def batch_decode(encoded_boxes, box_coder, anchors): 116 | """Decode a batch of encoded boxes. 117 | 118 | This op takes a batch of encoded bounding boxes and transforms 119 | them to a batch of bounding boxes specified by their corners in 120 | the order of [y_min, x_min, y_max, x_max]. 121 | 122 | Args: 123 | encoded_boxes: a float32 tensor of shape [batch_size, num_anchors, 124 | code_size] representing the location of the objects. 125 | box_coder: a BoxCoder object. 126 | anchors: a BoxList of anchors used to encode `encoded_boxes`. 127 | 128 | Returns: 129 | decoded_boxes: a float32 tensor of shape [batch_size, num_anchors, 130 | coder_size] representing the corners of the objects in the order 131 | of [y_min, x_min, y_max, x_max]. 132 | 133 | Raises: 134 | ValueError: if batch sizes of the inputs are inconsistent, or if 135 | the number of anchors inferred from encoded_boxes and anchors are 136 | inconsistent. 137 | """ 138 | encoded_boxes.get_shape().assert_has_rank(3) 139 | if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static(): 140 | raise ValueError('The number of anchors inferred from encoded_boxes' 141 | ' and anchors are inconsistent: shape[1] of encoded_boxes' 142 | ' %s should be equal to the number of anchors: %s.' % 143 | (encoded_boxes.get_shape()[1].value, 144 | anchors.num_boxes_static())) 145 | 146 | decoded_boxes = tf.stack([ 147 | box_coder.decode(boxes, anchors).get() 148 | for boxes in tf.unstack(encoded_boxes) 149 | ]) 150 | return decoded_boxes 151 | -------------------------------------------------------------------------------- /efficientdet/object_detection/box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Bounding Box List definition. 16 | 17 | BoxList represents a list of bounding boxes as tensorflow 18 | tensors, where each bounding box is represented as a row of 4 numbers, 19 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes 20 | within a given list correspond to a single image. See also 21 | box_list_ops.py for common box related operations (such as area, iou, etc). 22 | 23 | Optionally, users can add additional related fields (such as weights). 24 | We assume the following things to be true about fields: 25 | * they correspond to boxes in the box_list along the 0th dimension 26 | * they have inferable rank at graph construction time 27 | * all dimensions except for possibly the 0th can be inferred 28 | (i.e., not None) at graph construction time. 29 | 30 | Some other notes: 31 | * Following tensorflow conventions, we use height, width ordering, 32 | and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering 33 | * Tensors are always provided as (flat) [N, 4] tensors. 34 | """ 35 | 36 | import tensorflow.compat.v1 as tf 37 | 38 | 39 | class BoxList(object): 40 | """Box collection.""" 41 | 42 | def __init__(self, boxes): 43 | """Constructs box collection. 44 | 45 | Args: 46 | boxes: a tensor of shape [N, 4] representing box corners 47 | 48 | Raises: 49 | ValueError: if invalid dimensions for bbox data or if bbox data is not in 50 | float32 format. 51 | """ 52 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 53 | raise ValueError('Invalid dimensions for box data.') 54 | if boxes.dtype != tf.float32: 55 | raise ValueError('Invalid tensor type: should be tf.float32') 56 | self.data = {'boxes': boxes} 57 | 58 | def num_boxes(self): 59 | """Returns number of boxes held in collection. 60 | 61 | Returns: 62 | a tensor representing the number of boxes held in the collection. 63 | """ 64 | return tf.shape(self.data['boxes'])[0] 65 | 66 | def num_boxes_static(self): 67 | """Returns number of boxes held in collection. 68 | 69 | This number is inferred at graph construction time rather than run-time. 70 | 71 | Returns: 72 | Number of boxes held in collection (integer) or None if this is not 73 | inferable at graph construction time. 74 | """ 75 | return self.data['boxes'].get_shape().as_list()[0] 76 | 77 | def get_all_fields(self): 78 | """Returns all fields.""" 79 | return self.data.keys() 80 | 81 | def get_extra_fields(self): 82 | """Returns all non-box fields (i.e., everything not named 'boxes').""" 83 | return [k for k in self.data.keys() if k != 'boxes'] 84 | 85 | def add_field(self, field, field_data): 86 | """Add field to box list. 87 | 88 | This method can be used to add related box data such as 89 | weights/labels, etc. 
90 | 91 | Args: 92 | field: a string key to access the data via `get` 93 | field_data: a tensor containing the data to store in the BoxList 94 | """ 95 | self.data[field] = field_data 96 | 97 | def has_field(self, field): 98 | return field in self.data 99 | 100 | def get(self): 101 | """Convenience function for accessing box coordinates. 102 | 103 | Returns: 104 | a tensor with shape [N, 4] representing box coordinates. 105 | """ 106 | return self.get_field('boxes') 107 | 108 | def set(self, boxes): 109 | """Convenience function for setting box coordinates. 110 | 111 | Args: 112 | boxes: a tensor of shape [N, 4] representing box corners 113 | 114 | Raises: 115 | ValueError: if invalid dimensions for bbox data 116 | """ 117 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 118 | raise ValueError('Invalid dimensions for box data.') 119 | self.data['boxes'] = boxes 120 | 121 | def get_field(self, field): 122 | """Accesses a box collection and associated fields. 123 | 124 | This function returns specified field with object; if no field is specified, 125 | it returns the box coordinates. 126 | 127 | Args: 128 | field: this optional string parameter can be used to specify 129 | a related field to be accessed. 130 | 131 | Returns: 132 | a tensor representing the box collection or an associated field. 133 | 134 | Raises: 135 | ValueError: if invalid field 136 | """ 137 | if not self.has_field(field): 138 | raise ValueError('field ' + str(field) + ' does not exist') 139 | return self.data[field] 140 | 141 | def set_field(self, field, value): 142 | """Sets the value of a field. 143 | 144 | Updates the field of a box_list with a given value. 145 | 146 | Args: 147 | field: (string) name of the field to set value. 148 | value: the value to assign to the field. 149 | 150 | Raises: 151 | ValueError: if the box_list does not have specified field. 152 | """ 153 | if not self.has_field(field): 154 | raise ValueError('field %s does not exist' % field) 155 | self.data[field] = value 156 | 157 | def get_center_coordinates_and_sizes(self, scope=None): 158 | """Computes the center coordinates, height and width of the boxes. 159 | 160 | Args: 161 | scope: name scope of the function. 162 | 163 | Returns: 164 | a list of 4 1-D tensors [ycenter, xcenter, height, width]. 165 | """ 166 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): 167 | box_corners = self.get() 168 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) 169 | width = xmax - xmin 170 | height = ymax - ymin 171 | ycenter = ymin + height / 2. 172 | xcenter = xmin + width / 2. 173 | return [ycenter, xcenter, height, width] 174 | 175 | def transpose_coordinates(self, scope=None): 176 | """Transpose the coordinate representation in a boxlist. 177 | 178 | Args: 179 | scope: name scope of the function. 180 | """ 181 | with tf.name_scope(scope, 'transpose_coordinates'): 182 | y_min, x_min, y_max, x_max = tf.split( 183 | value=self.get(), num_or_size_splits=4, axis=1) 184 | self.set(tf.concat([x_min, y_min, x_max, y_max], 1)) 185 | 186 | def as_tensor_dict(self, fields=None): 187 | """Retrieves specified fields as a dictionary of tensors. 188 | 189 | Args: 190 | fields: (optional) list of fields to return in the dictionary. 191 | If None (default), all fields are returned. 192 | 193 | Returns: 194 | tensor_dict: A dictionary of tensors specified by fields. 195 | 196 | Raises: 197 | ValueError: if specified field is not contained in boxlist. 
198 | """ 199 | tensor_dict = {} 200 | if fields is None: 201 | fields = self.get_all_fields() 202 | for field in fields: 203 | if not self.has_field(field): 204 | raise ValueError('boxlist must contain all specified fields') 205 | tensor_dict[field] = self.get_field(field) 206 | return tensor_dict 207 | -------------------------------------------------------------------------------- /efficientdet/object_detection/faster_rcnn_box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Faster RCNN box coder. 16 | 17 | Faster RCNN box coder follows the coding schema described below: 18 | ty = (y - ya) / ha 19 | tx = (x - xa) / wa 20 | th = log(h / ha) 21 | tw = log(w / wa) 22 | where x, y, w, h denote the box's center coordinates, width and height 23 | respectively. Similarly, xa, ya, wa, ha denote the anchor's center 24 | coordinates, width and height. tx, ty, tw and th denote the anchor-encoded 25 | center, width and height respectively. 26 | 27 | See http://arxiv.org/abs/1506.01497 for details. 28 | """ 29 | 30 | import tensorflow.compat.v1 as tf 31 | 32 | from . import box_coder 33 | from . import box_list 34 | 35 | EPSILON = 1e-8 36 | 37 | 38 | class FasterRcnnBoxCoder(box_coder.BoxCoder): 39 | """Faster RCNN box coder.""" 40 | 41 | def __init__(self, scale_factors=None): 42 | """Constructor for FasterRcnnBoxCoder. 43 | 44 | Args: 45 | scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. 46 | If set to None, does not perform scaling. For Faster RCNN, 47 | the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. 48 | """ 49 | if scale_factors: 50 | assert len(scale_factors) == 4 51 | for scalar in scale_factors: 52 | assert scalar > 0 53 | self._scale_factors = scale_factors 54 | 55 | @property 56 | def code_size(self): 57 | return 4 58 | 59 | def _encode(self, boxes, anchors): 60 | """Encode a box collection with respect to anchor collection. 61 | 62 | Args: 63 | boxes: BoxList holding N boxes to be encoded. 64 | anchors: BoxList of anchors. 65 | 66 | Returns: 67 | a tensor representing N anchor-encoded boxes of the format 68 | [ty, tx, th, tw]. 69 | """ 70 | # Convert anchors to the center coordinate representation. 71 | ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() 72 | ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() 73 | # Avoid NaN in division and log below. 74 | ha += EPSILON 75 | wa += EPSILON 76 | h += EPSILON 77 | w += EPSILON 78 | 79 | tx = (xcenter - xcenter_a) / wa 80 | ty = (ycenter - ycenter_a) / ha 81 | tw = tf.log(w / wa) 82 | th = tf.log(h / ha) 83 | # Scales location targets as used in paper for joint training. 
84 | if self._scale_factors: 85 | ty *= self._scale_factors[0] 86 | tx *= self._scale_factors[1] 87 | th *= self._scale_factors[2] 88 | tw *= self._scale_factors[3] 89 | return tf.transpose(tf.stack([ty, tx, th, tw])) 90 | 91 | def _decode(self, rel_codes, anchors): 92 | """Decode relative codes to boxes. 93 | 94 | Args: 95 | rel_codes: a tensor representing N anchor-encoded boxes. 96 | anchors: BoxList of anchors. 97 | 98 | Returns: 99 | boxes: BoxList holding N bounding boxes. 100 | """ 101 | ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() 102 | 103 | ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes)) 104 | if self._scale_factors: 105 | ty /= self._scale_factors[0] 106 | tx /= self._scale_factors[1] 107 | th /= self._scale_factors[2] 108 | tw /= self._scale_factors[3] 109 | w = tf.exp(tw) * wa 110 | h = tf.exp(th) * ha 111 | ycenter = ty * ha + ycenter_a 112 | xcenter = tx * wa + xcenter_a 113 | ymin = ycenter - h / 2. 114 | xmin = xcenter - w / 2. 115 | ymax = ycenter + h / 2. 116 | xmax = xcenter + w / 2. 117 | return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) 118 | -------------------------------------------------------------------------------- /efficientdet/object_detection/region_similarity_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Region Similarity Calculators for BoxLists. 16 | 17 | Region Similarity Calculators compare a pairwise measure of similarity 18 | between the boxes in two BoxLists. 19 | """ 20 | from abc import ABCMeta 21 | from abc import abstractmethod 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | 26 | def area(boxlist, scope=None): 27 | """Computes area of boxes. 28 | 29 | Args: 30 | boxlist: BoxList holding N boxes 31 | scope: name scope. 32 | 33 | Returns: 34 | a tensor with shape [N] representing box areas. 35 | """ 36 | with tf.name_scope(scope, 'Area'): 37 | y_min, x_min, y_max, x_max = tf.split( 38 | value=boxlist.get(), num_or_size_splits=4, axis=1) 39 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 40 | 41 | 42 | def intersection(boxlist1, boxlist2, scope=None): 43 | """Compute pairwise intersection areas between boxes. 44 | 45 | Args: 46 | boxlist1: BoxList holding N boxes 47 | boxlist2: BoxList holding M boxes 48 | scope: name scope. 
49 | 50 | Returns: 51 | a tensor with shape [N, M] representing pairwise intersections 52 | """ 53 | with tf.name_scope(scope, 'Intersection'): 54 | y_min1, x_min1, y_max1, x_max1 = tf.split( 55 | value=boxlist1.get(), num_or_size_splits=4, axis=1) 56 | y_min2, x_min2, y_max2, x_max2 = tf.split( 57 | value=boxlist2.get(), num_or_size_splits=4, axis=1) 58 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 59 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 60 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 61 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 62 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 63 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 64 | return intersect_heights * intersect_widths 65 | 66 | 67 | def iou(boxlist1, boxlist2, scope=None): 68 | """Computes pairwise intersection-over-union between box collections. 69 | 70 | Args: 71 | boxlist1: BoxList holding N boxes 72 | boxlist2: BoxList holding M boxes 73 | scope: name scope. 74 | 75 | Returns: 76 | a tensor with shape [N, M] representing pairwise iou scores. 77 | """ 78 | with tf.name_scope(scope, 'IOU'): 79 | intersections = intersection(boxlist1, boxlist2) 80 | areas1 = area(boxlist1) 81 | areas2 = area(boxlist2) 82 | unions = ( 83 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 84 | return tf.where( 85 | tf.equal(intersections, 0.0), 86 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 87 | 88 | 89 | class RegionSimilarityCalculator(object): 90 | """Abstract base class for region similarity calculator.""" 91 | __metaclass__ = ABCMeta 92 | 93 | def compare(self, boxlist1, boxlist2, scope=None): 94 | """Computes matrix of pairwise similarity between BoxLists. 95 | 96 | This op (to be overridden) computes a measure of pairwise similarity between 97 | the boxes in the given BoxLists. Higher values indicate more similarity. 98 | 99 | Note that this method simply measures similarity and does not explicitly 100 | perform a matching. 101 | 102 | Args: 103 | boxlist1: BoxList holding N boxes. 104 | boxlist2: BoxList holding M boxes. 105 | scope: Op scope name. Defaults to 'Compare' if None. 106 | 107 | Returns: 108 | a (float32) tensor of shape [N, M] with pairwise similarity score. 109 | """ 110 | with tf.name_scope(scope, 'Compare', [boxlist1, boxlist2]) as scope: 111 | return self._compare(boxlist1, boxlist2) 112 | 113 | @abstractmethod 114 | def _compare(self, boxlist1, boxlist2): 115 | pass 116 | 117 | 118 | class IouSimilarity(RegionSimilarityCalculator): 119 | """Class to compute similarity based on Intersection over Union (IOU) metric. 120 | 121 | This class computes pairwise similarity between two BoxLists based on IOU. 122 | """ 123 | 124 | def _compare(self, boxlist1, boxlist2): 125 | """Compute pairwise IOU similarity between the two BoxLists. 126 | 127 | Args: 128 | boxlist1: BoxList holding N boxes. 129 | boxlist2: BoxList holding M boxes. 130 | 131 | Returns: 132 | A tensor with shape [N, M] representing pairwise iou scores. 133 | """ 134 | return iou(boxlist1, boxlist2) 135 | -------------------------------------------------------------------------------- /efficientdet/object_detection/shape_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Utils used to manipulate tensor shapes.""" 16 | 17 | import tensorflow.compat.v1 as tf 18 | 19 | 20 | def assert_shape_equal(shape_a, shape_b): 21 | """Asserts that shape_a and shape_b are equal. 22 | 23 | If the shapes are static, raises a ValueError when the shapes 24 | mismatch. 25 | 26 | If the shapes are dynamic, raises a tf InvalidArgumentError when the shapes 27 | mismatch. 28 | 29 | Args: 30 | shape_a: a list containing shape of the first tensor. 31 | shape_b: a list containing shape of the second tensor. 32 | 33 | Returns: 34 | Either a tf.no_op() when shapes are all static and a tf.assert_equal() op 35 | when the shapes are dynamic. 36 | 37 | Raises: 38 | ValueError: When shapes are both static and unequal. 39 | """ 40 | if (all(isinstance(dim, int) for dim in shape_a) and 41 | all(isinstance(dim, int) for dim in shape_b)): 42 | if shape_a != shape_b: 43 | raise ValueError('Unequal shapes {}, {}'.format(shape_a, shape_b)) 44 | else: return tf.no_op() 45 | else: 46 | return tf.assert_equal(shape_a, shape_b) 47 | 48 | 49 | def combined_static_and_dynamic_shape(tensor): 50 | """Returns a list containing static and dynamic values for the dimensions. 51 | 52 | Returns a list of static and dynamic values for shape dimensions. This is 53 | useful to preserve static shapes when available in reshape operation. 54 | 55 | Args: 56 | tensor: A tensor of any type. 57 | 58 | Returns: 59 | A list of size tensor.shape.ndims containing integers or a scalar tensor. 60 | """ 61 | static_tensor_shape = tensor.shape.as_list() 62 | dynamic_tensor_shape = tf.shape(tensor) 63 | combined_shape = [] 64 | for index, dim in enumerate(static_tensor_shape): 65 | if dim is not None: 66 | combined_shape.append(dim) 67 | else: 68 | combined_shape.append(dynamic_tensor_shape[index]) 69 | return combined_shape 70 | -------------------------------------------------------------------------------- /efficientdet/object_detection/tf_example_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Research. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tensorflow Example proto decoder for object detection. 
16 | 17 | A decoder to decode string tensors containing serialized tensorflow.Example 18 | protos for object detection. 19 | """ 20 | 21 | import tensorflow.compat.v1 as tf 22 | 23 | 24 | def _get_source_id_from_encoded_image(parsed_tensors): 25 | return tf.strings.as_string( 26 | tf.strings.to_hash_bucket_fast(parsed_tensors['image/encoded'], 27 | 2**63 - 1)) 28 | 29 | 30 | class TfExampleDecoder(object): 31 | """Tensorflow Example proto decoder.""" 32 | 33 | def __init__(self, include_mask=False, regenerate_source_id=False): 34 | self._include_mask = include_mask 35 | self._regenerate_source_id = regenerate_source_id 36 | self._keys_to_features = { 37 | 'image/encoded': tf.FixedLenFeature((), tf.string), 38 | 'image/source_id': tf.FixedLenFeature((), tf.string, ''), 39 | 'image/height': tf.FixedLenFeature((), tf.int64, -1), 40 | 'image/width': tf.FixedLenFeature((), tf.int64, -1), 41 | 'image/object/bbox/xmin': tf.VarLenFeature(tf.float32), 42 | 'image/object/bbox/xmax': tf.VarLenFeature(tf.float32), 43 | 'image/object/bbox/ymin': tf.VarLenFeature(tf.float32), 44 | 'image/object/bbox/ymax': tf.VarLenFeature(tf.float32), 45 | 'image/object/class/label': tf.VarLenFeature(tf.int64), 46 | 'image/object/area': tf.VarLenFeature(tf.float32), 47 | 'image/object/is_crowd': tf.VarLenFeature(tf.int64), 48 | } 49 | if include_mask: 50 | self._keys_to_features.update({ 51 | 'image/object/mask': 52 | tf.VarLenFeature(tf.string), 53 | }) 54 | 55 | def _decode_image(self, parsed_tensors): 56 | """Decodes the image and set its static shape.""" 57 | image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3) 58 | image.set_shape([None, None, 3]) 59 | return image 60 | 61 | def _decode_boxes(self, parsed_tensors): 62 | """Concat box coordinates in the format of [ymin, xmin, ymax, xmax].""" 63 | xmin = parsed_tensors['image/object/bbox/xmin'] 64 | xmax = parsed_tensors['image/object/bbox/xmax'] 65 | ymin = parsed_tensors['image/object/bbox/ymin'] 66 | ymax = parsed_tensors['image/object/bbox/ymax'] 67 | return tf.stack([ymin, xmin, ymax, xmax], axis=-1) 68 | 69 | def _decode_masks(self, parsed_tensors): 70 | """Decode a set of PNG masks to the tf.float32 tensors.""" 71 | def _decode_png_mask(png_bytes): 72 | mask = tf.squeeze( 73 | tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1) 74 | mask = tf.cast(mask, dtype=tf.float32) 75 | mask.set_shape([None, None]) 76 | return mask 77 | 78 | height = parsed_tensors['image/height'] 79 | width = parsed_tensors['image/width'] 80 | masks = parsed_tensors['image/object/mask'] 81 | return tf.cond( 82 | tf.greater(tf.size(masks), 0), 83 | lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32), 84 | lambda: tf.zeros([0, height, width], dtype=tf.float32)) 85 | 86 | def _decode_areas(self, parsed_tensors): 87 | xmin = parsed_tensors['image/object/bbox/xmin'] 88 | xmax = parsed_tensors['image/object/bbox/xmax'] 89 | ymin = parsed_tensors['image/object/bbox/ymin'] 90 | ymax = parsed_tensors['image/object/bbox/ymax'] 91 | return tf.cond( 92 | tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0), 93 | lambda: parsed_tensors['image/object/area'], 94 | lambda: (xmax - xmin) * (ymax - ymin)) 95 | 96 | def decode(self, serialized_example): 97 | """Decode the serialized example. 98 | 99 | Args: 100 | serialized_example: a single serialized tf.Example string. 101 | 102 | Returns: 103 | decoded_tensors: a dictionary of tensors with the following fields: 104 | - image: a uint8 tensor of shape [None, None, 3]. 
105 | - source_id: a string scalar tensor. 106 | - height: an integer scalar tensor. 107 | - width: an integer scalar tensor. 108 | - groundtruth_classes: a int64 tensor of shape [None]. 109 | - groundtruth_is_crowd: a bool tensor of shape [None]. 110 | - groundtruth_area: a float32 tensor of shape [None]. 111 | - groundtruth_boxes: a float32 tensor of shape [None, 4]. 112 | - groundtruth_instance_masks: a float32 tensor of shape 113 | [None, None, None]. 114 | - groundtruth_instance_masks_png: a string tensor of shape [None]. 115 | """ 116 | parsed_tensors = tf.io.parse_single_example( 117 | serialized_example, self._keys_to_features) 118 | for k in parsed_tensors: 119 | if isinstance(parsed_tensors[k], tf.SparseTensor): 120 | if parsed_tensors[k].dtype == tf.string: 121 | parsed_tensors[k] = tf.sparse_tensor_to_dense( 122 | parsed_tensors[k], default_value='') 123 | else: 124 | parsed_tensors[k] = tf.sparse_tensor_to_dense( 125 | parsed_tensors[k], default_value=0) 126 | 127 | image = self._decode_image(parsed_tensors) 128 | boxes = self._decode_boxes(parsed_tensors) 129 | areas = self._decode_areas(parsed_tensors) 130 | 131 | decode_image_shape = tf.logical_or( 132 | tf.equal(parsed_tensors['image/height'], -1), 133 | tf.equal(parsed_tensors['image/width'], -1)) 134 | image_shape = tf.cast(tf.shape(image), dtype=tf.int64) 135 | 136 | parsed_tensors['image/height'] = tf.where(decode_image_shape, 137 | image_shape[0], 138 | parsed_tensors['image/height']) 139 | parsed_tensors['image/width'] = tf.where(decode_image_shape, image_shape[1], 140 | parsed_tensors['image/width']) 141 | 142 | is_crowds = tf.cond( 143 | tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0), 144 | lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool), 145 | lambda: tf.zeros_like(parsed_tensors['image/object/class/label'], dtype=tf.bool)) # pylint: disable=line-too-long 146 | if self._regenerate_source_id: 147 | source_id = _get_source_id_from_encoded_image(parsed_tensors) 148 | else: 149 | source_id = tf.cond( 150 | tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 151 | 0), lambda: parsed_tensors['image/source_id'], 152 | lambda: _get_source_id_from_encoded_image(parsed_tensors)) 153 | if self._include_mask: 154 | masks = self._decode_masks(parsed_tensors) 155 | 156 | decoded_tensors = { 157 | 'image': image, 158 | 'source_id': source_id, 159 | 'height': parsed_tensors['image/height'], 160 | 'width': parsed_tensors['image/width'], 161 | 'groundtruth_classes': parsed_tensors['image/object/class/label'], 162 | 'groundtruth_is_crowd': is_crowds, 163 | 'groundtruth_area': areas, 164 | 'groundtruth_boxes': boxes, 165 | } 166 | if self._include_mask: 167 | decoded_tensors.update({ 168 | 'groundtruth_instance_masks': masks, 169 | 'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'], 170 | }) 171 | return decoded_tensors 172 | -------------------------------------------------------------------------------- /generate_anchors.py: -------------------------------------------------------------------------------- 1 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py 2 | 3 | # -------------------------------------------------------- 4 | # Faster R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick and Sean Bell 8 | # -------------------------------------------------------- 9 | 10 | from six.moves import range 11 | import numpy as np 12 | 13 | # 
Verify that we compute the same anchors as Shaoqing's matlab implementation: 14 | # 15 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 16 | # >> anchors 17 | # 18 | # anchors = 19 | # 20 | # -83 -39 100 56 21 | # -175 -87 192 104 22 | # -359 -183 376 200 23 | # -55 -55 72 72 24 | # -119 -119 136 136 25 | # -247 -247 264 264 26 | # -35 -79 52 96 27 | # -79 -167 96 184 28 | # -167 -343 184 360 29 | 30 | #array([[ -83., -39., 100., 56.], 31 | # [-175., -87., 192., 104.], 32 | # [-359., -183., 376., 200.], 33 | # [ -55., -55., 72., 72.], 34 | # [-119., -119., 136., 136.], 35 | # [-247., -247., 264., 264.], 36 | # [ -35., -79., 52., 96.], 37 | # [ -79., -167., 96., 184.], 38 | # [-167., -343., 184., 360.]]) 39 | # base_size -> anchor_stride=16, 40 | # scales -> scales=np.array((32, 64, 128, 256, 512), dtype=np.float) / 16, 41 | # generate anchor for one position 42 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 43 | scales=2**np.arange(3, 6)): 44 | """ 45 | Generate anchor (reference) windows by enumerating aspect ratios X 46 | scales wrt a reference (0, 0, 15, 15) window. 47 | """ 48 | # anchor box, 0-indexed, x1,y1,x2,y2 49 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1 50 | # with the same center, same size, -> [0.5,1.0,2.0] boxes 51 | # [[0,0,15,15],[0,0,22,11.],..] 52 | ratio_anchors = _ratio_enum(base_anchor, ratios) 53 | # -> [[0,0,31,31],....] 54 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 55 | for i in range(ratio_anchors.shape[0])]) 56 | return anchors 57 | 58 | def _whctrs(anchor): # x1,y1,x2,y2: (0,0,15,15) -> (16,16,8,8) 59 | """ 60 | Return width, height, x center, and y center for an anchor (window). 61 | """ 62 | 63 | w = anchor[2] - anchor[0] + 1 64 | h = anchor[3] - anchor[1] + 1 65 | x_ctr = anchor[0] + 0.5 * (w - 1) 66 | y_ctr = anchor[1] + 0.5 * (h - 1) 67 | return w, h, x_ctr, y_ctr 68 | 69 | def _mkanchors(ws, hs, x_ctr, y_ctr): 70 | """ 71 | Given a vector of widths (ws) and heights (hs) around a center 72 | (x_ctr, y_ctr), output a set of anchors (windows). 73 | """ 74 | 75 | ws = ws[:, np.newaxis] # [k] -> [k,1] 76 | hs = hs[:, np.newaxis] 77 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 78 | y_ctr - 0.5 * (hs - 1), 79 | x_ctr + 0.5 * (ws - 1), 80 | y_ctr + 0.5 * (hs - 1))) 81 | return anchors 82 | 83 | def _ratio_enum(anchor, ratios): 84 | """ 85 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 86 | """ 87 | 88 | w, h, x_ctr, y_ctr = _whctrs(anchor) # 0,0,15,15 -> # 16,16, 8,8, 89 | size = w * h # 16 * 16 = 256 90 | # given the same size, get the box with different ratio 91 | size_ratios = size / ratios # ratios: [0.5,1,2] -> [512,256,128] 92 | ws = np.round(np.sqrt(size_ratios)) # np_round to a int, -> [sqrt(512),16,sqrt(128)] 93 | hs = np.round(ws * ratios) # [sqrt(512)*0.5, 16 * 1, sqrt(128)*2] 94 | # ws*hs == w*h 95 | # get anchors with the same x,y,center 96 | # a list of [x1,y1,x2,y2] 97 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 98 | return anchors 99 | 100 | def _scale_enum(anchor, scales): 101 | """ 102 | Enumerate a set of anchors for each scale wrt an anchor. 
103 | """ 104 | 105 | w, h, x_ctr, y_ctr = _whctrs(anchor) 106 | ws = w * scales 107 | hs = h * scales 108 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 109 | return anchors 110 | 111 | if __name__ == '__main__': 112 | #import time 113 | #t = time.time() 114 | #a = generate_anchors() 115 | #print(time.time() - t) 116 | #print(a) 117 | #from IPython import embed; embed() 118 | 119 | print(generate_anchors( 120 | 16, scales=np.asarray((2, 4, 8, 16, 32), 'float32'), 121 | ratios=[0.5,1,2])) 122 | -------------------------------------------------------------------------------- /generate_util_graph.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # generate cpu/gpu util graph based on the json data 3 | 4 | import argparse 5 | import sys 6 | import os 7 | import json 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | from matplotlib import pyplot as plt 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("logs") 14 | parser.add_argument("output_png") 15 | 16 | if __name__ == "__main__": 17 | args = parser.parse_args() 18 | 19 | with open(args.logs, "r") as f: 20 | data = json.load(f) 21 | 22 | # timing as the timestamp as the x axis, and others as y axis 23 | # timestamp to local time in seconds 24 | start_time = data["timing"][0] 25 | timings = [round(o - start_time, 1) for o in data["timing"]] 26 | 27 | # cpu and gpu util 28 | cpu_util = [round(o, 1) for o in data["cpu_utilization"]] 29 | gpu_util = [round(o, 1) for o in data["gpu_utilization"]] 30 | 31 | # gpu mem and ram, in MB 32 | ram_used = [round(o, 1) for o in data["ram_used"]] 33 | gpu_mem = [round(o, 1) for o in data["gpu_memory"]] 34 | 35 | # plot! 36 | plt.figure(figsize=(10, 6)) 37 | # cpu util 38 | plt.subplot(221) 39 | plt.plot(timings, cpu_util, "g-") 40 | plt.title("cpu util %") 41 | plt.xlabel("seconds") 42 | plt.grid(True) 43 | 44 | plt.subplot(222) 45 | plt.plot(timings, ram_used, "g-") 46 | plt.title("ram used (MB)") 47 | plt.xlabel("seconds") 48 | plt.grid(True) 49 | 50 | plt.subplot(223) 51 | plt.plot(timings, gpu_util, "b-") 52 | plt.title("gpu util %") 53 | plt.xlabel("seconds") 54 | plt.grid(True) 55 | 56 | plt.subplot(224) 57 | plt.plot(timings, gpu_mem, "b-") 58 | plt.title("GPU mem (MB)") 59 | plt.xlabel("seconds") 60 | plt.grid(True) 61 | 62 | plt.subplots_adjust(hspace=0.5, wspace=0.3) 63 | 64 | plt.savefig(args.output_png, dpi=400) 65 | 66 | 67 | -------------------------------------------------------------------------------- /get_frames_resize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | given a list of videos, get all the frames and resize note that the 4 | frames are 0-indexed 5 | """ 6 | 7 | import argparse 8 | import cv2 9 | import os 10 | import pickle 11 | import sys 12 | 13 | from tqdm import tqdm 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument("videolist") 18 | parser.add_argument("despath") 19 | 20 | parser.add_argument("--resize", default=False, action="store_true") 21 | parser.add_argument("--size", default=800, type=int) 22 | parser.add_argument("--maxsize", default=1333, type=int) 23 | 24 | 25 | 26 | parser.add_argument("--job", type=int, default=1, help="total job") 27 | parser.add_argument("--curJob", type=int, default=1, 28 | help="this script run job Num") 29 | parser.add_argument("--statspath", default=None, 30 | help="path to write videoname.p to save some stats for " 31 | "that video") 32 | 
parser.add_argument("--use_2level", action="store_true", 33 | help="make videoname/frames dir") 34 | parser.add_argument("--name_level", type=int, default=None, 35 | help="add the top level folder name to the videoname") 36 | parser.add_argument("--cv2path", default=None) 37 | 38 | parser.add_argument("--use_moviepy", action="store_true") 39 | parser.add_argument("--use_lijun", action="store_true") 40 | 41 | 42 | def get_new_hw(h, w, size, max_size): 43 | """Get new hw.""" 44 | scale = size * 1.0 / min(h, w) 45 | if h < w: 46 | newh, neww = size, scale * w 47 | else: 48 | newh, neww = scale * h, size 49 | if max(newh, neww) > max_size: 50 | scale = max_size * 1.0 / max(newh, neww) 51 | newh = newh * scale 52 | neww = neww * scale 53 | neww = int(neww + 0.5) 54 | newh = int(newh + 0.5) 55 | return neww, newh 56 | 57 | 58 | if __name__ == "__main__": 59 | args = parser.parse_args() 60 | if args.cv2path is not None: 61 | sys.path = [args.cv2path] + sys.path 62 | 63 | if args.use_moviepy: 64 | from moviepy.editor import VideoFileClip 65 | elif args.use_lijun: 66 | from diva_io.video import VideoReader 67 | 68 | # still need this to write image 69 | print("using opencv version:%s"%(cv2.__version__)) 70 | 71 | if not os.path.exists(args.despath): 72 | os.makedirs(args.despath) 73 | 74 | if args.statspath is not None and not os.path.exists(args.statspath): 75 | os.makedirs(args.statspath) 76 | 77 | count = 0 78 | for line in tqdm(open(args.videolist, "r").readlines()): 79 | count += 1 80 | if (count % args.job) != (args.curJob-1): 81 | continue 82 | 83 | video = line.strip() 84 | 85 | stats = {"h":None, "w":None, "fps":None, "frame_count":None, 86 | "actual_frame_count":None} 87 | 88 | videoname = os.path.splitext(os.path.basename(video))[0] 89 | 90 | targetpath = args.despath 91 | 92 | if args.use_2level: 93 | targetpath = os.path.join(args.despath, videoname) 94 | if not os.path.exists(targetpath): 95 | os.makedirs(targetpath) 96 | 97 | if args.name_level is not None: 98 | foldernames = video.split("/") 99 | prefixes = foldernames[-1-args.name_level:-1] 100 | videoname = "__".join(prefixes + [videoname]) 101 | 102 | if args.use_moviepy: 103 | vcap = VideoFileClip(video, audio=False) 104 | frame_count = int(vcap.fps * vcap.duration) # uh 105 | vcap_iter = vcap.iter_frames() 106 | elif args.use_lijun: 107 | vcap = VideoReader(video) 108 | frame_count = int(vcap.length) 109 | else: 110 | try: 111 | vcap = cv2.VideoCapture(video) 112 | if not vcap.isOpened(): 113 | raise Exception("cannot open %s"%video) 114 | except Exception as e: 115 | raise e 116 | 117 | if cv2.__version__.split(".") != "2": 118 | frame_width = vcap.get(cv2.CAP_PROP_FRAME_WIDTH) 119 | frame_height = vcap.get(cv2.CAP_PROP_FRAME_HEIGHT) 120 | 121 | fps = vcap.get(cv2.CAP_PROP_FPS) 122 | frame_count = vcap.get(cv2.CAP_PROP_FRAME_COUNT) 123 | else: 124 | frame_width = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH) 125 | frame_height = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT) 126 | 127 | fps = vcap.get(cv2.cv.CV_CAP_PROP_FPS) 128 | frame_count = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT) 129 | stats['h'] = frame_height 130 | stats['w'] = frame_width 131 | 132 | stats['fps'] = fps 133 | 134 | 135 | 136 | stats['frame_count'] = frame_count 137 | 138 | cur_frame = 0 139 | count_actual = 0 140 | while cur_frame < frame_count: 141 | if args.use_moviepy: 142 | suc = True 143 | frame = next(vcap_iter) 144 | 145 | else: 146 | suc, frame = vcap.read() 147 | 148 | if not suc: 149 | cur_frame += 1 150 | tqdm.write("warning, %s frame of %s failed" % 
(cur_frame, videoname)) 151 | continue 152 | count_actual += 1 153 | if args.use_moviepy: 154 | # moviepy ask ffmpeg to get rgb24 155 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) 156 | 157 | frame = frame.astype("float32") 158 | 159 | if args.resize: 160 | neww, newh = get_new_hw(frame.shape[0], 161 | frame.shape[1], args.size, args.maxsize) 162 | 163 | frame = cv2.resize(frame, (neww, newh), interpolation=cv2.INTER_LINEAR) 164 | 165 | cv2.imwrite(os.path.join(targetpath, 166 | "%s_F_%08d.jpg" % (videoname, cur_frame)), frame) 167 | 168 | cur_frame += 1 169 | 170 | stats['actual_frame_count'] = count_actual 171 | 172 | if args.statspath is not None: 173 | with open(os.path.join(args.statspath, "%s.p" % videoname), "wb") as fs: 174 | pickle.dump(stats, fs) 175 | if not args.use_moviepy and not args.use_lijun: 176 | vcap.release() 177 | -------------------------------------------------------------------------------- /images/Person_vis_video.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/Person_vis_video.gif -------------------------------------------------------------------------------- /images/Vehicle_vis_video.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/Vehicle_vis_video.gif -------------------------------------------------------------------------------- /images/actev-prizechallenge-06-2019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/actev-prizechallenge-06-2019.png -------------------------------------------------------------------------------- /images/inf_actev_0.49audc_02-2020.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/inf_actev_0.49audc_02-2020.png -------------------------------------------------------------------------------- /images/multi-camera-reid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/multi-camera-reid.gif -------------------------------------------------------------------------------- /images/person_multi_reid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/person_multi_reid.gif -------------------------------------------------------------------------------- /images/person_multi_reid2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/person_multi_reid2.gif -------------------------------------------------------------------------------- /images/util_log_b1partial.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/util_log_b1partial.png -------------------------------------------------------------------------------- /images/util_log_b8multithread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/util_log_b8multithread.png -------------------------------------------------------------------------------- /images/vehicle_multi_reid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/vehicle_multi_reid.gif -------------------------------------------------------------------------------- /tensorrt_optimize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Given the tensorflow frozen graph, use TensorRT to optimize, 3 | get a new frozen graph.""" 4 | 5 | from __future__ import print_function 6 | 7 | import argparse 8 | import time 9 | import tensorflow as tf 10 | import tensorflow.contrib.tensorrt as trt 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("pbfile") 14 | parser.add_argument("newpbfile") 15 | parser.add_argument("--precision_mode", default="FP32", 16 | help="FP32, FP16, or INT8") 17 | parser.add_argument("--maximum_cached_engines", default=100, 18 | help="Don't know what this does.") 19 | 20 | 21 | # parameter 22 | # https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html 23 | if __name__ == "__main__": 24 | args = parser.parse_args() 25 | 26 | max_batch_size = 1 27 | precision_mode = args.precision_mode 28 | minimum_segment_size = 2 # smaller the faster? 5 -60? 
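  # Note on these conversion knobs (summarized from the TF-TRT user guide
  # linked in the comment above, not from this repo): minimum_segment_size is
  # the smallest number of consecutive TF ops that get fused into a single
  # TRTEngineOp; max_workspace_size_bytes caps the temporary GPU memory
  # TensorRT may use when building/executing an engine; and
  # maximum_cached_engines bounds how many engines a dynamic TRTEngineOp keeps
  # cached for different input shapes (relevant here since is_dynamic_op=True
  # is passed below).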
29 | max_workspace_size_bytes = 1 << 32 30 | maximum_cached_engines = args.maximum_cached_engines 31 | 32 | output_names = [ 33 | "final_boxes", 34 | "final_labels", 35 | "final_probs", 36 | "fpn_box_feat", 37 | ] 38 | 39 | tf_config = tf.ConfigProto() 40 | tf_config.gpu_options.allow_growth = True 41 | 42 | with tf.Graph().as_default() as tf_graph: 43 | with tf.Session(config=tf_config) as tf_sess: 44 | with tf.gfile.GFile(args.pbfile, "rb") as f: 45 | frozen_graph = tf.GraphDef() 46 | frozen_graph.ParseFromString(f.read()) 47 | 48 | graph_size = len(frozen_graph.SerializeToString()) 49 | num_nodes = len(frozen_graph.node) 50 | start_time = time.time() 51 | frozen_graph = trt.create_inference_graph( 52 | input_graph_def=frozen_graph, 53 | outputs=output_names, 54 | max_batch_size=max_batch_size, 55 | max_workspace_size_bytes=max_workspace_size_bytes, 56 | precision_mode=precision_mode, 57 | minimum_segment_size=minimum_segment_size, 58 | is_dynamic_op=True, # this is needed for FPN 59 | maximum_cached_engines=maximum_cached_engines) 60 | end_time = time.time() 61 | print("graph_size(MB)(native_tf): %.1f" % (float(graph_size)/(1<<20))) 62 | print("graph_size(MB)(trt): %.1f" % ( 63 | float(len(frozen_graph.SerializeToString()))/(1<<20))) 64 | print("num_nodes(native_tf): %d" % num_nodes) 65 | print("num_nodes(tftrt_total): %d" % len(frozen_graph.node)) 66 | print("num_nodes(trt_only): %d" % len( 67 | [1 for n in frozen_graph.node if str(n.op) == "TRTEngineOp"])) 68 | print("time(s) (trt_conversion): %.4f" % (end_time - start_time)) 69 | with open(args.newpbfile, "wb") as f: 70 | f.write(frozen_graph.SerializeToString()) 71 | -------------------------------------------------------------------------------- /tensorrt_optimize_tf1.15.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Given the tensorflow frozen graph, use TensorRT to optimize, 3 | get a new frozen graph.""" 4 | 5 | from __future__ import print_function 6 | 7 | import argparse 8 | import time 9 | import tensorflow as tf 10 | from tensorflow.python.compiler.tensorrt import trt_convert as trt 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("pbfile") 14 | parser.add_argument("newpbfile") 15 | parser.add_argument("--precision_mode", default="FP32", 16 | help="FP32, FP16, or INT8") 17 | 18 | 19 | # parameter 20 | # https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html 21 | if __name__ == "__main__": 22 | args = parser.parse_args() 23 | 24 | # not sure what these do, so leave them default 25 | #max_batch_size = 1 26 | #minimum_segment_size = 2 # smaller the faster? 5 -60? 
27 | #max_workspace_size_bytes = 1 << 32 28 | #maximum_cached_engines = 1 29 | 30 | output_names = [ 31 | "final_boxes", 32 | "final_labels", 33 | "final_probs", 34 | "fpn_box_feat", 35 | ] 36 | 37 | tf_config = tf.ConfigProto() 38 | tf_config.gpu_options.allow_growth = True 39 | 40 | with tf.Graph().as_default() as tf_graph: 41 | with tf.Session(config=tf_config) as tf_sess: 42 | with tf.gfile.GFile(args.pbfile, "rb") as f: 43 | frozen_graph = tf.GraphDef() 44 | frozen_graph.ParseFromString(f.read()) 45 | 46 | converter = trt.TrtGraphConverter( 47 | input_graph_def=frozen_graph, 48 | nodes_blacklist=output_names, 49 | is_dynamic_op=False, 50 | precision_mode=args.precision_mode) #output nodes 51 | trt_graph = converter.convert() 52 | #converter.save(args.newpbfile) 53 | 54 | 55 | with open(args.newpbfile, "wb") as f: 56 | f.write(trt_graph.SerializeToString()) 57 | -------------------------------------------------------------------------------- /test_reid.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Test person_reid and vehicle reid model""" 3 | 4 | import argparse 5 | import os 6 | from glob import glob 7 | import numpy as np 8 | 9 | from torchreid.feature_extractor import FeatureExtractor 10 | from torchreid.distance import compute_distance_matrix 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("query_img") 14 | parser.add_argument("test_img_prefix") 15 | parser.add_argument("--gpuid", default=0, type=int, 16 | help="gpu id") 17 | parser.add_argument("--vehicle_reid_model", default=None) 18 | parser.add_argument("--person_reid_model", default=None) 19 | parser.add_argument("--p_model_name", default="osnet_x1_0") 20 | 21 | 22 | if __name__ == "__main__": 23 | args = parser.parse_args() 24 | 25 | if args.person_reid_model is not None: 26 | extractor = FeatureExtractor( 27 | model_name=args.p_model_name, 28 | model_path=args.person_reid_model, 29 | device="cuda:%d" % args.gpuid 30 | ) 31 | 32 | elif args.vehicle_reid_model is not None: 33 | extractor = FeatureExtractor( 34 | model_name="resnet101", 35 | model_path=args.vehicle_reid_model, 36 | device="cuda:%d" % args.gpuid 37 | ) 38 | else: 39 | raise Exception("Please provide a model!") 40 | 41 | test_imgs = glob(args.test_img_prefix + "*") 42 | test_imgs.sort() 43 | assert test_imgs 44 | img_list = [args.query_img] + test_imgs 45 | print(img_list) 46 | features = extractor(img_list) 47 | 48 | print(features.shape) # [n, 512] 49 | # compute nxn distance 50 | distmat = compute_distance_matrix(features, features, metric='euclidean') 51 | np.set_printoptions(suppress=True, precision=3) 52 | print(distmat.cpu().numpy()) 53 | 54 | -------------------------------------------------------------------------------- /tester.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # tester, given the config with model path 3 | 4 | 5 | import tensorflow as tf 6 | 7 | 8 | class Tester(): 9 | def __init__(self,models,config,add_mask=True): 10 | self.config = config 11 | self.models = models 12 | 13 | # infereence out: 14 | self.final_boxes = [model.final_boxes for model in models] 15 | # [R] 16 | self.final_labels = [model.final_labels for model in models] 17 | self.final_probs = [model.final_probs for model in models] 18 | 19 | if config.add_act: 20 | if config.act_v2: 21 | self.act_single_boxes = [model.act_single_boxes for model in models] 22 | self.act_single_label_logits = [model.act_single_label_logits for 
model in models] 23 | else: 24 | self.act_final_boxes = [model.act_final_boxes for model in models] 25 | # [R] 26 | self.act_final_labels = [model.act_final_labels for model in models] 27 | self.act_final_probs = [model.act_final_probs for model in models] 28 | 29 | self.small_object = False 30 | if config.use_small_object_head: 31 | self.small_object = True 32 | if self.small_object: 33 | # infereence out: 34 | self.so_final_boxes = [model.so_final_boxes for model in models] 35 | # [R] 36 | self.so_final_labels = [model.so_final_labels for model in models] 37 | self.so_final_probs = [model.so_final_probs for model in models] 38 | 39 | self.add_mask = add_mask 40 | 41 | if add_mask: 42 | # [R,14,14] 43 | self.final_masks = [model.final_masks for model in models] 44 | 45 | 46 | def step(self,sess,batch): 47 | config = self.config 48 | # give one batch of Dataset, use model to get the result, 49 | assert isinstance(sess,tf.Session) 50 | batchIdxs,batch_datas = batch 51 | #assert len(batch_datas) == len(self.models) # there may be less data in the end 52 | num_input = len(batch_datas) # use this to cap the model input 53 | 54 | feed_dict = {} 55 | 56 | for _,batch_data,model in zip(range(num_input),batch_datas,self.models): 57 | feed_dict.update(model.get_feed_dict(batch_data,is_train=False)) 58 | 59 | sess_input = [] 60 | if self.add_mask: 61 | for _,boxes,labels,probs,masks in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.final_masks): 62 | sess_input+=[boxes,labels,probs,masks] 63 | else: 64 | if self.small_object: 65 | for _,boxes,labels,probs,so_boxes, so_labels, so_probs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.so_final_boxes,self.so_final_labels,self.so_final_probs): 66 | sess_input+=[boxes,labels,probs,so_boxes,so_labels,so_probs] 67 | else: 68 | for _,boxes,labels,probs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs): 69 | sess_input+=[boxes,labels,probs] 70 | 71 | if config.add_act: 72 | sess_input = [] 73 | if config.act_v2: 74 | for _,boxes,labels,probs,actsingleboxes,actsinglelabels in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.act_single_boxes,self.act_single_label_logits): 75 | sess_input+=[boxes,labels,probs,actsingleboxes,actsinglelabels] 76 | else: 77 | for _,boxes,labels,probs,actboxes,actlabels,actprobs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.act_final_boxes,self.act_final_labels,self.act_final_probs): 78 | sess_input+=[boxes,labels,probs,actboxes,actlabels,actprobs] 79 | 80 | 81 | #final_boxes, final_probs, final_labels, final_masks = sess.run([self.final_boxes, self.final_probs, self.final_labels, self.final_masks],feed_dict=feed_dict) 82 | #return final_boxes, final_probs, final_labels, final_masks 83 | outputs = sess.run(sess_input,feed_dict=feed_dict) 84 | if self.add_mask: 85 | pn = 4 86 | else: 87 | if self.small_object: 88 | pn = 6 89 | else: 90 | pn = 3 91 | if config.add_act: 92 | if config.act_v2: 93 | pn = 5 94 | else: 95 | pn = 6 96 | outputs = [outputs[i*pn:(i*pn+pn)] for i in range(num_input)] 97 | else: 98 | outputs = [outputs[i*pn:(i*pn+pn)] for i in range(num_input)] 99 | return outputs 100 | -------------------------------------------------------------------------------- /tmot/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/tmot/__init__.py -------------------------------------------------------------------------------- /tmot/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | 54 | -------------------------------------------------------------------------------- /tmot/matching.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | from scipy.spatial.distance import cdist 4 | import lap # 0.4.0 5 | 6 | from cython_bbox import bbox_overlaps as bbox_ious 7 | from . import kalman_filter 8 | 9 | def merge_matches(m1, m2, shape): 10 | O,P,Q = shape 11 | m1 = np.asarray(m1) 12 | m2 = np.asarray(m2) 13 | 14 | M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) 15 | M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) 16 | 17 | mask = M1*M2 18 | match = mask.nonzero() 19 | match = list(zip(match[0], match[1])) 20 | unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) 21 | unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) 22 | 23 | return match, unmatched_O, unmatched_Q 24 | 25 | 26 | def linear_assignment(cost_matrix, thresh): 27 | if cost_matrix.size == 0: 28 | return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) 29 | matches, unmatched_a, unmatched_b = [], [], [] 30 | cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) 31 | for ix, mx in enumerate(x): 32 | if mx >= 0: 33 | matches.append([ix, mx]) 34 | unmatched_a = np.where(x < 0)[0] 35 | unmatched_b = np.where(y < 0)[0] 36 | matches = np.asarray(matches) 37 | return matches, unmatched_a, unmatched_b 38 | 39 | 40 | def ious(atlbrs, btlbrs): 41 | """ 42 | Compute cost based on IoU 43 | :type atlbrs: list[tlbr] | np.ndarray 44 | :type atlbrs: list[tlbr] | np.ndarray 45 | 46 | :rtype ious np.ndarray 47 | """ 48 | ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) 49 | if ious.size == 0: 50 | return ious 51 | 52 | ious = bbox_ious( 53 | np.ascontiguousarray(atlbrs, dtype=np.float), 54 | np.ascontiguousarray(btlbrs, dtype=np.float) 55 | ) 56 | 57 | return ious 58 | 59 | 60 | def iou_distance(atracks, btracks): 61 | """ 62 | Compute cost based on IoU 63 | :type atracks: list[STrack] 64 | :type btracks: list[STrack] 65 | 66 | :rtype cost_matrix np.ndarray 67 | """ 68 | 69 
| if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): 70 | atlbrs = atracks 71 | btlbrs = btracks 72 | else: 73 | atlbrs = [track.tlbr for track in atracks] 74 | btlbrs = [track.tlbr for track in btracks] 75 | _ious = ious(atlbrs, btlbrs) 76 | cost_matrix = 1 - _ious 77 | 78 | return cost_matrix 79 | 80 | def embedding_distance(tracks, detections, metric='cosine'): 81 | """ 82 | :param tracks: list[STrack] 83 | :param detections: list[BaseTrack] 84 | :param metric: 85 | :return: cost_matrix np.ndarray 86 | """ 87 | 88 | cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) 89 | if cost_matrix.size == 0: 90 | return cost_matrix 91 | det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) 92 | track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float) 93 | cost_matrix = np.maximum(0.0, cdist(track_features, det_features)) # Nomalized features 94 | 95 | return cost_matrix 96 | 97 | 98 | def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98): 99 | if cost_matrix.size == 0: 100 | return cost_matrix 101 | gating_dim = 2 if only_position else 4 102 | gating_threshold = kalman_filter.chi2inv95[gating_dim] 103 | measurements = np.asarray([det.to_xyah() for det in detections]) 104 | for row, track in enumerate(tracks): 105 | gating_distance = kf.gating_distance( 106 | track.mean, track.covariance, measurements, only_position, metric='maha') 107 | cost_matrix[row, gating_distance > gating_threshold] = np.inf 108 | cost_matrix[row] = lambda_ * cost_matrix[row] + (1-lambda_)* gating_distance 109 | return cost_matrix 110 | -------------------------------------------------------------------------------- /torchreid/distance.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def compute_distance_matrix(input1, input2, metric='euclidean'): 7 | """A wrapper function for computing distance matrix. 8 | 9 | Args: 10 | input1 (torch.Tensor): 2-D feature matrix. 11 | input2 (torch.Tensor): 2-D feature matrix. 12 | metric (str, optional): "euclidean" or "cosine". 13 | Default is "euclidean". 14 | 15 | Returns: 16 | torch.Tensor: distance matrix. 17 | 18 | Examples:: 19 | >>> from torchreid import metrics 20 | >>> input1 = torch.rand(10, 2048) 21 | >>> input2 = torch.rand(100, 2048) 22 | >>> distmat = metrics.compute_distance_matrix(input1, input2) 23 | >>> distmat.size() # (10, 100) 24 | """ 25 | # check input 26 | assert isinstance(input1, torch.Tensor) 27 | assert isinstance(input2, torch.Tensor) 28 | assert input1.dim() == 2, 'Expected 2-D tensor, but got {}-D'.format( 29 | input1.dim() 30 | ) 31 | assert input2.dim() == 2, 'Expected 2-D tensor, but got {}-D'.format( 32 | input2.dim() 33 | ) 34 | assert input1.size(1) == input2.size(1) 35 | 36 | if metric == 'euclidean': 37 | distmat = euclidean_squared_distance(input1, input2) 38 | elif metric == 'cosine': 39 | distmat = cosine_distance(input1, input2) 40 | else: 41 | raise ValueError( 42 | 'Unknown distance metric: {}. ' 43 | 'Please choose either "euclidean" or "cosine"'.format(metric) 44 | ) 45 | 46 | return distmat 47 | 48 | 49 | def euclidean_squared_distance(input1, input2): 50 | """Computes euclidean squared distance. 51 | 52 | Args: 53 | input1 (torch.Tensor): 2-D feature matrix. 
54 | input2 (torch.Tensor): 2-D feature matrix. 55 | 56 | Returns: 57 | torch.Tensor: distance matrix. 58 | """ 59 | m, n = input1.size(0), input2.size(0) 60 | mat1 = torch.pow(input1, 2).sum(dim=1, keepdim=True).expand(m, n) 61 | mat2 = torch.pow(input2, 2).sum(dim=1, keepdim=True).expand(n, m).t() 62 | distmat = mat1 + mat2 63 | distmat.addmm_(input1, input2.t(), beta=1, alpha=-2) 64 | return distmat 65 | 66 | 67 | def cosine_distance(input1, input2): 68 | """Computes cosine distance. 69 | 70 | Args: 71 | input1 (torch.Tensor): 2-D feature matrix. 72 | input2 (torch.Tensor): 2-D feature matrix. 73 | 74 | Returns: 75 | torch.Tensor: distance matrix. 76 | """ 77 | input1_normed = F.normalize(input1, p=2, dim=1) 78 | input2_normed = F.normalize(input2, p=2, dim=1) 79 | distmat = 1 - torch.mm(input1_normed, input2_normed.t()) 80 | return distmat 81 | -------------------------------------------------------------------------------- /torchreid/feature_extractor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import torch 4 | import torchvision.transforms as T 5 | from PIL import Image 6 | 7 | from .models import build_model 8 | 9 | import warnings 10 | import os.path as osp 11 | import pickle 12 | from functools import partial 13 | from collections import OrderedDict 14 | 15 | def check_isfile(fpath): 16 | """Checks if the given path is a file. 17 | 18 | Args: 19 | fpath (str): file path. 20 | 21 | Returns: 22 | bool 23 | """ 24 | isfile = osp.isfile(fpath) 25 | if not isfile: 26 | warnings.warn('No file found at "{}"'.format(fpath)) 27 | return isfile 28 | 29 | def load_pretrained_weights(model, weight_path): 30 | r"""Loads pretrianed weights to model. 31 | 32 | Features:: 33 | - Incompatible layers (unmatched in name or size) will be ignored. 34 | - Can automatically deal with keys containing "module.". 35 | 36 | Args: 37 | model (nn.Module): network model. 38 | weight_path (str): path to pretrained weights. 39 | 40 | Examples:: 41 | >>> from torchreid.utils import load_pretrained_weights 42 | >>> weight_path = 'log/my_model/model-best.pth.tar' 43 | >>> load_pretrained_weights(model, weight_path) 44 | """ 45 | checkpoint = load_checkpoint(weight_path) 46 | if 'state_dict' in checkpoint: 47 | state_dict = checkpoint['state_dict'] 48 | else: 49 | state_dict = checkpoint 50 | 51 | model_dict = model.state_dict() 52 | new_state_dict = OrderedDict() 53 | matched_layers, discarded_layers = [], [] 54 | 55 | for k, v in state_dict.items(): 56 | if k.startswith('module.'): 57 | k = k[7:] # discard module. 58 | 59 | if k in model_dict and model_dict[k].size() == v.size(): 60 | new_state_dict[k] = v 61 | matched_layers.append(k) 62 | else: 63 | discarded_layers.append(k) 64 | 65 | model_dict.update(new_state_dict) 66 | model.load_state_dict(model_dict) 67 | 68 | if len(matched_layers) == 0: 69 | warnings.warn( 70 | 'The pretrained weights "{}" cannot be loaded, ' 71 | 'please check the key names manually ' 72 | '(** ignored and continue **)'.format(weight_path) 73 | ) 74 | #else: 75 | #print( 76 | # 'Successfully loaded pretrained weights from "{}"'. 77 | # format(weight_path) 78 | #) 79 | #if len(discarded_layers) > 0: 80 | # print( 81 | # '** The following layers are discarded ' 82 | # 'due to unmatched keys or layer size: {}'. 83 | # format(discarded_layers) 84 | # ) 85 | 86 | def load_checkpoint(fpath): 87 | r"""Loads checkpoint. 
88 | 89 | ``UnicodeDecodeError`` can be well handled, which means 90 | python2-saved files can be read from python3. 91 | 92 | Args: 93 | fpath (str): path to checkpoint. 94 | 95 | Returns: 96 | dict 97 | 98 | Examples:: 99 | >>> from torchreid.utils import load_checkpoint 100 | >>> fpath = 'log/my_model/model.pth.tar-10' 101 | >>> checkpoint = load_checkpoint(fpath) 102 | """ 103 | if fpath is None: 104 | raise ValueError('File path is None') 105 | if not osp.exists(fpath): 106 | raise FileNotFoundError('File is not found at "{}"'.format(fpath)) 107 | map_location = None if torch.cuda.is_available() else 'cpu' 108 | try: 109 | checkpoint = torch.load(fpath, map_location=map_location) 110 | except UnicodeDecodeError: 111 | pickle.load = partial(pickle.load, encoding="latin1") 112 | pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") 113 | checkpoint = torch.load( 114 | fpath, pickle_module=pickle, map_location=map_location 115 | ) 116 | except Exception: 117 | print('Unable to load checkpoint from "{}"'.format(fpath)) 118 | raise 119 | return checkpoint 120 | 121 | class FeatureExtractor(object): 122 | """A simple API for feature extraction. 123 | 124 | FeatureExtractor can be used like a python function, which 125 | accepts input of the following types: 126 | - a list of strings (image paths) 127 | - a list of numpy.ndarray each with shape (H, W, C) 128 | - a single string (image path) 129 | - a single numpy.ndarray with shape (H, W, C) 130 | - a torch.Tensor with shape (B, C, H, W) or (C, H, W) 131 | 132 | Returned is a torch tensor with shape (B, D) where D is the 133 | feature dimension. 134 | 135 | Args: 136 | model_name (str): model name. 137 | model_path (str): path to model weights. 138 | image_size (sequence or int): image height and width. 139 | pixel_mean (list): pixel mean for normalization. 140 | pixel_std (list): pixel std for normalization. 141 | pixel_norm (bool): whether to normalize pixels. 142 | device (str): 'cpu' or 'cuda' (could be specific gpu devices). 143 | verbose (bool): show model details. 
144 | 145 | Examples:: 146 | 147 | from torchreid.utils import FeatureExtractor 148 | 149 | extractor = FeatureExtractor( 150 | model_name='osnet_x1_0', 151 | model_path='a/b/c/model.pth.tar', 152 | device='cuda' 153 | ) 154 | 155 | image_list = [ 156 | 'a/b/c/image001.jpg', 157 | 'a/b/c/image002.jpg', 158 | 'a/b/c/image003.jpg', 159 | 'a/b/c/image004.jpg', 160 | 'a/b/c/image005.jpg' 161 | ] 162 | 163 | features = extractor(image_list) 164 | print(features.shape) # output (5, 512) 165 | """ 166 | 167 | def __init__( 168 | self, 169 | model_name='', 170 | model_path='', 171 | image_size=(256, 128), # (h, w) 172 | pixel_mean=[0.485, 0.456, 0.406], 173 | pixel_std=[0.229, 0.224, 0.225], 174 | pixel_norm=True, 175 | device='cuda', 176 | verbose=True 177 | ): 178 | # Build model 179 | model = build_model( 180 | model_name, 181 | num_classes=1, 182 | pretrained=False, 183 | use_gpu=device.startswith('cuda') 184 | ) 185 | model.eval() 186 | 187 | if model_path and check_isfile(model_path): 188 | load_pretrained_weights(model, model_path) 189 | 190 | # Build transform functions 191 | transforms = [] 192 | transforms += [T.Resize(image_size)] 193 | transforms += [T.ToTensor()] 194 | if pixel_norm: 195 | transforms += [T.Normalize(mean=pixel_mean, std=pixel_std)] 196 | preprocess = T.Compose(transforms) 197 | 198 | to_pil = T.ToPILImage() 199 | 200 | device = torch.device(device) 201 | model.to(device) 202 | 203 | # Class attributes 204 | self.model = model 205 | self.preprocess = preprocess 206 | self.to_pil = to_pil 207 | self.device = device 208 | 209 | def __call__(self, input): 210 | if isinstance(input, list): 211 | images = [] 212 | 213 | for element in input: 214 | if isinstance(element, str): 215 | image = Image.open(element).convert('RGB') 216 | 217 | elif isinstance(element, np.ndarray): 218 | image = self.to_pil(element) 219 | 220 | else: 221 | raise TypeError( 222 | 'Type of each element must belong to [str | numpy.ndarray]' 223 | ) 224 | 225 | image = self.preprocess(image) 226 | images.append(image) 227 | 228 | images = torch.stack(images, dim=0) 229 | images = images.to(self.device) 230 | 231 | elif isinstance(input, str): 232 | image = Image.open(input).convert('RGB') 233 | image = self.preprocess(image) 234 | images = image.unsqueeze(0).to(self.device) 235 | 236 | elif isinstance(input, np.ndarray): 237 | image = self.to_pil(input) 238 | image = self.preprocess(image) 239 | images = image.unsqueeze(0).to(self.device) 240 | 241 | elif isinstance(input, torch.Tensor): 242 | if input.dim() == 3: 243 | input = input.unsqueeze(0) 244 | images = input.to(self.device) 245 | 246 | else: 247 | raise NotImplementedError 248 | 249 | with torch.no_grad(): 250 | features = self.model(images) 251 | 252 | return features 253 | -------------------------------------------------------------------------------- /torchreid/models/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import torch 3 | 4 | from .pcb import * 5 | from .mlfn import * 6 | from .hacnn import * 7 | from .osnet import * 8 | from .senet import * 9 | from .mudeep import * 10 | from .nasnet import * 11 | from .resnet import * 12 | from .densenet import * 13 | from .xception import * 14 | from .osnet_ain import * 15 | from .resnetmid import * 16 | from .shufflenet import * 17 | from .squeezenet import * 18 | from .inceptionv4 import * 19 | from .mobilenetv2 import * 20 | from .resnet_ibn_a import * 21 | from .resnet_ibn_b import * 22 | from 
.shufflenetv2 import * 23 | from .inceptionresnetv2 import * 24 | 25 | __model_factory = { 26 | # image classification models 27 | 'resnet18': resnet18, 28 | 'resnet34': resnet34, 29 | 'resnet50': resnet50, 30 | 'resnet101': resnet101, 31 | 'resnet152': resnet152, 32 | 'resnext50_32x4d': resnext50_32x4d, 33 | 'resnext101_32x8d': resnext101_32x8d, 34 | 'resnet50_fc512': resnet50_fc512, 35 | 'se_resnet50': se_resnet50, 36 | 'se_resnet50_fc512': se_resnet50_fc512, 37 | 'se_resnet101': se_resnet101, 38 | 'se_resnext50_32x4d': se_resnext50_32x4d, 39 | 'se_resnext101_32x4d': se_resnext101_32x4d, 40 | 'densenet121': densenet121, 41 | 'densenet169': densenet169, 42 | 'densenet201': densenet201, 43 | 'densenet161': densenet161, 44 | 'densenet121_fc512': densenet121_fc512, 45 | 'inceptionresnetv2': inceptionresnetv2, 46 | 'inceptionv4': inceptionv4, 47 | 'xception': xception, 48 | 'resnet50_ibn_a': resnet50_ibn_a, 49 | 'resnet50_ibn_b': resnet50_ibn_b, 50 | # lightweight models 51 | 'nasnsetmobile': nasnetamobile, 52 | 'mobilenetv2_x1_0': mobilenetv2_x1_0, 53 | 'mobilenetv2_x1_4': mobilenetv2_x1_4, 54 | 'shufflenet': shufflenet, 55 | 'squeezenet1_0': squeezenet1_0, 56 | 'squeezenet1_0_fc512': squeezenet1_0_fc512, 57 | 'squeezenet1_1': squeezenet1_1, 58 | 'shufflenet_v2_x0_5': shufflenet_v2_x0_5, 59 | 'shufflenet_v2_x1_0': shufflenet_v2_x1_0, 60 | 'shufflenet_v2_x1_5': shufflenet_v2_x1_5, 61 | 'shufflenet_v2_x2_0': shufflenet_v2_x2_0, 62 | # reid-specific models 63 | 'mudeep': MuDeep, 64 | 'resnet50mid': resnet50mid, 65 | 'hacnn': HACNN, 66 | 'pcb_p6': pcb_p6, 67 | 'pcb_p4': pcb_p4, 68 | 'mlfn': mlfn, 69 | 'osnet_x1_0': osnet_x1_0, 70 | 'osnet_x0_75': osnet_x0_75, 71 | 'osnet_x0_5': osnet_x0_5, 72 | 'osnet_x0_25': osnet_x0_25, 73 | 'osnet_ibn_x1_0': osnet_ibn_x1_0, 74 | 'osnet_ain_x1_0': osnet_ain_x1_0 75 | } 76 | 77 | 78 | def show_avai_models(): 79 | """Displays available models. 80 | 81 | Examples:: 82 | >>> from torchreid import models 83 | >>> models.show_avai_models() 84 | """ 85 | print(list(__model_factory.keys())) 86 | 87 | 88 | def build_model( 89 | name, num_classes, loss='softmax', pretrained=True, use_gpu=True 90 | ): 91 | """A function wrapper for building a model. 92 | 93 | Args: 94 | name (str): model name. 95 | num_classes (int): number of training identities. 96 | loss (str, optional): loss function to optimize the model. Currently 97 | supports "softmax" and "triplet". Default is "softmax". 98 | pretrained (bool, optional): whether to load ImageNet-pretrained weights. 99 | Default is True. 100 | use_gpu (bool, optional): whether to use gpu. Default is True. 101 | 102 | Returns: 103 | nn.Module 104 | 105 | Examples:: 106 | >>> from torchreid import models 107 | >>> model = models.build_model('resnet50', 751, loss='softmax') 108 | """ 109 | avai_models = list(__model_factory.keys()) 110 | if name not in avai_models: 111 | raise KeyError( 112 | 'Unknown model: {}. Must be one of {}'.format(name, avai_models) 113 | ) 114 | return __model_factory[name]( 115 | num_classes=num_classes, 116 | loss=loss, 117 | pretrained=pretrained, 118 | use_gpu=use_gpu 119 | ) 120 | -------------------------------------------------------------------------------- /torchreid/models/mudeep.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | __all__ = ['MuDeep'] 7 | 8 | 9 | class ConvBlock(nn.Module): 10 | """Basic convolutional block. 
11 | 12 | convolution + batch normalization + relu. 13 | 14 | Args: 15 | in_c (int): number of input channels. 16 | out_c (int): number of output channels. 17 | k (int or tuple): kernel size. 18 | s (int or tuple): stride. 19 | p (int or tuple): padding. 20 | """ 21 | 22 | def __init__(self, in_c, out_c, k, s, p): 23 | super(ConvBlock, self).__init__() 24 | self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) 25 | self.bn = nn.BatchNorm2d(out_c) 26 | 27 | def forward(self, x): 28 | return F.relu(self.bn(self.conv(x))) 29 | 30 | 31 | class ConvLayers(nn.Module): 32 | """Preprocessing layers.""" 33 | 34 | def __init__(self): 35 | super(ConvLayers, self).__init__() 36 | self.conv1 = ConvBlock(3, 48, k=3, s=1, p=1) 37 | self.conv2 = ConvBlock(48, 96, k=3, s=1, p=1) 38 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 39 | 40 | def forward(self, x): 41 | x = self.conv1(x) 42 | x = self.conv2(x) 43 | x = self.maxpool(x) 44 | return x 45 | 46 | 47 | class MultiScaleA(nn.Module): 48 | """Multi-scale stream layer A (Sec.3.1)""" 49 | 50 | def __init__(self): 51 | super(MultiScaleA, self).__init__() 52 | self.stream1 = nn.Sequential( 53 | ConvBlock(96, 96, k=1, s=1, p=0), 54 | ConvBlock(96, 24, k=3, s=1, p=1), 55 | ) 56 | self.stream2 = nn.Sequential( 57 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 58 | ConvBlock(96, 24, k=1, s=1, p=0), 59 | ) 60 | self.stream3 = ConvBlock(96, 24, k=1, s=1, p=0) 61 | self.stream4 = nn.Sequential( 62 | ConvBlock(96, 16, k=1, s=1, p=0), 63 | ConvBlock(16, 24, k=3, s=1, p=1), 64 | ConvBlock(24, 24, k=3, s=1, p=1), 65 | ) 66 | 67 | def forward(self, x): 68 | s1 = self.stream1(x) 69 | s2 = self.stream2(x) 70 | s3 = self.stream3(x) 71 | s4 = self.stream4(x) 72 | y = torch.cat([s1, s2, s3, s4], dim=1) 73 | return y 74 | 75 | 76 | class Reduction(nn.Module): 77 | """Reduction layer (Sec.3.1)""" 78 | 79 | def __init__(self): 80 | super(Reduction, self).__init__() 81 | self.stream1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 82 | self.stream2 = ConvBlock(96, 96, k=3, s=2, p=1) 83 | self.stream3 = nn.Sequential( 84 | ConvBlock(96, 48, k=1, s=1, p=0), 85 | ConvBlock(48, 56, k=3, s=1, p=1), 86 | ConvBlock(56, 64, k=3, s=2, p=1), 87 | ) 88 | 89 | def forward(self, x): 90 | s1 = self.stream1(x) 91 | s2 = self.stream2(x) 92 | s3 = self.stream3(x) 93 | y = torch.cat([s1, s2, s3], dim=1) 94 | return y 95 | 96 | 97 | class MultiScaleB(nn.Module): 98 | """Multi-scale stream layer B (Sec.3.1)""" 99 | 100 | def __init__(self): 101 | super(MultiScaleB, self).__init__() 102 | self.stream1 = nn.Sequential( 103 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 104 | ConvBlock(256, 256, k=1, s=1, p=0), 105 | ) 106 | self.stream2 = nn.Sequential( 107 | ConvBlock(256, 64, k=1, s=1, p=0), 108 | ConvBlock(64, 128, k=(1, 3), s=1, p=(0, 1)), 109 | ConvBlock(128, 256, k=(3, 1), s=1, p=(1, 0)), 110 | ) 111 | self.stream3 = ConvBlock(256, 256, k=1, s=1, p=0) 112 | self.stream4 = nn.Sequential( 113 | ConvBlock(256, 64, k=1, s=1, p=0), 114 | ConvBlock(64, 64, k=(1, 3), s=1, p=(0, 1)), 115 | ConvBlock(64, 128, k=(3, 1), s=1, p=(1, 0)), 116 | ConvBlock(128, 128, k=(1, 3), s=1, p=(0, 1)), 117 | ConvBlock(128, 256, k=(3, 1), s=1, p=(1, 0)), 118 | ) 119 | 120 | def forward(self, x): 121 | s1 = self.stream1(x) 122 | s2 = self.stream2(x) 123 | s3 = self.stream3(x) 124 | s4 = self.stream4(x) 125 | return s1, s2, s3, s4 126 | 127 | 128 | class Fusion(nn.Module): 129 | """Saliency-based learning fusion layer (Sec.3.2)""" 130 | 131 | def __init__(self): 132 | super(Fusion, 
self).__init__() 133 | self.a1 = nn.Parameter(torch.rand(1, 256, 1, 1)) 134 | self.a2 = nn.Parameter(torch.rand(1, 256, 1, 1)) 135 | self.a3 = nn.Parameter(torch.rand(1, 256, 1, 1)) 136 | self.a4 = nn.Parameter(torch.rand(1, 256, 1, 1)) 137 | 138 | # We add an average pooling layer to reduce the spatial dimension 139 | # of feature maps, which differs from the original paper. 140 | self.avgpool = nn.AvgPool2d(kernel_size=4, stride=4, padding=0) 141 | 142 | def forward(self, x1, x2, x3, x4): 143 | s1 = self.a1.expand_as(x1) * x1 144 | s2 = self.a2.expand_as(x2) * x2 145 | s3 = self.a3.expand_as(x3) * x3 146 | s4 = self.a4.expand_as(x4) * x4 147 | y = self.avgpool(s1 + s2 + s3 + s4) 148 | return y 149 | 150 | 151 | class MuDeep(nn.Module): 152 | """Multiscale deep neural network. 153 | 154 | Reference: 155 | Qian et al. Multi-scale Deep Learning Architectures 156 | for Person Re-identification. ICCV 2017. 157 | 158 | Public keys: 159 | - ``mudeep``: Multiscale deep neural network. 160 | """ 161 | 162 | def __init__(self, num_classes, loss='softmax', **kwargs): 163 | super(MuDeep, self).__init__() 164 | self.loss = loss 165 | 166 | self.block1 = ConvLayers() 167 | self.block2 = MultiScaleA() 168 | self.block3 = Reduction() 169 | self.block4 = MultiScaleB() 170 | self.block5 = Fusion() 171 | 172 | # Due to this fully connected layer, input image has to be fixed 173 | # in shape, i.e. (3, 256, 128), such that the last convolutional feature 174 | # maps are of shape (256, 16, 8). If input shape is changed, 175 | # the input dimension of this layer has to be changed accordingly. 176 | self.fc = nn.Sequential( 177 | nn.Linear(256 * 16 * 8, 4096), 178 | nn.BatchNorm1d(4096), 179 | nn.ReLU(), 180 | ) 181 | self.classifier = nn.Linear(4096, num_classes) 182 | self.feat_dim = 4096 183 | 184 | def featuremaps(self, x): 185 | x = self.block1(x) 186 | x = self.block2(x) 187 | x = self.block3(x) 188 | x = self.block4(x) 189 | x = self.block5(*x) 190 | return x 191 | 192 | def forward(self, x): 193 | x = self.featuremaps(x) 194 | x = x.view(x.size(0), -1) 195 | x = self.fc(x) 196 | y = self.classifier(x) 197 | 198 | if not self.training: 199 | return x 200 | 201 | if self.loss == 'softmax': 202 | return y 203 | elif self.loss == 'triplet': 204 | return y, x 205 | else: 206 | raise KeyError('Unsupported loss: {}'.format(self.loss)) 207 | -------------------------------------------------------------------------------- /torchreid/models/shufflenet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import 2 | import torch 3 | import torch.utils.model_zoo as model_zoo 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | __all__ = ['shufflenet'] 8 | 9 | model_urls = { 10 | # training epoch = 90, top1 = 61.8 11 | 'imagenet': 12 | 'https://mega.nz/#!RDpUlQCY!tr_5xBEkelzDjveIYBBcGcovNCOrgfiJO9kiidz9fZM', 13 | } 14 | 15 | 16 | class ChannelShuffle(nn.Module): 17 | 18 | def __init__(self, num_groups): 19 | super(ChannelShuffle, self).__init__() 20 | self.g = num_groups 21 | 22 | def forward(self, x): 23 | b, c, h, w = x.size() 24 | n = c // self.g 25 | # reshape 26 | x = x.view(b, self.g, n, h, w) 27 | # transpose 28 | x = x.permute(0, 2, 1, 3, 4).contiguous() 29 | # flatten 30 | x = x.view(b, c, h, w) 31 | return x 32 | 33 | 34 | class Bottleneck(nn.Module): 35 | 36 | def __init__( 37 | self, 38 | in_channels, 39 | out_channels, 40 | stride, 41 | num_groups, 42 | group_conv1x1=True 43 | ): 44 | super(Bottleneck, 
self).__init__() 45 | assert stride in [1, 2], 'Warning: stride must be either 1 or 2' 46 | self.stride = stride 47 | mid_channels = out_channels // 4 48 | if stride == 2: 49 | out_channels -= in_channels 50 | # group conv is not applied to first conv1x1 at stage 2 51 | num_groups_conv1x1 = num_groups if group_conv1x1 else 1 52 | self.conv1 = nn.Conv2d( 53 | in_channels, 54 | mid_channels, 55 | 1, 56 | groups=num_groups_conv1x1, 57 | bias=False 58 | ) 59 | self.bn1 = nn.BatchNorm2d(mid_channels) 60 | self.shuffle1 = ChannelShuffle(num_groups) 61 | self.conv2 = nn.Conv2d( 62 | mid_channels, 63 | mid_channels, 64 | 3, 65 | stride=stride, 66 | padding=1, 67 | groups=mid_channels, 68 | bias=False 69 | ) 70 | self.bn2 = nn.BatchNorm2d(mid_channels) 71 | self.conv3 = nn.Conv2d( 72 | mid_channels, out_channels, 1, groups=num_groups, bias=False 73 | ) 74 | self.bn3 = nn.BatchNorm2d(out_channels) 75 | if stride == 2: 76 | self.shortcut = nn.AvgPool2d(3, stride=2, padding=1) 77 | 78 | def forward(self, x): 79 | out = F.relu(self.bn1(self.conv1(x))) 80 | out = self.shuffle1(out) 81 | out = self.bn2(self.conv2(out)) 82 | out = self.bn3(self.conv3(out)) 83 | if self.stride == 2: 84 | res = self.shortcut(x) 85 | out = F.relu(torch.cat([res, out], 1)) 86 | else: 87 | out = F.relu(x + out) 88 | return out 89 | 90 | 91 | # configuration of (num_groups: #out_channels) based on Table 1 in the paper 92 | cfg = { 93 | 1: [144, 288, 576], 94 | 2: [200, 400, 800], 95 | 3: [240, 480, 960], 96 | 4: [272, 544, 1088], 97 | 8: [384, 768, 1536], 98 | } 99 | 100 | 101 | class ShuffleNet(nn.Module): 102 | """ShuffleNet. 103 | 104 | Reference: 105 | Zhang et al. ShuffleNet: An Extremely Efficient Convolutional Neural 106 | Network for Mobile Devices. CVPR 2018. 107 | 108 | Public keys: 109 | - ``shufflenet``: ShuffleNet (groups=3). 
110 | """ 111 | 112 | def __init__(self, num_classes, loss='softmax', num_groups=3, **kwargs): 113 | super(ShuffleNet, self).__init__() 114 | self.loss = loss 115 | 116 | self.conv1 = nn.Sequential( 117 | nn.Conv2d(3, 24, 3, stride=2, padding=1, bias=False), 118 | nn.BatchNorm2d(24), 119 | nn.ReLU(), 120 | nn.MaxPool2d(3, stride=2, padding=1), 121 | ) 122 | 123 | self.stage2 = nn.Sequential( 124 | Bottleneck( 125 | 24, cfg[num_groups][0], 2, num_groups, group_conv1x1=False 126 | ), 127 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups), 128 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups), 129 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups), 130 | ) 131 | 132 | self.stage3 = nn.Sequential( 133 | Bottleneck(cfg[num_groups][0], cfg[num_groups][1], 2, num_groups), 134 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 135 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 136 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 137 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 138 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 139 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 140 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups), 141 | ) 142 | 143 | self.stage4 = nn.Sequential( 144 | Bottleneck(cfg[num_groups][1], cfg[num_groups][2], 2, num_groups), 145 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups), 146 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups), 147 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups), 148 | ) 149 | 150 | self.classifier = nn.Linear(cfg[num_groups][2], num_classes) 151 | self.feat_dim = cfg[num_groups][2] 152 | 153 | def forward(self, x): 154 | x = self.conv1(x) 155 | x = self.stage2(x) 156 | x = self.stage3(x) 157 | x = self.stage4(x) 158 | x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), -1) 159 | 160 | if not self.training: 161 | return x 162 | 163 | y = self.classifier(x) 164 | 165 | if self.loss == 'softmax': 166 | return y 167 | elif self.loss == 'triplet': 168 | return y, x 169 | else: 170 | raise KeyError('Unsupported loss: {}'.format(self.loss)) 171 | 172 | 173 | def init_pretrained_weights(model, model_url): 174 | """Initializes model with pretrained weights. 175 | 176 | Layers that don't match with pretrained layers in name or size are kept unchanged. 
177 | """ 178 | pretrain_dict = model_zoo.load_url(model_url) 179 | model_dict = model.state_dict() 180 | pretrain_dict = { 181 | k: v 182 | for k, v in pretrain_dict.items() 183 | if k in model_dict and model_dict[k].size() == v.size() 184 | } 185 | model_dict.update(pretrain_dict) 186 | model.load_state_dict(model_dict) 187 | 188 | 189 | def shufflenet(num_classes, loss='softmax', pretrained=True, **kwargs): 190 | model = ShuffleNet(num_classes, loss, **kwargs) 191 | if pretrained: 192 | # init_pretrained_weights(model, model_urls['imagenet']) 193 | import warnings 194 | warnings.warn( 195 | 'The imagenet pretrained weights need to be manually downloaded from {}' 196 | .format(model_urls['imagenet']) 197 | ) 198 | return model 199 | -------------------------------------------------------------------------------- /torchreid/models/squeezenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code source: https://github.com/pytorch/vision 3 | """ 4 | from __future__ import division, absolute_import 5 | import torch 6 | import torch.nn as nn 7 | import torch.utils.model_zoo as model_zoo 8 | 9 | __all__ = ['squeezenet1_0', 'squeezenet1_1', 'squeezenet1_0_fc512'] 10 | 11 | model_urls = { 12 | 'squeezenet1_0': 13 | 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth', 14 | 'squeezenet1_1': 15 | 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth', 16 | } 17 | 18 | 19 | class Fire(nn.Module): 20 | 21 | def __init__( 22 | self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes 23 | ): 24 | super(Fire, self).__init__() 25 | self.inplanes = inplanes 26 | self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) 27 | self.squeeze_activation = nn.ReLU(inplace=True) 28 | self.expand1x1 = nn.Conv2d( 29 | squeeze_planes, expand1x1_planes, kernel_size=1 30 | ) 31 | self.expand1x1_activation = nn.ReLU(inplace=True) 32 | self.expand3x3 = nn.Conv2d( 33 | squeeze_planes, expand3x3_planes, kernel_size=3, padding=1 34 | ) 35 | self.expand3x3_activation = nn.ReLU(inplace=True) 36 | 37 | def forward(self, x): 38 | x = self.squeeze_activation(self.squeeze(x)) 39 | return torch.cat( 40 | [ 41 | self.expand1x1_activation(self.expand1x1(x)), 42 | self.expand3x3_activation(self.expand3x3(x)) 43 | ], 1 44 | ) 45 | 46 | 47 | class SqueezeNet(nn.Module): 48 | """SqueezeNet. 49 | 50 | Reference: 51 | Iandola et al. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters 52 | and< 0.5 MB model size. arXiv:1602.07360. 53 | 54 | Public keys: 55 | - ``squeezenet1_0``: SqueezeNet (version=1.0). 56 | - ``squeezenet1_1``: SqueezeNet (version=1.1). 57 | - ``squeezenet1_0_fc512``: SqueezeNet (version=1.0) + FC. 
58 | """ 59 | 60 | def __init__( 61 | self, 62 | num_classes, 63 | loss, 64 | version=1.0, 65 | fc_dims=None, 66 | dropout_p=None, 67 | **kwargs 68 | ): 69 | super(SqueezeNet, self).__init__() 70 | self.loss = loss 71 | self.feature_dim = 512 72 | 73 | if version not in [1.0, 1.1]: 74 | raise ValueError( 75 | 'Unsupported SqueezeNet version {version}:' 76 | '1.0 or 1.1 expected'.format(version=version) 77 | ) 78 | 79 | if version == 1.0: 80 | self.features = nn.Sequential( 81 | nn.Conv2d(3, 96, kernel_size=7, stride=2), 82 | nn.ReLU(inplace=True), 83 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 84 | Fire(96, 16, 64, 64), 85 | Fire(128, 16, 64, 64), 86 | Fire(128, 32, 128, 128), 87 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 88 | Fire(256, 32, 128, 128), 89 | Fire(256, 48, 192, 192), 90 | Fire(384, 48, 192, 192), 91 | Fire(384, 64, 256, 256), 92 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 93 | Fire(512, 64, 256, 256), 94 | ) 95 | else: 96 | self.features = nn.Sequential( 97 | nn.Conv2d(3, 64, kernel_size=3, stride=2), 98 | nn.ReLU(inplace=True), 99 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 100 | Fire(64, 16, 64, 64), 101 | Fire(128, 16, 64, 64), 102 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 103 | Fire(128, 32, 128, 128), 104 | Fire(256, 32, 128, 128), 105 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), 106 | Fire(256, 48, 192, 192), 107 | Fire(384, 48, 192, 192), 108 | Fire(384, 64, 256, 256), 109 | Fire(512, 64, 256, 256), 110 | ) 111 | 112 | self.global_avgpool = nn.AdaptiveAvgPool2d(1) 113 | self.fc = self._construct_fc_layer(fc_dims, 512, dropout_p) 114 | self.classifier = nn.Linear(self.feature_dim, num_classes) 115 | 116 | self._init_params() 117 | 118 | def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): 119 | """Constructs fully connected layer 120 | 121 | Args: 122 | fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed 123 | input_dim (int): input dimension 124 | dropout_p (float): dropout probability, if None, dropout is unused 125 | """ 126 | if fc_dims is None: 127 | self.feature_dim = input_dim 128 | return None 129 | 130 | assert isinstance( 131 | fc_dims, (list, tuple) 132 | ), 'fc_dims must be either list or tuple, but got {}'.format( 133 | type(fc_dims) 134 | ) 135 | 136 | layers = [] 137 | for dim in fc_dims: 138 | layers.append(nn.Linear(input_dim, dim)) 139 | layers.append(nn.BatchNorm1d(dim)) 140 | layers.append(nn.ReLU(inplace=True)) 141 | if dropout_p is not None: 142 | layers.append(nn.Dropout(p=dropout_p)) 143 | input_dim = dim 144 | 145 | self.feature_dim = fc_dims[-1] 146 | 147 | return nn.Sequential(*layers) 148 | 149 | def _init_params(self): 150 | for m in self.modules(): 151 | if isinstance(m, nn.Conv2d): 152 | nn.init.kaiming_normal_( 153 | m.weight, mode='fan_out', nonlinearity='relu' 154 | ) 155 | if m.bias is not None: 156 | nn.init.constant_(m.bias, 0) 157 | elif isinstance(m, nn.BatchNorm2d): 158 | nn.init.constant_(m.weight, 1) 159 | nn.init.constant_(m.bias, 0) 160 | elif isinstance(m, nn.BatchNorm1d): 161 | nn.init.constant_(m.weight, 1) 162 | nn.init.constant_(m.bias, 0) 163 | elif isinstance(m, nn.Linear): 164 | nn.init.normal_(m.weight, 0, 0.01) 165 | if m.bias is not None: 166 | nn.init.constant_(m.bias, 0) 167 | 168 | def forward(self, x): 169 | f = self.features(x) 170 | v = self.global_avgpool(f) 171 | v = v.view(v.size(0), -1) 172 | 173 | if self.fc is not None: 174 | v = self.fc(v) 175 | 176 | if not self.training: 
177 | return v 178 | 179 | y = self.classifier(v) 180 | 181 | if self.loss == 'softmax': 182 | return y 183 | elif self.loss == 'triplet': 184 | return y, v 185 | else: 186 | raise KeyError('Unsupported loss: {}'.format(self.loss)) 187 | 188 | 189 | def init_pretrained_weights(model, model_url): 190 | """Initializes model with pretrained weights. 191 | 192 | Layers that don't match with pretrained layers in name or size are kept unchanged. 193 | """ 194 | pretrain_dict = model_zoo.load_url(model_url, map_location=None) 195 | model_dict = model.state_dict() 196 | pretrain_dict = { 197 | k: v 198 | for k, v in pretrain_dict.items() 199 | if k in model_dict and model_dict[k].size() == v.size() 200 | } 201 | model_dict.update(pretrain_dict) 202 | model.load_state_dict(model_dict) 203 | 204 | 205 | def squeezenet1_0(num_classes, loss='softmax', pretrained=True, **kwargs): 206 | model = SqueezeNet( 207 | num_classes, loss, version=1.0, fc_dims=None, dropout_p=None, **kwargs 208 | ) 209 | if pretrained: 210 | init_pretrained_weights(model, model_urls['squeezenet1_0']) 211 | return model 212 | 213 | 214 | def squeezenet1_0_fc512( 215 | num_classes, loss='softmax', pretrained=True, **kwargs 216 | ): 217 | model = SqueezeNet( 218 | num_classes, 219 | loss, 220 | version=1.0, 221 | fc_dims=[512], 222 | dropout_p=None, 223 | **kwargs 224 | ) 225 | if pretrained: 226 | init_pretrained_weights(model, model_urls['squeezenet1_0']) 227 | return model 228 | 229 | 230 | def squeezenet1_1(num_classes, loss='softmax', pretrained=True, **kwargs): 231 | model = SqueezeNet( 232 | num_classes, loss, version=1.1, fc_dims=None, dropout_p=None, **kwargs 233 | ) 234 | if pretrained: 235 | init_pretrained_weights(model, model_urls['squeezenet1_1']) 236 | return model 237 | -------------------------------------------------------------------------------- /track_to_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # convert the detections or mtsc txt file into json for each frame 3 | import sys, os, json, argparse 4 | 5 | from tqdm import tqdm 6 | 7 | from class_ids import targetClass2id_new_nopo, targetAct2id_bupt 8 | 9 | targetClass2id = targetClass2id_new_nopo 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("filepath", help="all txt files for each video") 13 | parser.add_argument("videonamelst") 14 | parser.add_argument("cat_name") 15 | parser.add_argument("despath", help="despath/videoname_F_08d.json, index from 0") 16 | parser.add_argument("--bupt_exp", action="store_true") 17 | 18 | 19 | if __name__ == "__main__": 20 | args = parser.parse_args() 21 | 22 | videonames = [os.path.splitext(os.path.basename(line.strip()))[0] for line in open(args.videonamelst,"r").readlines()] 23 | 24 | if not os.path.exists(args.despath): 25 | os.makedirs(args.despath) 26 | 27 | if args.bupt_exp: 28 | targetClass2id = targetAct2id_bupt 29 | 30 | for videoname in tqdm(videonames, ascii=True): 31 | detfile = os.path.join(args.filepath, "%s.txt"%videoname) 32 | 33 | data = {} # frame -> boxes 34 | 35 | for line in open(detfile, "r").readlines(): 36 | # note the frameIdx start from 1 37 | frameIdx, track_id, left, top, width, height, conf, _, _, _ = line.strip().split(",") 38 | frameIdx = int(frameIdx) - 1 # note here I made a mistake, gt is 1-indexed, but out obj_tracking output is 0-indexed 39 | 40 | track_id = int(track_id) 41 | 42 | box = [float(left), float(top), float(width), float(height)] 43 | 44 | #if not data.has_key(frameIdx): 45 | if not 
frameIdx in data: 46 | data[frameIdx] = [] 47 | data[frameIdx].append({ 48 | "category_id": targetClass2id[args.cat_name], 49 | "cat_name": args.cat_name, 50 | "score":float(round(float(conf), 7)), 51 | "bbox": box, 52 | "segmentation": None, 53 | "trackId": track_id 54 | }) 55 | 56 | for frameIndex in data: 57 | 58 | annofile = os.path.join(args.despath, "%s_F_%08d.json"%(videoname, frameIndex)) 59 | 60 | with open(annofile, "w") as f: 61 | json.dump(data[frameIndex], f) 62 | 63 | -------------------------------------------------------------------------------- /tracks_to_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # convert the detections or mtsc txt file into json for each frame 3 | import sys, os, json, argparse 4 | 5 | from tqdm import tqdm 6 | from glob import glob 7 | 8 | from class_ids import targetClass2id_new_nopo, targetAct2id_bupt 9 | 10 | targetClass2id = targetClass2id_new_nopo 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("filepath", help="all txt files for each video") 14 | parser.add_argument("videonamelst") 15 | parser.add_argument("despath", help="despath/videoname_F_08d.json, index from 0") 16 | parser.add_argument("--bupt_exp", action="store_true") 17 | 18 | 19 | if __name__ == "__main__": 20 | args = parser.parse_args() 21 | 22 | # leave the .mp4 23 | videonames = [os.path.basename(line.strip()) for line in open(args.videonamelst,"r").readlines()] 24 | 25 | if not os.path.exists(args.despath): 26 | os.makedirs(args.despath) 27 | 28 | if args.bupt_exp: 29 | targetClass2id = targetAct2id_bupt 30 | 31 | for videoname in tqdm(videonames, ascii=True): 32 | #detfile = os.path.join(args.filepath, "%s.txt"%videoname) 33 | detfiles = glob(os.path.join(args.filepath, videoname, "*", "%s.txt" % (os.path.splitext(videoname)[0]))) 34 | 35 | data = {} # frame -> boxes 36 | for detfile in detfiles: 37 | #cat_name = detfile.split("/")[-2] # this does not work under windows 38 | # 1. norm the path, in windows "/" will be converted to "\" 39 | detfile = os.path.normpath(detfile) 40 | # 2. split the path using os specific separator 41 | cat_name = detfile.split(os.sep)[-2] 42 | for line in open(detfile, "r").readlines(): 43 | # note the frameIdx start from 1? 44 | frameIdx, track_id, left, top, width, height, conf, _, _, _ = line.strip().split(",") 45 | # (?) 
note here I made a mistake, gt is 1-indexed, but our obj_tracking output is 0-indexed 46 | #frameIdx = int(frameIdx) - 1 47 | frameIdx = int(frameIdx) 48 | 49 | track_id = int(track_id) 50 | 51 | box = [float(left), float(top), float(width), float(height)] 52 | 53 | #if not data.has_key(frameIdx): 54 | if not frameIdx in data: 55 | data[frameIdx] = [] 56 | data[frameIdx].append({ 57 | "category_id": targetClass2id[cat_name], 58 | "cat_name": cat_name, 59 | "score": float(round(float(conf), 7)), 60 | "bbox": box, 61 | "segmentation": None, 62 | "trackId": track_id 63 | }) 64 | 65 | for frameIndex in data: 66 | 67 | annofile = os.path.join(args.despath, "%s_F_%08d.json"%(os.path.splitext(videoname)[0], frameIndex)) 68 | 69 | with open(annofile, "w") as f: 70 | json.dump(data[frameIndex], f) 71 | 72 | -------------------------------------------------------------------------------- /vis_tracks.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # given MOT track file path, visualize into videos 3 | import argparse 4 | import cv2 5 | import random 6 | import os 7 | import sys 8 | 9 | from tqdm import tqdm 10 | from glob import glob 11 | import numpy as np 12 | 13 | import matplotlib.colors as mcolors # to get a list of colors 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("track_path") 17 | parser.add_argument("frame_path") 18 | parser.add_argument("video_name_lst") 19 | parser.add_argument("out_path") 20 | parser.add_argument("--show_only_global", action="store_true") 21 | 22 | def hex_color_to_rgb(s): 23 | r = int(s[1:3], 16) 24 | g = int(s[3:5], 16) 25 | b = int(s[5:7], 16) 26 | return (r, g, b) # (0-255) 27 | 28 | def load_track_file(file_path, cat_names): 29 | 30 | track_data = {} # frame_id -> {cat_name: } 31 | video_name = os.path.splitext(os.path.basename(file_path))[0] 32 | for cat_name in cat_names: 33 | track_file_path = os.path.join(file_path, cat_name, video_name + ".txt") 34 | data = [] 35 | with open(track_file_path, "r") as f: 36 | for line in f: 37 | frame_idx, track_id, left, top, width, height, conf, gid, _, _ = line.strip().split(",") 38 | data.append([frame_idx, track_id, left, top, width, height, conf, gid]) 39 | 40 | data = np.array(data, dtype="float32") # [N, 8] 41 | frame_ids = np.unique(data[:, 0]).tolist() 42 | 43 | for frame_id in frame_ids: 44 | if frame_id not in track_data: 45 | track_data[frame_id] = {} 46 | track_data[frame_id][cat_name] = data[data[:, 0] == frame_id, :] 47 | return track_data 48 | 49 | 50 | def get_or_create_color_from_dict(key, color_dict, color_list): 51 | if key not in color_dict: 52 | this_color = color_list.pop() 53 | 54 | color_dict[key] = hex_color_to_rgb(color_name_to_hex[this_color]) 55 | # recycle it 56 | color_list.insert(0, this_color) 57 | color = color_dict[key] 58 | return color 59 | 60 | def draw_boxes(im, boxes, labels=None, colors=None, font_scale=0.6, 61 | font_thick=1, box_thick=1, bottom_text=False, offsets=None): 62 | if not boxes: 63 | return im 64 | 65 | boxes = np.asarray(boxes, dtype="int") 66 | 67 | FONT = cv2.FONT_HERSHEY_SIMPLEX 68 | FONT_SCALE = font_scale 69 | 70 | if labels is not None: 71 | assert len(labels) == len(boxes), "{} != {}".format(len(labels), len(boxes)) 72 | if colors is not None: 73 | assert len(labels) == len(colors) 74 | 75 | areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1) 76 | sorted_inds = np.argsort(-areas) # draw large ones first 77 | assert areas.min() > 0, areas.min() 78 | 79 | im = 
im.copy() 80 | 81 | for i in sorted_inds: 82 | box = boxes[i, :] 83 | if box[0] < 0 or box[1] < 0 or box[2] < 0 or box[3] < 0: 84 | continue 85 | 86 | color = (218, 218, 218) 87 | if colors is not None: 88 | color = colors[i] 89 | 90 | best_color = color 91 | 92 | lineh = 2 # for box enlarging, replace with text height if there is label 93 | if labels is not None: 94 | label = labels[i] 95 | 96 | # find the best placement for the text 97 | ((linew, lineh), _) = cv2.getTextSize(label, FONT, FONT_SCALE, font_thick) 98 | bottom_left = [box[0] + 1, box[1] - 0.3 * lineh] 99 | top_left = [box[0] + 1, box[1] - 1.3 * lineh] 100 | if top_left[1] < 0: # out of image 101 | top_left[1] = box[3] - 1.3 * lineh 102 | bottom_left[1] = box[3] - 0.3 * lineh 103 | 104 | textbox = [int(top_left[0]), int(top_left[1]), 105 | int(top_left[0] + linew), int(top_left[1] + lineh)] 106 | #textbox.clip_by_shape(im.shape[:2]) 107 | 108 | offset = 0 109 | if offsets is not None: 110 | offset = lineh * offsets[i] 111 | 112 | if bottom_text: 113 | cv2.putText(im, label, (box[0] + 2, box[3] - 4 + offset), 114 | FONT, FONT_SCALE, color=best_color, thickness=font_thick) 115 | else: 116 | cv2.putText(im, label, (textbox[0], textbox[3] - offset), 117 | FONT, FONT_SCALE, color=best_color, thickness=font_thick) 118 | 119 | # expand the box on y axis for overlapping results 120 | offset = 0 121 | if offsets is not None: 122 | offset = lineh * offsets[i] 123 | box[0] -= box_thick * offsets[i] + 1 124 | box[2] += box_thick * offsets[i] + 1 125 | if bottom_text: 126 | box[1] -= box_thick * offsets[i] + 1 127 | box[3] += offset 128 | else: 129 | box[3] += box_thick * offsets[i] + 1 130 | box[1] -= offset 131 | 132 | cv2.rectangle(im, (box[0], box[1]), (box[2], box[3]), 133 | color=best_color, thickness=box_thick) 134 | return im 135 | 136 | color_name_to_hex = mcolors.CSS4_COLORS.copy() # {'whitesmoke': '#F5F5F5', ...} 137 | if __name__ == "__main__": 138 | args = parser.parse_args() 139 | 140 | color_name_list = sorted(list(color_name_to_hex.keys()))[:] 141 | random.seed(69) 142 | random.shuffle(color_name_list) 143 | 144 | color_assign = {} # global track id, obj -> name 145 | 146 | if not os.path.exists(args.out_path): 147 | os.makedirs(args.out_path) 148 | 149 | video_names = [os.path.basename(line.strip()) # with .avi 150 | for line in open(args.video_name_lst, "r").readlines()] 151 | for video_name in tqdm(video_names): 152 | video_name_no_appendix = os.path.splitext(video_name)[0] 153 | frames = glob(os.path.join(args.frame_path, video_name_no_appendix, "*.jpg")) 154 | frames.sort() 155 | 156 | # frame_id -> {cat_name: ..} 157 | track_data = load_track_file( 158 | os.path.join(args.track_path, video_name), 159 | ["Person", "Vehicle"]) 160 | 161 | target_file = os.path.join(args.out_path, "%s.mp4" % video_name_no_appendix) 162 | 163 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 164 | fps = 30.0 165 | video_writer = cv2.VideoWriter(target_file, fourcc, fps, (1920, 1080), True) 166 | 167 | count_global_ids = {} 168 | for frame in frames: 169 | filename = os.path.splitext(os.path.basename(frame))[0] 170 | frame_id = int(filename.split("_F_")[-1]) 171 | 172 | boxes = [] 173 | labels = [] 174 | box_colors = [] 175 | if frame_id in track_data: 176 | this_track_data = track_data[frame_id] 177 | for cat_name in this_track_data: 178 | for box_data in this_track_data[cat_name]: # [N, 8] 179 | # get color and label 180 | local_track_id = box_data[1] 181 | global_track_id = box_data[7] 182 | if global_track_id != -1: 183 | color_key = 
(global_track_id, cat_name) 184 | count_global_ids[color_key] = 1 185 | track_id = "g%s" % global_track_id 186 | else: 187 | if args.show_only_global: 188 | continue 189 | color_key = (video_name, local_track_id, cat_name) 190 | track_id = local_track_id 191 | color = get_or_create_color_from_dict( 192 | color_key, color_assign, color_name_list) 193 | box_colors.append(color) 194 | 195 | conf = box_data[6] 196 | conf_str = "" 197 | if conf != 1.: 198 | conf_str = "%.2f" % conf 199 | labels.append("%s #%s %s"%(cat_name, track_id, conf_str)) 200 | 201 | tlwh = box_data[2:6] 202 | tlbr = [tlwh[0], tlwh[1], tlwh[0] + tlwh[2], tlwh[1] + tlwh[3]] 203 | boxes.append(tlbr) 204 | 205 | new_im = cv2.imread(frame, cv2.IMREAD_COLOR) 206 | new_im = draw_boxes(new_im, boxes, labels, box_colors, font_scale=0.8, 207 | font_thick=2, box_thick=2, bottom_text=False) 208 | # write the frame idx 209 | new_im = cv2.putText(new_im, "# %d" % frame_id, 210 | (0, 20), cv2.FONT_HERSHEY_SIMPLEX, 211 | 1, (0, 255, 0), 2) 212 | # the frames might not be 1920x1080 213 | new_im = cv2.resize(new_im, (1920, 1080)) 214 | video_writer.write(new_im) 215 | 216 | video_writer.release() 217 | tqdm.write("%s has %s global tracks:%s" % ( 218 | video_name, len(count_global_ids), count_global_ids.keys())) 219 | cv2.destroyAllWindows() 220 | --------------------------------------------------------------------------------
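
For a quick sanity check of the drawing utility above, the following is a minimal sketch (not part of the repo) that exercises `draw_boxes()` from `vis_tracks.py` on a synthetic frame. It mirrors the call made in the script's main loop; the box coordinates, labels, colors, and output filename are made-up example values.

```
# Minimal usage sketch for draw_boxes() from vis_tracks.py, run from the repo root.
# All boxes/labels/colors below are hypothetical illustration values.
import cv2
import numpy as np

from vis_tracks import draw_boxes

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)      # stand-in for a decoded video frame
boxes = [[100, 200, 300, 400], [500, 600, 900, 800]]   # tlbr boxes, as built in the main loop
labels = ["Person #g1 ", "Vehicle #3 0.87"]            # "<cat_name> #<track_id> <conf>" labels
colors = [(0, 255, 0), (0, 0, 255)]                    # one BGR color tuple per box

vis = draw_boxes(frame, boxes, labels, colors, font_scale=0.8,
                 font_thick=2, box_thick=2, bottom_text=False)
cv2.imwrite("draw_boxes_example.jpg", vis)             # hypothetical output path
```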