├── ACTIVITY_BOX.md
├── COMMANDS.md
├── LICENSE
├── README.md
├── SPEED.md
├── TRAINING.md
├── application_util
│   ├── __init__.py
│   ├── image_viewer.py
│   ├── preprocessing.py
│   └── visualization.py
├── check_and_vis_global_tracks.py
├── class_ids.py
├── datasets.py
├── deep_sort
│   ├── __init__.py
│   ├── detection.py
│   ├── iou_matching.py
│   ├── kalman_filter.py
│   ├── linear_assignment.py
│   ├── nn_matching.py
│   ├── track.py
│   ├── tracker.py
│   └── utils.py
├── deformable_helper.py
├── diva_io
│   ├── LICENSE
│   ├── README.md
│   ├── __init__.py
│   ├── annotation
│   │   ├── __init__.py
│   │   ├── converter.py
│   │   └── kf1.py
│   ├── docs
│   │   └── speed.md
│   ├── environment.yml
│   ├── utils
│   │   ├── __init__.py
│   │   └── log.py
│   └── video
│       ├── __init__.py
│       ├── frame.py
│       ├── reader.py
│       ├── speed_test.sh
│       └── test.py
├── efficientdet
│   ├── __init__.py
│   ├── anchors.py
│   ├── backbone
│   │   ├── __init__.py
│   │   ├── backbone_factory.py
│   │   ├── efficientnet_builder.py
│   │   ├── efficientnet_builder_test.py
│   │   ├── efficientnet_lite_builder.py
│   │   ├── efficientnet_lite_builder_test.py
│   │   ├── efficientnet_model.py
│   │   └── efficientnet_model_test.py
│   ├── dataloader.py
│   ├── efficientdet_arch.py
│   ├── hparams_config.py
│   ├── object_detection
│   │   ├── __init__.py
│   │   ├── argmax_matcher.py
│   │   ├── box_coder.py
│   │   ├── box_list.py
│   │   ├── faster_rcnn_box_coder.py
│   │   ├── matcher.py
│   │   ├── preprocessor.py
│   │   ├── region_similarity_calculator.py
│   │   ├── shape_utils.py
│   │   ├── target_assigner.py
│   │   └── tf_example_decoder.py
│   └── utils.py
├── efficientdet_wrapper.py
├── enqueuer.py
├── enqueuer_thread.py
├── eval.py
├── generate_anchors.py
├── generate_util_graph.py
├── get_frames_resize.py
├── images
│   ├── Person_vis_video.gif
│   ├── Vehicle_vis_video.gif
│   ├── actev-prizechallenge-06-2019.png
│   ├── inf_actev_0.49audc_02-2020.png
│   ├── multi-camera-reid.gif
│   ├── person_multi_reid.gif
│   ├── person_multi_reid2.gif
│   ├── util_log_b1partial.png
│   ├── util_log_b8multithread.png
│   └── vehicle_multi_reid.gif
├── main.py
├── models.py
├── multi_video_reid.py
├── nn.py
├── obj_detect_imgs.py
├── obj_detect_imgs_multi.py
├── obj_detect_imgs_multi_queuer.py
├── obj_detect_tracking.py
├── obj_detect_tracking_multi.py
├── obj_detect_tracking_multi_queuer.py
├── obj_detect_tracking_multi_queuer_tmot.py
├── single_video_reid.py
├── tensorrt_optimize.py
├── tensorrt_optimize_tf1.15.py
├── test_reid.py
├── tester.py
├── tmot
│   ├── __init__.py
│   ├── basetrack.py
│   ├── kalman_filter.py
│   ├── matching.py
│   └── multitracker.py
├── torchreid
│   ├── distance.py
│   ├── feature_extractor.py
│   └── models
│       ├── __init__.py
│       ├── densenet.py
│       ├── hacnn.py
│       ├── inceptionresnetv2.py
│       ├── inceptionv4.py
│       ├── mlfn.py
│       ├── mobilenetv2.py
│       ├── mudeep.py
│       ├── nasnet.py
│       ├── osnet.py
│       ├── osnet_ain.py
│       ├── pcb.py
│       ├── resnet.py
│       ├── resnet_ibn_a.py
│       ├── resnet_ibn_b.py
│       ├── resnetmid.py
│       ├── senet.py
│       ├── shufflenet.py
│       ├── shufflenetv2.py
│       ├── squeezenet.py
│       └── xception.py
├── track_to_json.py
├── tracks_to_json.py
├── trainer.py
├── utils.py
├── vis_json.py
├── vis_tracks.py
└── viz.py
/ACTIVITY_BOX.md:
--------------------------------------------------------------------------------
1 | # Frame-level Activity Detection Experiments
2 |
3 | ## Training
4 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to train on and extract all the frames into the following format: `training_frames/${videoname}/${videoname}_F_%08d.jpg` (a frame-extraction sketch follows).
5 |
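The repo includes `get_frames_resize.py`, which likely handles this step; below is a minimal, hypothetical OpenCV sketch of the expected layout in case you want to extract frames yourself (whether frame counting starts at 0 or 1 is an assumption here and must match your annotations):
```python
import os
import cv2

def extract_frames(video_path, out_root="training_frames"):
    """Write frames as ${out_root}/${videoname}/${videoname}_F_%08d.jpg."""
    videoname = os.path.splitext(os.path.basename(video_path))[0]
    out_dir = os.path.join(out_root, videoname)
    os.makedirs(out_dir, exist_ok=True)
    vcap = cv2.VideoCapture(video_path)
    frame_idx = 0  # starting index is an assumption; match it to your annotations
    while True:
        ok, frame = vcap.read()
        if not ok:
            break
        out_name = "%s_F_%08d.jpg" % (videoname, frame_idx)
        cv2.imwrite(os.path.join(out_dir, out_name), frame)
        frame_idx += 1
    vcap.release()
```
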
6 | - Put annotations into a single folder, one npz file per frame: `training_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format; here `actlabels` and `actboxes` are used during training (see the sketch after the download command):
7 | ```
8 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_012019_actgt_allsingle_npz.tar
9 | ```
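
For reference, a minimal sketch for inspecting one of the downloaded annotation files; the key names come from the description above, while the filename and the exact array shapes are placeholders/assumptions to verify against the data:
```python
import numpy as np

# placeholder filename; use any frame-level npz from the downloaded tarball
anno = np.load("training_annotations/SOME_VIDEO_F_00000001.npz")
print(anno.files)          # expect at least "actlabels" and "actboxes"
print(anno["actboxes"])    # per-box coordinates (assumed one row per box)
print(anno["actlabels"])   # per-box activity labels
```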
10 |
11 | - Prepare the file lists for the training and validation sets. We split off a small subset of the ActEV training set as our validation set; the official ActEV validation set is used for testing. You can download my file lists. Training set:
12 | ```
13 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minusminival_frames.lst
14 | ```
15 | Validation set:
16 | ```
17 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minival_frames.lst
18 | ```
19 | These file lists use absolute paths, so you will need to replace them with the correct paths on your machine.
20 |
21 | - Download MSCOCO pretrained model from Tensorpack:
22 | ```
23 | wget http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN1x.npz
24 | ```
25 |
26 | - Train the Actbox v1 model with 1 GPU:
27 | ```
28 | $ python main.py nothing training_frames --mode train --annopath v1-training_012019_actgt_allsingle_npz \
29 | --trainlst v1-training_minusminival_frames.lst --train_skip 5 --valframepath \
30 | v1-training_frames --vallst v1-training_minival_frames.lst --valannopath \
31 | v1-training_012019_actgt_allsingle_npz --outbasepath bupt_actboxexp_resnet101_dilation_classagnostic --modelname mrcnn101 --num_epochs 20 \
32 | --save_period 2500 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 10 \
33 | --bupt_exp --max_size 1920 --short_edge_size 1080 --init_lr 0.003 --use_cosine_schedule \
34 | --warm_up_steps 5000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 \
35 | --gpu 1 --is_fpn --im_batch_size 1 --flip_image --val_skip 20 --load_from \
36 | COCO-R101FPN-MaskRCNN-Standard.npz --skip_first_eval --best_first -1 --show_loss_period \
37 | 1000 --loss_me_step 50 --ignore_vars fastrcnn/outputs --wd 0.0001 --use_dilation \
38 | --use_frcnn_class_agnostic
39 | ```
40 | You can change `--gpu 4` and `--im_batch_size 4` (and maybe `--gpuid_start`) if you have a multi-GPU machine. Note that due to a [known bug](https://github.com/tensorflow/tensorflow/issues/23458) in TF 1.13, memory is allocated on all 4 GPUs even if you set `--gpu 2`. This is fixed in TF 1.14.0 (though it still takes some of GPU 0's memory). However, multi-GPU training on a subset of the GPUs (`--gpuid_start` larger than 0) fails since TF 1.13 according to [this issue](https://github.com/tensorflow/tensorflow/issues/27259).
41 |
42 | ## Testing
43 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to test on and extract all the frames into the following format: `validation_frames/${videoname}/${videoname}_F_%08d.jpg`.
44 |
45 | - Put annotations into a single folder, one npz file per frame: `testing_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format; here `actlabels` and `actboxes` are used:
46 | ```
47 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_012019_actgt_allsingle_npz.tar
48 | ```
49 |
50 | - Prepare the file list for testing. We use the official validation set as the testing set. You can download my file list:
51 | ```
52 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_frames.lst
53 | ```
54 | Again, you will need to replace these with the correct absolute paths.
55 |
56 | - Test the model by generating COCO-format JSON files:
57 | ```
58 | $ python main.py nothing v1-validate_frames.lst --mode forward --outbasepath \
59 | actbox_v1_test --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 10 --bupt_exp \
60 | --max_size 1920 --short_edge_size 1080 --rpn_test_post_nms_topk 1000 --gpu 1 --is_fpn \
61 | --im_batch_size 1 --load_from bupt_actboxexp_resnet101_dilation_classagnostic/ \
62 | mrcnn101/01/save-best/ --use_dilation --use_frcnn_class_agnostic --log
63 | ```
64 |
65 | - Evaluate:
66 | ```
67 | $ python eval.py v1-validate_frames.lst v1-validate_012019_actgt_allsingle_npz \
68 | actbox_v1_test --bupt_exp
69 | ```
70 |
71 | - Visualize:
72 | ```
73 | $ python vis_json.py v1-validate_videos.lst v1-validate_frames/ actbox_v1_test/ \
74 | actbox_v1_test_visbox --score_thres 0.7
75 | ```
76 |
77 | - Tracking:
78 | ```
79 | $ python obj_detect_tracking.py --model_path bupt_actboxexp_resnet101_dilation_classagnostic/mrcnn101/01/save-best/ --version 5 \
80 | --video_dir v1-validate_videos/ --video_lst_file v1-validate_videos.names.lst --out_dir \
81 | act_box_out --frame_gap 1 --get_tracking --tracking_dir act_track_out --min_confidence \
82 | 0.8 --tracking_objs Person-Vehicle,Pull,Riding,Talking,Transport_HeavyCarry,Vehicle-Turning,activity_carrying \
83 | --bupt_exp --num_class 10 --gpuid_start 0
84 | ```
85 |
--------------------------------------------------------------------------------
/COMMANDS.md:
--------------------------------------------------------------------------------
1 | # Example Commands
2 |
3 | ## 02-2020 0.49 pAUDC, 0.64 processing time
4 | ```
5 | $ python obj_detect_tracking.py \
6 | --model_path obj_coco_resnet50_partial_tfv1.14_1280x720_rpn300.pb \
7 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \
8 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \
9 | --is_load_from_pb --get_tracking \
10 | --tracking_objs Person,Vehicle --min_confidence 0.85 \
11 | --resnet50 --rpn_test_post_nms_topk 300 --max_size 1280 --short_edge_size 720 \
12 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \
13 | --max_cosine_distance 0.5 --nn_budget 5
14 | ```
15 | This is for processing AVI videos. For MP4 videos, run without `--use_lijun`.
16 | Add `--log_time_and_gpu` to get GPU utilization and time profile.
17 |
18 |
19 | ## 05-2020, added EfficientDet
20 | The [EfficientDet (CVPR 2020)](https://github.com/google/automl/tree/master/efficientdet) D7 model is reported to be more than 12 mAP better on COCO than the ResNet-50 FPN model we used.
21 |
22 | I have made the following changes based on the code from early May:
23 | + Added multi-level ROI align on the final detection boxes, since we need the FPN box features for deep-SORT tracking. Because one-stage detectors make box predictions at every feature level, I added a level index variable that records each box's feature level, so the surviving boxes can be efficiently backtracked to their original feature map to crop the features (see the sketch after this list).
24 | + Similar to the MaskRCNN model, I modified EfficientDet to run NMS on only a subset of the COCO classes (currently we only care about person and vehicle) to save computation.
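
For illustration only, a minimal NumPy sketch of this bookkeeping: each surviving box carries a recorded level index, which is used to backtrack to that level's feature map and crop a fixed-size feature for tracking. All names here (`crop_box_features`, `fpn_feats`) and the nearest-neighbor crop are hypothetical stand-ins, not the repo's actual TF implementation:
```python
import numpy as np

def crop_box_features(fpn_feats, boxes, box_levels, image_size, crop_size=7):
    """fpn_feats: {level: [H_l, W_l, C] array}; boxes: [N, 4] as (x1, y1, x2, y2)
    in image coordinates; box_levels: [N] level index recorded per box;
    image_size: (height, width)."""
    crops = []
    for box, level in zip(boxes, box_levels):
        feat = fpn_feats[int(level)]          # backtrack to the originating level
        h_l, w_l, _ = feat.shape
        x1, y1, x2, y2 = box
        # map image coordinates onto this level's feature grid
        fx1, fx2 = x1 / image_size[1] * w_l, x2 / image_size[1] * w_l
        fy1, fy2 = y1 / image_size[0] * h_l, y2 / image_size[0] * h_l
        # nearest-neighbor stand-in for ROI align: sample a crop_size x crop_size grid
        xs = np.clip(np.linspace(fx1, fx2, crop_size).astype(int), 0, w_l - 1)
        ys = np.clip(np.linspace(fy1, fy2, crop_size).astype(int), 0, h_l - 1)
        crops.append(feat[np.ix_(ys, xs)])    # [crop_size, crop_size, C]
    return np.stack(crops)                    # assumes at least one box
```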
25 |
26 |
27 | Example command \[[d0 model from early May](https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/models/efficientdet-d0.tar.gz)\]:
28 | ```
29 | $ python obj_detect_tracking.py \
30 | --model_path efficientdet-d0 \
31 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \
32 | --efficientdet_max_detection_topk 5000 \
33 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \
34 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \
35 | --get_tracking --tracking_objs Person,Vehicle --min_confidence 0.6 \
36 | --max_size 1280 --short_edge_size 720 \
37 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \
38 | --max_cosine_distance 0.5 --nn_budget 5
39 | ```
40 | This is for processing AVI videos. I have tried it with pyav==6.2.0. Install it by
41 | ```
42 | $ sudo apt-get install -y \
43 | libavformat-dev libavcodec-dev libavdevice-dev \
44 | libavutil-dev libswscale-dev libswresample-dev libavfilter-dev
45 | $ sudo pip install av==6.2.0
46 | ```
47 |
48 | For MP4 videos, run without `--use_lijun`.
49 | Add `--log_time_and_gpu` to get GPU utilization and time profile.
50 |
51 | Example command with a partial frozen graph \[[d0-TFv1.15](https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/models/efficientd0_tfv1.15_1280x720.pb)\] (slightly faster):
52 | ```
53 | $ python obj_detect_tracking.py \
54 | --model_path efficientd0_tfv1.15_1280x720.pb --is_load_from_pb \
55 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \
56 | --efficientdet_max_detection_topk 5000 \
57 | --video_dir videos --tracking_dir output/ --video_lst_file videos.lst \
58 | --version 2 --is_coco_model --use_partial_classes --frame_gap 8 \
59 | --get_tracking --tracking_objs Person,Vehicle --min_confidence 0.6 \
60 | --max_size 1280 --short_edge_size 720 \
61 | --use_lijun_video_loader --nms_max_overlap 0.85 --max_iou_distance 0.5 \
62 | --max_cosine_distance 0.5 --nn_budget 5
63 | ```
64 |
65 | [05/04/2020] Tried to optimize the frozen model with TensorRT by:
66 | ```
67 | $ python tensorrt_optimize_tf1.15.py efficientd0_tfv1.15_1280x720.pb \
68 | efficientd0_tfv1.15_1280x720_trt_fp16.pb --precision_mode FP16
69 | ```
70 | But it does not work:
71 | ```
72 | 2020-05-04 22:11:48.850233: F tensorflow/core/framework/op_kernel.cc:875] Check failed: mutable_output(index) == nullptr (0x7f82d4244ff0 vs. nullptr)
73 | Aborted (core dumped)
74 | ```
75 |
76 | Run object detection and visualization on images. This could be used to reproduce the official repo's tutorial output:
77 | ```
78 | $ python obj_detect_imgs.py --model_path efficientdet-d0 \
79 | --efficientdet_modelname efficientdet-d0 --is_efficientdet \
80 | --img_lst imgs.lst --out_dir test_d0_json \
81 | --visualize --vis_path test_d0_vis --vis_thres 0.4 \
82 | --max_size 1920 --short_edge_size 1080 \
83 | --efficientdet_max_detection_topk 5000
84 | ```
85 |
86 | ## 10-2020, comparing EfficientDet with MaskRCNN on video datasets
87 |
88 | 1. VIRAT
89 |
90 | | Models | COCO-validation-AP-80classes | VIRAT Person-Val-AP | VIRAT Vehicle-Val-AP | VIRAT Bike-Val-AP |
91 | | --- | --- | --- | --- | --- |
92 | | MaskRCNN, R50-FPN | 0.389 | 0.374 | 0.943 | 0.367 |
93 | | MaskRCNN, R101-FPN | 0.407 | 0.378 | 0.947 | 0.399 |
94 | | EfficientDet-d2 | 0.425 | 0.371 | 0.949 | 0.293 |
95 | | EfficientDet-d6 | 0.513 | 0.422 | 0.947 | 0.355 |
96 |
97 |
128 | 2. AVA-Kinetics
129 | | Models | COCO-validation-AP-80classes | AVA-Kinetics Train-Person-AP | AVA-Kinetics Val-Person-AP |
130 | | --- | --- | --- | --- |
131 | | MaskRCNN, R101-FPN | 0.407 | 0.664 | 0.682 |
132 | | EfficientDet-d2 | 0.425 | 0.650 | 0.680 |
133 | | EfficientDet-d6 | 0.513 | 0.623 | 0.658 |
134 |
135 |
157 | VIRAT consists mostly of small person boxes, while AVA-Kinetics has much bigger ones, so EfficientDet seems slightly better at detecting small persons. However, EfficientDet-d6 takes about 2.4x the inference time of MaskRCNN-R101-FPN.
158 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Junwei Liang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/TRAINING.md:
--------------------------------------------------------------------------------
1 | # CMU Object Detection & Tracking for Surveillance Video Activity Detection
2 |
3 | ## Training
4 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to train on and extract all the frames into the following format: `training_frames/${videoname}/${videoname}_F_%08d.jpg`.
5 |
6 | - Put annotations into a single folder, one npz file per frame: `training_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format; only `labels` and `boxes` are used during training (a sketch for writing such a file follows the download command):
7 | ```
8 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_012019_actgt_allsingle_npz.tar
9 | ```
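
Not part of the repo; a minimal sketch of writing one such annotation file. The filename, box convention (x1, y1, x2, y2 here), and label encoding are assumptions to be checked against the downloaded annotations:
```python
import numpy as np

# hypothetical values; verify the box convention and label encoding against
# the downloaded annotation npz files before generating your own
boxes = np.array([[100., 200., 180., 400.],
                  [300., 150., 500., 360.]], dtype="float32")
labels = np.array(["Person", "Vehicle"])

# one npz per frame, named exactly after the frame (placeholder name below)
np.savez("training_annotations/SOME_VIDEO_F_00000001.npz",
         boxes=boxes, labels=labels)
```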
10 |
11 | - Prepare the file lists for the training and validation sets. We split off a small subset of the ActEV training set as our validation set; the official ActEV validation set is used for testing. You can download my file lists. Training set:
12 | ```
13 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minusminival_frames.lst
14 | ```
15 | Validation set:
16 | ```
17 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-training_minival_frames.lst
18 | ```
19 | These file lists use absolute paths, so you will need to replace them with the correct paths on your machine; a one-off rewrite sketch follows.
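
Not part of the repo; a small sketch for rewriting the path prefix of a downloaded `.lst` file, assuming one absolute frame path per line (`OLD_PREFIX` and `NEW_PREFIX` are placeholders):
```python
# placeholders: the prefix used in the downloaded list and the one on your machine
OLD_PREFIX = "/original/absolute/prefix/"
NEW_PREFIX = "/path/to/your/training_frames/"

with open("v1-training_minusminival_frames.lst") as f:
    lines = [line.strip() for line in f if line.strip()]

with open("v1-training_minusminival_frames.fixed.lst", "w") as f:
    for line in lines:
        f.write(line.replace(OLD_PREFIX, NEW_PREFIX, 1) + "\n")
```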
20 |
21 | - Download MSCOCO pretrained model from Tensorpack:
22 | ```
23 | wget http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN1x.npz
24 | ```
25 |
26 | - Train the obj_v3 model with 1 GPU:
27 | ```
28 | $ python main.py nothing training_frames --mode train --annopath v1-training_012019_actgt_allsingle_npz \
29 | --trainlst v1-training_minusminival_frames.lst --train_skip 30 --valframepath v1-training_frames --vallst \
30 | v1-training_minival_frames.lst --valannopath v1-training_012019_actgt_allsingle_npz --outbasepath my_model \
31 | --modelname obj_v3 --num_epochs 15 --save_period 5000 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class \
32 | 15 --diva_class3 --max_size 1920 --short_edge_size 1080 --init_lr 0.006 --use_cosine_schedule --warm_up_steps \
33 | 10000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 --gpu 1 --is_fpn --im_batch_size 1 \
34 | --flip_image --load_from COCO-MaskRCNN-R101FPN1x.npz --skip_first_eval --best_first -1 --show_loss_period 1000 \
35 | --loss_me_step 50 --ignore_vars fastrcnn/outputs --wd 0.0001 --use_dilation --use_frcnn_class_agnostic
36 | ```
37 | You can change `--gpu 4` and `--im_batch_size 4` (and maybe `--gpuid_start`) if you have a multi-GPU machine. Note that due to a [known bug](https://github.com/tensorflow/tensorflow/issues/23458) in TF 1.13, memory is allocated on all 4 GPUs even if you set `--gpu 2`. This is fixed in TF 1.14.0 (though it still takes some of GPU 0's memory). However, multi-GPU training on a subset of the GPUs (`--gpuid_start` larger than 0) fails since TF 1.13 according to [this issue](https://github.com/tensorflow/tensorflow/issues/27259).
38 |
39 | - June 2020, finetune MaskRCNN person detection on AVA-Kinetics Dataset:
40 | ```
41 | $ python main.py nothing pack_ava_kinetics_keyframes --mode train --annopath ava_kinetics_person_box_anno/ --trainlst person_train.lst --valframepath pack_ava_kinetics_keyframes --vallst person_val.lst --valannopath ava_kinetics_person_box_anno/ --outbasepath maskrcnn_finetune --modelname maskrcnn_r101fpn --num_epochs 15 --save_period 5000 --rpn_batch_size 256 --frcnn_batch_size 512 --num_class 81 --is_coco_model --one_level_framepath --max_size 560 --short_edge_size 320 --init_lr 0.001 --use_cosine_schedule --warm_up_steps 10000 --optimizer momentum --rpn_test_post_nms_topk 1000 --freeze 0 --gpu 4 --is_fpn --im_batch_size 4 --flip_image --load_from COCO-MaskRCNN-R101FPN1x.npz --show_loss_period 1000 --loss_me_step 100 --wd 0.0001 --val_skip 10
42 | ```
43 |
44 | ## Testing
45 | - Download the videos from [ActEV](https://actev.nist.gov/) or the dataset you wish to test on and extract all the frames into the following format: `validation_frames/${videoname}/${videoname}_F_%08d.jpg`.
46 |
47 | - Put annotations into a single folder, one npz file per frame: `testing_annotations/${videoname}_F_%08d.npz`. The filename must match the frame name. You can download my processed annotations and check the data format; only `labels` and `boxes` are used:
48 | ```
49 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_012019_actgt_allsingle_npz.tar
50 | ```
51 |
52 | - Prepare the file list for testing. We use the official validation set as the testing set. You can download my file list:
53 | ```
54 | wget https://aladdin-eax.inf.cs.cmu.edu/shares/diva_obj_detect_models/v1-validate_frames.lst
55 | ```
56 | Again, you will need to replace these with the correct absolute paths.
57 |
58 | - Test the model by generating COCO-format JSON files:
59 | ```
60 | $ python main.py nothing v1-validate_frames.lst --mode forward --outbasepath test_jsons --rpn_batch_size 256 \
61 | --frcnn_batch_size 512 --num_class 15 --diva_class3 --max_size 1920 --short_edge_size 1080 \
62 | --rpn_test_post_nms_topk 1000 --gpu 1 --is_fpn --im_batch_size 1 --load_from my_model/obj_v3/01/save-best/ \
63 | --use_frcnn_class_agnostic --use_dilation
64 | ```
65 |
66 | - Evaluate:
67 | ```
68 | $ python eval.py v1-validate_frames.lst v1-validate_012019_actgt_allsingle_npz test_jsons/
69 | ```
70 |
71 |
--------------------------------------------------------------------------------
/application_util/__init__.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
--------------------------------------------------------------------------------
/application_util/preprocessing.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | import numpy as np
3 | import cv2
4 |
5 |
6 | def non_max_suppression(boxes, max_bbox_overlap, scores=None):
7 | """Suppress overlapping detections.
8 |
9 | Original code from [1]_ has been adapted to include confidence score.
10 |
11 | .. [1] http://www.pyimagesearch.com/2015/02/16/
12 | faster-non-maximum-suppression-python/
13 |
14 | Examples
15 | --------
16 |
17 | >>> boxes = [d.roi for d in detections]
18 | >>> scores = [d.confidence for d in detections]
19 | >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores)
20 | >>> detections = [detections[i] for i in indices]
21 |
22 | Parameters
23 | ----------
24 | boxes : ndarray
25 | Array of ROIs (x, y, width, height).
26 | max_bbox_overlap : float
27 | ROIs that overlap more than this value are suppressed.
28 | scores : Optional[array_like]
29 | Detector confidence score.
30 |
31 | Returns
32 | -------
33 | List[int]
34 | Returns indices of detections that have survived non-maxima suppression.
35 |
36 | """
37 | if len(boxes) == 0:
38 | return []
39 |
40 | boxes = boxes.astype(np.float)
41 | pick = []
42 |
43 | x1 = boxes[:, 0]
44 | y1 = boxes[:, 1]
45 | x2 = boxes[:, 2] + boxes[:, 0]
46 | y2 = boxes[:, 3] + boxes[:, 1]
47 |
48 | area = (x2 - x1 + 1) * (y2 - y1 + 1)
49 | if scores is not None:
50 | idxs = np.argsort(scores)
51 | else:
52 | idxs = np.argsort(y2)
53 |
54 | while len(idxs) > 0:
55 | last = len(idxs) - 1
56 | i = idxs[last]
57 | pick.append(i)
58 |
59 | xx1 = np.maximum(x1[i], x1[idxs[:last]])
60 | yy1 = np.maximum(y1[i], y1[idxs[:last]])
61 | xx2 = np.minimum(x2[i], x2[idxs[:last]])
62 | yy2 = np.minimum(y2[i], y2[idxs[:last]])
63 |
64 | w = np.maximum(0, xx2 - xx1 + 1)
65 | h = np.maximum(0, yy2 - yy1 + 1)
66 |
67 | overlap = (w * h) / area[idxs[:last]]
68 |
69 | idxs = np.delete(
70 | idxs, np.concatenate(
71 | ([last], np.where(overlap > max_bbox_overlap)[0])))
72 |
73 | return pick
74 |
--------------------------------------------------------------------------------
/application_util/visualization.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | import numpy as np
3 | import colorsys
4 | from .image_viewer import ImageViewer
5 |
6 |
7 | def create_unique_color_float(tag, hue_step=0.41):
8 | """Create a unique RGB color code for a given track id (tag).
9 |
10 | The color code is generated in HSV color space by moving along the
11 | hue angle and gradually changing the saturation.
12 |
13 | Parameters
14 | ----------
15 | tag : int
16 | The unique target identifying tag.
17 | hue_step : float
18 | Difference between two neighboring color codes in HSV space (more
19 | specifically, the distance in hue channel).
20 |
21 | Returns
22 | -------
23 | (float, float, float)
24 | RGB color code in range [0, 1]
25 |
26 | """
27 | h, v = (tag * hue_step) % 1, 1. - (int(tag * hue_step) % 4) / 5.
28 | r, g, b = colorsys.hsv_to_rgb(h, 1., v)
29 | return r, g, b
30 |
31 |
32 | def create_unique_color_uchar(tag, hue_step=0.41):
33 | """Create a unique RGB color code for a given track id (tag).
34 |
35 | The color code is generated in HSV color space by moving along the
36 | hue angle and gradually changing the saturation.
37 |
38 | Parameters
39 | ----------
40 | tag : int
41 | The unique target identifying tag.
42 | hue_step : float
43 | Difference between two neighboring color codes in HSV space (more
44 | specifically, the distance in hue channel).
45 |
46 | Returns
47 | -------
48 | (int, int, int)
49 | RGB color code in range [0, 255]
50 |
51 | """
52 | r, g, b = create_unique_color_float(tag, hue_step)
53 | return int(255*r), int(255*g), int(255*b)
54 |
55 |
56 | class NoVisualization(object):
57 | """
58 | A dummy visualization object that loops through all frames in a given
59 | sequence to update the tracker without performing any visualization.
60 | """
61 |
62 | def __init__(self, seq_info):
63 | self.frame_idx = seq_info["min_frame_idx"]
64 | self.last_idx = seq_info["max_frame_idx"]
65 |
66 | def set_image(self, image):
67 | pass
68 |
69 | def draw_groundtruth(self, track_ids, boxes):
70 | pass
71 |
72 | def draw_detections(self, detections):
73 | pass
74 |
75 | def draw_trackers(self, trackers):
76 | pass
77 |
78 | def run(self, frame_callback):
79 | while self.frame_idx <= self.last_idx:
80 | frame_callback(self, self.frame_idx)
81 | self.frame_idx += 1
82 |
83 |
84 | class Visualization(object):
85 | """
86 | This class shows tracking output in an OpenCV image viewer.
87 | """
88 |
89 | def __init__(self, seq_info, update_ms):
90 | image_shape = seq_info["image_size"][::-1]
91 | aspect_ratio = float(image_shape[1]) / image_shape[0]
92 | image_shape = 1024, int(aspect_ratio * 1024)
93 | self.viewer = ImageViewer(
94 | update_ms, image_shape, "Figure %s" % seq_info["sequence_name"])
95 | self.viewer.thickness = 2
96 | self.frame_idx = seq_info["min_frame_idx"]
97 | self.last_idx = seq_info["max_frame_idx"]
98 |
99 | def run(self, frame_callback):
100 | self.viewer.run(lambda: self._update_fun(frame_callback))
101 |
102 | def _update_fun(self, frame_callback):
103 | if self.frame_idx > self.last_idx:
104 | return False # Terminate
105 | frame_callback(self, self.frame_idx)
106 | self.frame_idx += 1
107 | return True
108 |
109 | def set_image(self, image):
110 | self.viewer.image = image
111 |
112 | def draw_groundtruth(self, track_ids, boxes):
113 | self.viewer.thickness = 2
114 | for track_id, box in zip(track_ids, boxes):
115 | self.viewer.color = create_unique_color_uchar(track_id)
116 | self.viewer.rectangle(*box.astype(np.int), label=str(track_id))
117 |
118 | def draw_detections(self, detections):
119 | self.viewer.thickness = 2
120 | self.viewer.color = 0, 0, 255
121 | for i, detection in enumerate(detections):
122 | self.viewer.rectangle(*detection.tlwh)
123 |
124 | def draw_trackers(self, tracks):
125 | self.viewer.thickness = 2
126 | for track in tracks:
127 | if not track.is_confirmed() or track.time_since_update > 0:
128 | continue
129 | self.viewer.color = create_unique_color_uchar(track.track_id)
130 | self.viewer.rectangle(
131 | *track.to_tlwh().astype(np.int), label=str(track.track_id))
132 | return self.viewer.image
133 | # self.viewer.gaussian(track.mean[:2], track.covariance[:2, :2],
134 | # label="%d" % track.track_id)
135 | #
136 |
--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # dataset object definition
3 | import cv2
4 | import os
5 | import logging
6 | import math
7 | import numpy as np
8 | from nn import resizeImage
9 |
10 | # dataset object need to implement the following function
11 | # get_sample(self, idx)
12 | # collect_batch(self, datalist)
13 | class ImageDataset(object):
14 | def __init__(self, cfg, split, imglst, annotations=None):
15 | """
16 | imglst: a file containing a list of absolute path to all the images
17 | """
18 | self.cfg = cfg # this should include short_edge_size, max_size, etc.
19 | self.split = split
20 | self.imglst = imglst
21 | self.annotations = annotations
22 |
23 | # machine-specific config
24 | self.num_gpu = cfg.gpu
25 | self.batch_size = cfg.im_batch_size
26 | self.batch_size_per_gpu = self.batch_size // cfg.gpu
27 | assert self.batch_size % cfg.gpu == 0, "batch size must be divisible by the number of GPUs"
28 |
29 |
30 | if self.split == "train":
31 | self.num_epochs = cfg.num_epochs
32 | else:
33 | self.num_epochs = 1
34 |
35 | # load the img file list
36 | self.imgs = [line.strip() for line in open(self.imglst).readlines()]
37 |
38 | self.num_samples = len(self.imgs) # one epoch length
39 |
40 | self.num_batches_per_epoch = int(
41 | math.ceil(self.num_samples / float(self.batch_size)))
42 | self.num_batches = int(self.num_batches_per_epoch * self.num_epochs)
43 | self.valid_idxs = range(self.num_samples)
44 |
45 | logging.info("Loaded %s imgs", len(self.imgs))
46 |
47 | def get_sample(self, idx):
48 | """
49 | preprocess one sample from the list
50 | """
51 | cfg = self.cfg
52 | img_file_path = self.imgs[idx]
53 |
54 | imgname = os.path.splitext(os.path.basename(img_file_path))[0]
55 |
56 | frame = cv2.imread(img_file_path)
57 | im = frame.astype("float32")
58 |
59 | resized_image = resizeImage(im, cfg.short_edge_size, cfg.max_size)
60 |
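    # scale = average of the height/width resize ratios; presumably used later to map predictions back to the original image size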
61 | scale = (resized_image.shape[0] * 1.0 / im.shape[0] + \
62 | resized_image.shape[1] * 1.0 / im.shape[1]) / 2.0
63 |
64 | return resized_image, scale, imgname, (im.shape[0], im.shape[1])
65 |
66 | def collect_batch(self, data, idxs=None):
67 | """
68 | collect the idxs of the data list into a dictionary
69 | """
70 | if idxs is None:
71 | idxs = range(len(data))
72 | imgs, scales, imgnames, shapes = zip(*[data[idx] for idx in idxs])
73 |
74 | return {
75 | "imgs": imgs,
76 | "scales": scales,
77 | "imgnames": imgnames,
78 | "ori_shapes": shapes
79 | }
80 |
81 |
82 |
--------------------------------------------------------------------------------
/deep_sort/__init__.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 |
--------------------------------------------------------------------------------
/deep_sort/detection.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | import numpy as np
3 |
4 |
5 | class Detection(object):
6 | """
7 | This class represents a bounding box detection in a single image.
8 |
9 | Parameters
10 | ----------
11 | tlwh : array_like
12 | Bounding box in format `(x, y, w, h)`.
13 | confidence : float
14 | Detector confidence score.
15 | feature : array_like
16 | A feature vector that describes the object contained in this image.
17 |
18 | Attributes
19 | ----------
20 | tlwh : ndarray
21 | Bounding box in format `(top left x, top left y, width, height)`.
22 | confidence : ndarray
23 | Detector confidence score.
24 | feature : ndarray | NoneType
25 | A feature vector that describes the object contained in this image.
26 |
27 | """
28 |
29 | def __init__(self, tlwh, confidence, feature):
30 | self.tlwh = np.asarray(tlwh, dtype=np.float)
31 | self.confidence = float(confidence)
32 | self.feature = np.asarray(feature, dtype=np.float32)
33 |
34 | def to_tlbr(self):
35 | """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
36 | `(top left, bottom right)`.
37 | """
38 | ret = self.tlwh.copy()
39 | ret[2:] += ret[:2]
40 | return ret
41 |
42 | def to_xyah(self):
43 | """Convert bounding box to format `(center x, center y, aspect ratio,
44 | height)`, where the aspect ratio is `width / height`.
45 | """
46 | ret = self.tlwh.copy()
47 | ret[:2] += ret[2:] / 2
48 | ret[2] /= ret[3]
49 | return ret
50 |
--------------------------------------------------------------------------------
/deep_sort/iou_matching.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | from __future__ import absolute_import
3 | import numpy as np
4 | from . import linear_assignment
5 |
6 |
7 | def iou(bbox, candidates):
8 | """Computer intersection over union.
9 |
10 | Parameters
11 | ----------
12 | bbox : ndarray
13 | A bounding box in format `(top left x, top left y, width, height)`.
14 | candidates : ndarray
15 | A matrix of candidate bounding boxes (one per row) in the same format
16 | as `bbox`.
17 |
18 | Returns
19 | -------
20 | ndarray
21 | The intersection over union in [0, 1] between the `bbox` and each
22 | candidate. A higher score means a larger fraction of the `bbox` is
23 | occluded by the candidate.
24 |
25 | """
26 | bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:]
27 | candidates_tl = candidates[:, :2]
28 | candidates_br = candidates[:, :2] + candidates[:, 2:]
29 |
30 | tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis],
31 | np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]]
32 | br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis],
33 | np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]]
34 | wh = np.maximum(0., br - tl)
35 |
36 | area_intersection = wh.prod(axis=1)
37 | area_bbox = bbox[2:].prod()
38 | area_candidates = candidates[:, 2:].prod(axis=1)
39 | return area_intersection / (area_bbox + area_candidates - area_intersection)
40 |
41 |
42 | def iou_cost(tracks, detections, track_indices=None,
43 | detection_indices=None):
44 | """An intersection over union distance metric.
45 |
46 | Parameters
47 | ----------
48 | tracks : List[deep_sort.track.Track]
49 | A list of tracks.
50 | detections : List[deep_sort.detection.Detection]
51 | A list of detections.
52 | track_indices : Optional[List[int]]
53 | A list of indices to tracks that should be matched. Defaults to
54 | all `tracks`.
55 | detection_indices : Optional[List[int]]
56 | A list of indices to detections that should be matched. Defaults
57 | to all `detections`.
58 |
59 | Returns
60 | -------
61 | ndarray
62 | Returns a cost matrix of shape
63 | len(track_indices), len(detection_indices) where entry (i, j) is
64 | `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`.
65 |
66 | """
67 | if track_indices is None:
68 | track_indices = np.arange(len(tracks))
69 | if detection_indices is None:
70 | detection_indices = np.arange(len(detections))
71 |
72 | cost_matrix = np.zeros((len(track_indices), len(detection_indices)))
73 | for row, track_idx in enumerate(track_indices):
74 | if tracks[track_idx].time_since_update > 1:
75 | cost_matrix[row, :] = linear_assignment.INFTY_COST
76 | continue
77 |
78 | bbox = tracks[track_idx].to_tlwh()
79 | candidates = np.asarray([detections[i].tlwh for i in detection_indices])
80 | cost_matrix[row, :] = 1. - iou(bbox, candidates)
81 | return cost_matrix
82 |
--------------------------------------------------------------------------------
/deep_sort/kalman_filter.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | import numpy as np
3 | import scipy.linalg
4 |
5 |
6 | """
7 | Table for the 0.95 quantile of the chi-square distribution with N degrees of
8 | freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
9 | function and used as Mahalanobis gating threshold.
10 | """
11 | chi2inv95 = {
12 | 1: 3.8415,
13 | 2: 5.9915,
14 | 3: 7.8147,
15 | 4: 9.4877,
16 | 5: 11.070,
17 | 6: 12.592,
18 | 7: 14.067,
19 | 8: 15.507,
20 | 9: 16.919}
21 |
22 |
23 | class KalmanFilter(object):
24 | """
25 | A simple Kalman filter for tracking bounding boxes in image space.
26 |
27 | The 8-dimensional state space
28 |
29 | x, y, a, h, vx, vy, va, vh
30 |
31 | contains the bounding box center position (x, y), aspect ratio a, height h,
32 | and their respective velocities.
33 |
34 | Object motion follows a constant velocity model. The bounding box location
35 | (x, y, a, h) is taken as direct observation of the state space (linear
36 | observation model).
37 |
38 | """
39 |
40 | def __init__(self):
41 | ndim, dt = 4, 1.
42 |
43 | # Create Kalman filter model matrices.
44 | self._motion_mat = np.eye(2 * ndim, 2 * ndim)
45 | for i in range(ndim):
46 | self._motion_mat[i, ndim + i] = dt
47 | self._update_mat = np.eye(ndim, 2 * ndim)
48 |
49 | # Motion and observation uncertainty are chosen relative to the current
50 | # state estimate. These weights control the amount of uncertainty in
51 | # the model. This is a bit hacky.
52 | self._std_weight_position = 1. / 20
53 | self._std_weight_velocity = 1. / 160
54 |
55 | def initiate(self, measurement):
56 | """Create track from unassociated measurement.
57 |
58 | Parameters
59 | ----------
60 | measurement : ndarray
61 | Bounding box coordinates (x, y, a, h) with center position (x, y),
62 | aspect ratio a, and height h.
63 |
64 | Returns
65 | -------
66 | (ndarray, ndarray)
67 | Returns the mean vector (8 dimensional) and covariance matrix (8x8
68 | dimensional) of the new track. Unobserved velocities are initialized
69 | to 0 mean.
70 |
71 | """
72 | mean_pos = measurement
73 | mean_vel = np.zeros_like(mean_pos)
74 | mean = np.r_[mean_pos, mean_vel]
75 |
76 | std = [
77 | 2 * self._std_weight_position * measurement[3],
78 | 2 * self._std_weight_position * measurement[3],
79 | 1e-2,
80 | 2 * self._std_weight_position * measurement[3],
81 | 10 * self._std_weight_velocity * measurement[3],
82 | 10 * self._std_weight_velocity * measurement[3],
83 | 1e-5,
84 | 10 * self._std_weight_velocity * measurement[3]]
85 |
86 | covariance = np.diag(np.square(std))
87 | return mean, covariance
88 |
89 | def predict(self, mean, covariance):
90 | """Run Kalman filter prediction step.
91 |
92 | Parameters
93 | ----------
94 | mean : ndarray
95 | The 8 dimensional mean vector of the object state at the previous
96 | time step.
97 | covariance : ndarray
98 | The 8x8 dimensional covariance matrix of the object state at the
99 | previous time step.
100 |
101 | Returns
102 | -------
103 | (ndarray, ndarray)
104 | Returns the mean vector and covariance matrix of the predicted
105 | state. Unobserved velocities are initialized to 0 mean.
106 |
107 | """
108 |
109 | std_pos = [
110 | self._std_weight_position * mean[3],
111 | self._std_weight_position * mean[3],
112 | 1e-2,
113 | self._std_weight_position * mean[3]]
114 | std_vel = [
115 | self._std_weight_velocity * mean[3],
116 | self._std_weight_velocity * mean[3],
117 | 1e-5,
118 | self._std_weight_velocity * mean[3]]
119 | motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))
120 |
121 | mean = np.dot(self._motion_mat, mean)
122 | covariance = np.linalg.multi_dot((
123 | self._motion_mat, covariance, self._motion_mat.T)) + motion_cov
124 |
125 | return mean, covariance
126 |
127 | def project(self, mean, covariance):
128 | """Project state distribution to measurement space.
129 |
130 | Parameters
131 | ----------
132 | mean : ndarray
133 | The state's mean vector (8 dimensional array).
134 | covariance : ndarray
135 | The state's covariance matrix (8x8 dimensional).
136 |
137 | Returns
138 | -------
139 | (ndarray, ndarray)
140 | Returns the projected mean and covariance matrix of the given state
141 | estimate.
142 |
143 | """
144 | std = [
145 | self._std_weight_position * mean[3],
146 | self._std_weight_position * mean[3],
147 | 1e-1,
148 | self._std_weight_position * mean[3]]
149 |
150 | innovation_cov = np.diag(np.square(std))
151 |
152 | mean = np.dot(self._update_mat, mean)
153 | covariance = np.linalg.multi_dot((
154 | self._update_mat, covariance, self._update_mat.T))
155 | return mean, covariance + innovation_cov
156 |
157 | def update(self, mean, covariance, measurement):
158 | """Run Kalman filter correction step.
159 |
160 | Parameters
161 | ----------
162 | mean : ndarray
163 | The predicted state's mean vector (8 dimensional).
164 | covariance : ndarray
165 | The state's covariance matrix (8x8 dimensional).
166 | measurement : ndarray
167 | The 4 dimensional measurement vector (x, y, a, h), where (x, y)
168 | is the center position, a the aspect ratio, and h the height of the
169 | bounding box.
170 |
171 | Returns
172 | -------
173 | (ndarray, ndarray)
174 | Returns the measurement-corrected state distribution.
175 |
176 | """
177 | projected_mean, projected_cov = self.project(mean, covariance)
178 |
179 | chol_factor, lower = scipy.linalg.cho_factor(
180 | projected_cov, lower=True, check_finite=False)
181 | kalman_gain = scipy.linalg.cho_solve(
182 | (chol_factor, lower), np.dot(covariance, self._update_mat.T).T,
183 | check_finite=False).T
184 | innovation = measurement - projected_mean
185 |
186 | new_mean = mean + np.dot(innovation, kalman_gain.T)
187 | new_covariance = covariance - np.linalg.multi_dot((
188 | kalman_gain, projected_cov, kalman_gain.T))
189 | return new_mean, new_covariance
190 |
191 | def gating_distance(self, mean, covariance, measurements,
192 | only_position=False):
193 | """Compute gating distance between state distribution and measurements.
194 |
195 | A suitable distance threshold can be obtained from `chi2inv95`. If
196 | `only_position` is False, the chi-square distribution has 4 degrees of
197 | freedom, otherwise 2.
198 |
199 | Parameters
200 | ----------
201 | mean : ndarray
202 | Mean vector over the state distribution (8 dimensional).
203 | covariance : ndarray
204 | Covariance of the state distribution (8x8 dimensional).
205 | measurements : ndarray
206 | An Nx4 dimensional matrix of N measurements, each in
207 | format (x, y, a, h) where (x, y) is the bounding box center
208 | position, a the aspect ratio, and h the height.
209 | only_position : Optional[bool]
210 | If True, distance computation is done with respect to the bounding
211 | box center position only.
212 |
213 | Returns
214 | -------
215 | ndarray
216 | Returns an array of length N, where the i-th element contains the
217 | squared Mahalanobis distance between (mean, covariance) and
218 | `measurements[i]`.
219 |
220 | """
221 | mean, covariance = self.project(mean, covariance)
222 | if only_position:
223 | mean, covariance = mean[:2], covariance[:2, :2]
224 | measurements = measurements[:, :2]
225 |
226 | cholesky_factor = np.linalg.cholesky(covariance)
227 | d = measurements - mean
228 | z = scipy.linalg.solve_triangular(
229 | cholesky_factor, d.T, lower=True, check_finite=False,
230 | overwrite_b=True)
231 | squared_maha = np.sum(z * z, axis=0)
232 | return squared_maha
233 |
--------------------------------------------------------------------------------
/deep_sort/linear_assignment.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | from __future__ import absolute_import
3 | import numpy as np
4 | #from sklearn.utils.linear_assignment_ import linear_assignment
5 | from scipy.optimize import linear_sum_assignment
6 | from . import kalman_filter
7 |
8 |
9 | INFTY_COST = 1e+5
10 |
11 |
12 | def min_cost_matching(
13 | distance_metric, max_distance, tracks, detections, track_indices=None,
14 | detection_indices=None):
15 | """Solve linear assignment problem.
16 |
17 | Parameters
18 | ----------
19 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
20 | The distance metric is given a list of tracks and detections as well as
21 | a list of N track indices and M detection indices. The metric should
22 | return the NxM dimensional cost matrix, where element (i, j) is the
23 | association cost between the i-th track in the given track indices and
24 | the j-th detection in the given detection_indices.
25 | max_distance : float
26 | Gating threshold. Associations with cost larger than this value are
27 | disregarded.
28 | tracks : List[track.Track]
29 | A list of predicted tracks at the current time step.
30 | detections : List[detection.Detection]
31 | A list of detections at the current time step.
32 | track_indices : List[int]
33 | List of track indices that maps rows in `cost_matrix` to tracks in
34 | `tracks` (see description above).
35 | detection_indices : List[int]
36 | List of detection indices that maps columns in `cost_matrix` to
37 | detections in `detections` (see description above).
38 |
39 | Returns
40 | -------
41 | (List[(int, int)], List[int], List[int])
42 | Returns a tuple with the following three entries:
43 | * A list of matched track and detection indices.
44 | * A list of unmatched track indices.
45 | * A list of unmatched detection indices.
46 |
47 | """
48 | if track_indices is None:
49 | track_indices = np.arange(len(tracks))
50 | if detection_indices is None:
51 | detection_indices = np.arange(len(detections))
52 |
53 | if len(detection_indices) == 0 or len(track_indices) == 0:
54 | return [], track_indices, detection_indices # Nothing to match.
55 |
56 | cost_matrix = distance_metric(
57 | tracks, detections, track_indices, detection_indices)
58 | cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
59 | #indices = linear_assignment(cost_matrix)
60 | indices = linear_sum_assignment(cost_matrix)
61 | indices = np.asarray(indices)
62 | indices = np.transpose(indices)
63 |
64 | matches, unmatched_tracks, unmatched_detections = [], [], []
65 | for col, detection_idx in enumerate(detection_indices):
66 | if col not in indices[:, 1]:
67 | unmatched_detections.append(detection_idx)
68 | for row, track_idx in enumerate(track_indices):
69 | if row not in indices[:, 0]:
70 | unmatched_tracks.append(track_idx)
71 | for row, col in indices:
72 | track_idx = track_indices[row]
73 | detection_idx = detection_indices[col]
74 | if cost_matrix[row, col] > max_distance:
75 | unmatched_tracks.append(track_idx)
76 | unmatched_detections.append(detection_idx)
77 | else:
78 | matches.append((track_idx, detection_idx))
79 | return matches, unmatched_tracks, unmatched_detections
80 |
81 |
82 | def matching_cascade(
83 | distance_metric, max_distance, cascade_depth, tracks, detections,
84 | track_indices=None, detection_indices=None):
85 | """Run matching cascade.
86 |
87 | Parameters
88 | ----------
89 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
90 | The distance metric is given a list of tracks and detections as well as
91 | a list of N track indices and M detection indices. The metric should
92 | return the NxM dimensional cost matrix, where element (i, j) is the
93 | association cost between the i-th track in the given track indices and
94 | the j-th detection in the given detection indices.
95 | max_distance : float
96 | Gating threshold. Associations with cost larger than this value are
97 | disregarded.
98 | cascade_depth: int
99 | The cascade depth; it should be set to the maximum track age.
100 | tracks : List[track.Track]
101 | A list of predicted tracks at the current time step.
102 | detections : List[detection.Detection]
103 | A list of detections at the current time step.
104 | track_indices : Optional[List[int]]
105 | List of track indices that maps rows in `cost_matrix` to tracks in
106 | `tracks` (see description above). Defaults to all tracks.
107 | detection_indices : Optional[List[int]]
108 | List of detection indices that maps columns in `cost_matrix` to
109 | detections in `detections` (see description above). Defaults to all
110 | detections.
111 |
112 | Returns
113 | -------
114 | (List[(int, int)], List[int], List[int])
115 | Returns a tuple with the following three entries:
116 | * A list of matched track and detection indices.
117 | * A list of unmatched track indices.
118 | * A list of unmatched detection indices.
119 |
120 | """
121 | if track_indices is None:
122 | track_indices = list(range(len(tracks)))
123 | if detection_indices is None:
124 | detection_indices = list(range(len(detections)))
125 |
126 | unmatched_detections = detection_indices
127 | matches = []
128 | for level in range(cascade_depth):
129 | if len(unmatched_detections) == 0: # No detections left
130 | break
131 |
132 | track_indices_l = [
133 | k for k in track_indices
134 | if tracks[k].time_since_update == 1 + level
135 | ]
136 | if len(track_indices_l) == 0: # Nothing to match at this level
137 | continue
138 |
139 | matches_l, _, unmatched_detections = \
140 | min_cost_matching(
141 | distance_metric, max_distance, tracks, detections,
142 | track_indices_l, unmatched_detections)
143 | matches += matches_l
144 | unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches))
145 | return matches, unmatched_tracks, unmatched_detections
146 |
147 |
148 | def gate_cost_matrix(
149 | kf, cost_matrix, tracks, detections, track_indices, detection_indices,
150 | gated_cost=INFTY_COST, only_position=False):
151 | """Invalidate infeasible entries in cost matrix based on the state
152 | distributions obtained by Kalman filtering.
153 |
154 | Parameters
155 | ----------
156 | kf : The Kalman filter.
157 | cost_matrix : ndarray
158 | The NxM dimensional cost matrix, where N is the number of track indices
159 | and M is the number of detection indices, such that entry (i, j) is the
160 | association cost between `tracks[track_indices[i]]` and
161 | `detections[detection_indices[j]]`.
162 | tracks : List[track.Track]
163 | A list of predicted tracks at the current time step.
164 | detections : List[detection.Detection]
165 | A list of detections at the current time step.
166 | track_indices : List[int]
167 | List of track indices that maps rows in `cost_matrix` to tracks in
168 | `tracks` (see description above).
169 | detection_indices : List[int]
170 | List of detection indices that maps columns in `cost_matrix` to
171 | detections in `detections` (see description above).
172 | gated_cost : Optional[float]
173 | Entries in the cost matrix corresponding to infeasible associations are
174 | set this value. Defaults to a very large value.
175 | only_position : Optional[bool]
176 | If True, only the x, y position of the state distribution is considered
177 | during gating. Defaults to False.
178 |
179 | Returns
180 | -------
181 | ndarray
182 | Returns the modified cost matrix.
183 |
184 | """
185 | gating_dim = 2 if only_position else 4
186 | gating_threshold = kalman_filter.chi2inv95[gating_dim]
187 | measurements = np.asarray(
188 | [detections[i].to_xyah() for i in detection_indices])
189 | for row, track_idx in enumerate(track_indices):
190 | track = tracks[track_idx]
191 | gating_distance = kf.gating_distance(
192 | track.mean, track.covariance, measurements, only_position)
193 | cost_matrix[row, gating_distance > gating_threshold] = gated_cost
194 | return cost_matrix
195 |
--------------------------------------------------------------------------------
/deep_sort/nn_matching.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | import numpy as np
3 |
4 |
5 | def _pdist(a, b):
6 | """Compute pair-wise squared distance between points in `a` and `b`.
7 |
8 | Parameters
9 | ----------
10 | a : array_like
11 | An NxM matrix of N samples of dimensionality M.
12 | b : array_like
13 | An LxM matrix of L samples of dimensionality M.
14 |
15 | Returns
16 | -------
17 | ndarray
18 | Returns a matrix of size len(a), len(b) such that element (i, j)
19 | contains the squared distance between `a[i]` and `b[j]`.
20 |
21 | """
22 | a, b = np.asarray(a), np.asarray(b)
23 | if len(a) == 0 or len(b) == 0:
24 | return np.zeros((len(a), len(b)))
25 | a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1)
26 | r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :]
27 | r2 = np.clip(r2, 0., float(np.inf))
28 | return r2
29 |
30 |
31 | def _cosine_distance(a, b, data_is_normalized=False):
32 | """Compute pair-wise cosine distance between points in `a` and `b`.
33 |
34 | Parameters
35 | ----------
36 | a : array_like
37 | An NxM matrix of N samples of dimensionality M.
38 | b : array_like
39 | An LxM matrix of L samples of dimensionality M.
40 | data_is_normalized : Optional[bool]
41 | If True, assumes rows in a and b are unit length vectors.
42 | Otherwise, a and b are explicitly normalized to length 1.
43 |
44 | Returns
45 | -------
46 | ndarray
47 | Returns a matrix of size len(a), len(b) such that element (i, j)
48 | contains the cosine distance between `a[i]` and `b[j]`.
49 |
50 | """
51 | if not data_is_normalized:
52 | a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True)
53 | b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True)
54 | return 1. - np.dot(a, b.T)
55 |
56 |
57 | def _nn_euclidean_distance(x, y):
58 | """ Helper function for nearest neighbor distance metric (Euclidean).
59 |
60 | Parameters
61 | ----------
62 | x : ndarray
63 | A matrix of N row-vectors (sample points).
64 | y : ndarray
65 | A matrix of M row-vectors (query points).
66 |
67 | Returns
68 | -------
69 | ndarray
70 | A vector of length M that contains for each entry in `y` the
71 | smallest Euclidean distance to a sample in `x`.
72 |
73 | """
74 | distances = _pdist(x, y)
75 | return np.maximum(0.0, distances.min(axis=0))
76 |
77 |
78 | def _nn_cosine_distance(x, y):
79 | """ Helper function for nearest neighbor distance metric (cosine).
80 |
81 | Parameters
82 | ----------
83 | x : ndarray
84 | A matrix of N row-vectors (sample points).
85 | y : ndarray
86 | A matrix of M row-vectors (query points).
87 |
88 | Returns
89 | -------
90 | ndarray
91 | A vector of length M that contains for each entry in `y` the
92 | smallest cosine distance to a sample in `x`.
93 |
94 | """
95 | distances = _cosine_distance(x, y)
96 | return distances.min(axis=0)
97 |
98 |
99 | class NearestNeighborDistanceMetric(object):
100 | """
101 | A nearest neighbor distance metric that, for each target, returns
102 | the closest distance to any sample that has been observed so far.
103 |
104 | Parameters
105 | ----------
106 | metric : str
107 | Either "euclidean" or "cosine".
108 | matching_threshold: float
109 | The matching threshold. Samples with larger distance are considered an
110 | invalid match.
111 | budget : Optional[int]
112 | If not None, fix samples per class to at most this number. Removes
113 | the oldest samples when the budget is reached.
114 |
115 | Attributes
116 | ----------
117 | samples : Dict[int -> List[ndarray]]
118 | A dictionary that maps from target identities to the list of samples
119 | that have been observed so far.
120 |
121 | """
122 |
123 | def __init__(self, metric, matching_threshold, budget=None):
124 |
125 |
126 | if metric == "euclidean":
127 | self._metric = _nn_euclidean_distance
128 | elif metric == "cosine":
129 | self._metric = _nn_cosine_distance
130 | else:
131 | raise ValueError(
132 | "Invalid metric; must be either 'euclidean' or 'cosine'")
133 | self.matching_threshold = matching_threshold
134 | self.budget = budget
135 | self.samples = {}
136 |
137 | def partial_fit(self, features, targets, active_targets):
138 | """Update the distance metric with new data.
139 |
140 | Parameters
141 | ----------
142 | features : ndarray
143 | An NxM matrix of N features of dimensionality M.
144 | targets : ndarray
145 | An integer array of associated target identities.
146 | active_targets : List[int]
147 | A list of targets that are currently present in the scene.
148 |
149 | """
150 | for feature, target in zip(features, targets):
151 | self.samples.setdefault(target, []).append(feature)
152 | if self.budget is not None:
153 | self.samples[target] = self.samples[target][-self.budget:]
154 | self.samples = {k: self.samples[k] for k in active_targets}
155 |
156 | def distance(self, features, targets):
157 | """Compute distance between features and targets.
158 |
159 | Parameters
160 | ----------
161 | features : ndarray
162 | An NxM matrix of N features of dimensionality M.
163 | targets : List[int]
164 | A list of targets to match the given `features` against.
165 |
166 | Returns
167 | -------
168 | ndarray
169 |             Returns a cost matrix of shape (len(targets), len(features)), where
170 |             element (i, j) contains the smallest distance under the chosen metric
171 |             between the samples of `targets[i]` and `features[j]`.
172 |
173 | """
174 | cost_matrix = np.zeros((len(targets), len(features)))
175 | for i, target in enumerate(targets):
176 | cost_matrix[i, :] = self._metric(self.samples[target], features)
177 | return cost_matrix
178 |
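For reference, a minimal usage sketch of `NearestNeighborDistanceMetric` (not part of nn_matching.py; the `deep_sort.nn_matching` module path is assumed from the repository layout):

```python
# Hedged sketch: keep per-track feature galleries and query new detections against them.
import numpy as np
from deep_sort.nn_matching import NearestNeighborDistanceMetric

metric = NearestNeighborDistanceMetric("cosine", matching_threshold=0.5, budget=30)

gallery = np.random.rand(4, 128).astype(np.float32)   # 4 stored detection features
targets = np.array([1, 1, 2, 2])                      # the track id of each stored feature
metric.partial_fit(gallery, targets, active_targets=[1, 2])

queries = np.random.rand(3, 128).astype(np.float32)   # 3 new detection features
cost = metric.distance(queries, targets=[1, 2])       # shape (2, 3): tracks x detections
```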
--------------------------------------------------------------------------------
/deep_sort/track.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 |
3 |
4 | class TrackState:
5 | """
6 | Enumeration type for the single target track state. Newly created tracks are
7 | classified as `tentative` until enough evidence has been collected. Then,
8 | the track state is changed to `confirmed`. Tracks that are no longer alive
9 | are classified as `deleted` to mark them for removal from the set of active
10 | tracks.
11 |
12 | """
13 |
14 | Tentative = 1
15 | Confirmed = 2
16 | Deleted = 3
17 |
18 |
19 | class Track:
20 | """
21 | A single target track with state space `(x, y, a, h)` and associated
22 | velocities, where `(x, y)` is the center of the bounding box, `a` is the
23 | aspect ratio and `h` is the height.
24 |
25 | Parameters
26 | ----------
27 | mean : ndarray
28 | Mean vector of the initial state distribution.
29 | covariance : ndarray
30 | Covariance matrix of the initial state distribution.
31 | track_id : int
32 | A unique track identifier.
33 | n_init : int
34 | Number of consecutive detections before the track is confirmed. The
35 | track state is set to `Deleted` if a miss occurs within the first
36 | `n_init` frames.
37 | max_age : int
38 | The maximum number of consecutive misses before the track state is
39 | set to `Deleted`.
40 | feature : Optional[ndarray]
41 | Feature vector of the detection this track originates from. If not None,
42 | this feature is added to the `features` cache.
43 |
44 | Attributes
45 | ----------
46 | mean : ndarray
47 | Mean vector of the initial state distribution.
48 | covariance : ndarray
49 | Covariance matrix of the initial state distribution.
50 | track_id : int
51 | A unique track identifier.
52 | hits : int
53 | Total number of measurement updates.
54 | age : int
55 |         Total number of frames since first occurrence.
56 | time_since_update : int
57 | Total number of frames since last measurement update.
58 | state : TrackState
59 | The current track state.
60 | features : List[ndarray]
61 | A cache of features. On each measurement update, the associated feature
62 | vector is added to this list.
63 |
64 | """
65 |
66 | def __init__(self, mean, covariance, track_id, n_init, max_age,
67 | feature=None):
68 | self.mean = mean
69 | self.covariance = covariance
70 | self.track_id = track_id
71 | self.hits = 1
72 | self.age = 1
73 | self.time_since_update = 0
74 |
75 | self.state = TrackState.Tentative
76 | self.features = []
77 | if feature is not None:
78 | self.features.append(feature)
79 |
80 | self._n_init = n_init
81 | self._max_age = max_age
82 |
83 | def to_tlwh(self):
84 | """Get current position in bounding box format `(top left x, top left y,
85 | width, height)`.
86 |
87 | Returns
88 | -------
89 | ndarray
90 | The bounding box.
91 |
92 | """
93 | ret = self.mean[:4].copy()
94 | ret[2] *= ret[3]
95 | ret[:2] -= ret[2:] / 2
96 | return ret
97 |
98 | def to_tlbr(self):
99 | """Get current position in bounding box format `(min x, miny, max x,
100 | max y)`.
101 |
102 | Returns
103 | -------
104 | ndarray
105 | The bounding box.
106 |
107 | """
108 | ret = self.to_tlwh()
109 | ret[2:] = ret[:2] + ret[2:]
110 | return ret
111 |
112 | def predict(self, kf):
113 | """Propagate the state distribution to the current time step using a
114 | Kalman filter prediction step.
115 |
116 | Parameters
117 | ----------
118 | kf : kalman_filter.KalmanFilter
119 | The Kalman filter.
120 |
121 | """
122 | self.mean, self.covariance = kf.predict(self.mean, self.covariance)
123 | self.age += 1
124 | self.time_since_update += 1
125 |
126 | def update(self, kf, detection):
127 | """Perform Kalman filter measurement update step and update the feature
128 | cache.
129 |
130 | Parameters
131 | ----------
132 | kf : kalman_filter.KalmanFilter
133 | The Kalman filter.
134 | detection : Detection
135 | The associated detection.
136 |
137 | """
138 | self.mean, self.covariance = kf.update(
139 | self.mean, self.covariance, detection.to_xyah())
140 | self.features.append(detection.feature)
141 |
142 | self.hits += 1
143 | self.time_since_update = 0
144 | if self.state == TrackState.Tentative and self.hits >= self._n_init:
145 | self.state = TrackState.Confirmed
146 |
147 | def mark_missed(self):
148 | """Mark this track as missed (no association at the current time step).
149 | """
150 | if self.state == TrackState.Tentative:
151 | self.state = TrackState.Deleted
152 | elif self.time_since_update > self._max_age:
153 | self.state = TrackState.Deleted
154 |
155 | def is_tentative(self):
156 | """Returns True if this track is tentative (unconfirmed).
157 | """
158 | return self.state == TrackState.Tentative
159 |
160 | def is_confirmed(self):
161 | """Returns True if this track is confirmed."""
162 | return self.state == TrackState.Confirmed
163 |
164 | def is_deleted(self):
165 | """Returns True if this track is dead and should be deleted."""
166 | return self.state == TrackState.Deleted
167 |
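As a small illustration (plain NumPy, not part of track.py), the `(x, y, a, h)` to `(top-left x, top-left y, w, h)` conversion performed by `to_tlwh()` works out as:

```python
# Hedged worked example of the arithmetic in Track.to_tlwh().
import numpy as np

mean = np.array([100.0, 200.0, 0.5, 80.0])  # center x, center y, aspect ratio w/h, height
ret = mean[:4].copy()
ret[2] *= ret[3]        # w = a * h = 40
ret[:2] -= ret[2:] / 2  # top-left corner = center - (w, h) / 2
print(ret)              # [ 80. 160.  40.  80.]
```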
--------------------------------------------------------------------------------
/deep_sort/tracker.py:
--------------------------------------------------------------------------------
1 | # vim: expandtab:ts=4:sw=4
2 | from __future__ import absolute_import
3 | import numpy as np
4 | from . import kalman_filter
5 | from . import linear_assignment
6 | from . import iou_matching
7 | from .track import Track
8 |
9 |
10 | class Tracker:
11 | """
12 | This is the multi-target tracker.
13 |
14 | Parameters
15 | ----------
16 | metric : nn_matching.NearestNeighborDistanceMetric
17 | A distance metric for measurement-to-track association.
18 | max_age : int
19 |         Maximum number of consecutive misses before a track is deleted.
20 | n_init : int
21 | Number of consecutive detections before the track is confirmed. The
22 | track state is set to `Deleted` if a miss occurs within the first
23 | `n_init` frames.
24 |
25 | Attributes
26 | ----------
27 | metric : nn_matching.NearestNeighborDistanceMetric
28 | The distance metric used for measurement to track association.
29 | max_age : int
30 |         Maximum number of consecutive misses before a track is deleted.
31 |     n_init : int
32 |         Number of frames that a track remains in the initialization phase.
33 | kf : kalman_filter.KalmanFilter
34 | A Kalman filter to filter target trajectories in image space.
35 | tracks : List[Track]
36 | The list of active tracks at the current time step.
37 |
38 | """
39 |
40 | def __init__(self, metric, max_iou_distance=0.5, max_age=60, n_init=1):
41 | self.metric = metric
42 | self.max_iou_distance = max_iou_distance
43 | self.max_age = max_age
44 | self.n_init = n_init
45 |
46 | self.kf = kalman_filter.KalmanFilter()
47 | self.tracks = []
48 | self._next_id = 1
49 |
50 | def predict(self):
51 | """Propagate track state distributions one time step forward.
52 |
53 | This function should be called once every time step, before `update`.
54 | """
55 | for track in self.tracks:
56 | track.predict(self.kf)
57 | def update(self, detections):
58 | """Perform measurement update and track management.
59 |
60 | Parameters
61 | ----------
62 | detections : List[deep_sort.detection.Detection]
63 | A list of detections at the current time step.
64 |
65 | """
66 | # Run matching cascade.
67 | matches, unmatched_tracks, unmatched_detections = \
68 | self._match(detections)
69 |
70 | # Update track set.
71 | for track_idx, detection_idx in matches:
72 | self.tracks[track_idx].update(
73 | self.kf, detections[detection_idx])
74 | for track_idx in unmatched_tracks:
75 | self.tracks[track_idx].mark_missed()
76 | for detection_idx in unmatched_detections:
77 | self._initiate_track(detections[detection_idx])
78 | self.tracks = [t for t in self.tracks if not t.is_deleted()]
79 |
80 | # Update distance metric.
81 | active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]
82 | features, targets = [], []
83 | for track in self.tracks:
84 | if not track.is_confirmed():
85 | continue
86 | features += track.features
87 | targets += [track.track_id for _ in track.features]
88 | track.features = []
89 | self.metric.partial_fit(
90 | np.asarray(features), np.asarray(targets), active_targets)
91 |
92 | def _match(self, detections):
93 |
94 | def gated_metric(tracks, dets, track_indices, detection_indices):
95 | features = np.array([dets[i].feature for i in detection_indices])
96 | targets = np.array([tracks[i].track_id for i in track_indices])
97 |
98 | cost_matrix = self.metric.distance(features, targets)
99 | cost_matrix = linear_assignment.gate_cost_matrix(
100 | self.kf, cost_matrix, tracks, dets, track_indices,
101 | detection_indices)
102 |
103 | return cost_matrix
104 |
105 | # Split track set into confirmed and unconfirmed tracks.
106 | confirmed_tracks = [
107 | i for i, t in enumerate(self.tracks) if t.is_confirmed()]
108 | unconfirmed_tracks = [
109 | i for i, t in enumerate(self.tracks) if not t.is_confirmed()]
110 |
111 | # Associate confirmed tracks using appearance features.
112 | matches_a, unmatched_tracks_a, unmatched_detections = \
113 | linear_assignment.matching_cascade(
114 | gated_metric, self.metric.matching_threshold, self.max_age,
115 | self.tracks, detections, confirmed_tracks)
116 |
117 | # Associate remaining tracks together with unconfirmed tracks using IOU.
118 | iou_track_candidates = unconfirmed_tracks + [
119 | k for k in unmatched_tracks_a if
120 | self.tracks[k].time_since_update == 1]
121 | unmatched_tracks_a = [
122 | k for k in unmatched_tracks_a if
123 | self.tracks[k].time_since_update != 1]
124 | matches_b, unmatched_tracks_b, unmatched_detections = \
125 | linear_assignment.min_cost_matching(
126 | iou_matching.iou_cost, self.max_iou_distance, self.tracks,
127 | detections, iou_track_candidates, unmatched_detections)
128 |
129 | matches = matches_a + matches_b
130 | unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
131 | return matches, unmatched_tracks, unmatched_detections
132 |
133 | def _initiate_track(self, detection):
134 | mean, covariance = self.kf.initiate(detection.to_xyah())
135 | self.tracks.append(Track(
136 | mean, covariance, self._next_id, self.n_init, self.max_age,
137 | detection.feature))
138 | self._next_id += 1
139 |
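For reference, a hedged sketch of the predict/update loop this class expects (the dummy `per_frame_detections` input is hypothetical; module paths assumed from the repository layout and run in this repo's environment):

```python
import numpy as np
from deep_sort import nn_matching
from deep_sort.detection import Detection
from deep_sort.tracker import Tracker

metric = nn_matching.NearestNeighborDistanceMetric("cosine", matching_threshold=0.5, budget=30)
tracker = Tracker(metric, max_iou_distance=0.5, max_age=60, n_init=1)

# Dummy input: 3 frames, each with one detection as (tlwh box, confidence, 128-d appearance feature).
per_frame_detections = [
    [([10.0 + 2 * t, 20.0, 30.0, 60.0], 0.9, np.random.rand(128).astype(np.float32))]
    for t in range(3)
]

for frame_boxes in per_frame_detections:
    detections = [Detection(tlwh, conf, feat) for tlwh, conf, feat in frame_boxes]
    tracker.predict()                     # must be called once per frame, before update()
    tracker.update(detections)
    for track in tracker.tracks:
        if track.is_confirmed() and track.time_since_update == 0:
            print(track.track_id, track.to_tlwh())  # current box of an actively updated track
```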
--------------------------------------------------------------------------------
/deep_sort/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from deep_sort.detection import Detection
3 | from bisect import bisect
4 |
5 | def create_obj_infos(cur_frame, final_boxes, final_probs, final_labels,
6 | box_feats, targetid2class, tracking_objs, min_confidence,
7 | min_detection_height, scale, is_coco_model=False,
8 | coco_to_actev_mapping=None):
9 |
10 | # tracking_objs is a single item
11 | obj_infos = []
12 | tracking_boxes = final_boxes / scale
13 | for j, (box, prob, label) in enumerate(zip(tracking_boxes, final_probs, final_labels)):
14 | cat_name = targetid2class[label]
15 | if is_coco_model:
16 | if cat_name not in coco_to_actev_mapping:
17 | continue
18 | else:
19 | cat_name = coco_to_actev_mapping[cat_name]
20 |
21 |         confidence_score = float(round(prob, 7))
22 |         if cat_name not in tracking_objs or confidence_score < min_confidence:
23 | continue
24 | box[2] -= box[0]
25 | box[3] -= box[1] # x, y, w, h
26 | avg_feat = box_feats[j]
27 | if len(avg_feat.shape) > 2: # [C, H, W]
28 | avg_feat = np.mean(box_feats[j], axis=(1, 2))
29 |
30 |
31 | #norm_feat = avg_feat / np.linalg.norm(avg_feat) # will be normed later
32 |
33 | list_feat = avg_feat.tolist()
34 | # frameIdx, xywh, conf, feature
35 |         bbox_data = [cur_frame, box[0], box[1], box[2], box[3], confidence_score] + list_feat
36 | obj_infos.append(bbox_data)
37 |
38 | detections = []
39 | for row in obj_infos:
40 | bbox, confidence, feature = row[1:5], row[5], row[6:]
41 | if bbox[3] < min_detection_height:
42 | continue
43 | detections.append(Detection(bbox, confidence, feature))
44 | return detections
45 |
46 |
47 | # 1: linearly interpolate boxes for frames that are missing within a track
48 | def linear_inter_bbox(tracking_data, frame_gap):
49 | # print tracking_data.shape
50 | if tracking_data.shape[0] == 0:
51 | return tracking_data
52 |     obj_indices = tracking_data[:, 1].astype(np.int64)  # np.int was removed in recent NumPy
53 | obj_ids = set(obj_indices.tolist())
54 | tracking_data_list = tracking_data.tolist()
55 | # if len(tracking_data_list) == 0:
56 | # return tracking_data
57 |
58 | # for each track
59 | for obj_index in obj_ids:
60 | mask = obj_indices == obj_index
61 | # all the frames for this track
62 | tracked_frames = tracking_data[mask][:, 0].tolist()
63 |
64 | min_frame_idx = int(min(tracked_frames))
65 | max_frame_idx = int(max(tracked_frames))
66 | whole_frames = range(min_frame_idx, max_frame_idx)
67 | missing_frames = list(set(whole_frames).difference(tracked_frames))
68 | if not missing_frames:
69 | continue
70 | for missing_frame in missing_frames:
71 | insert_index = bisect(tracked_frames, missing_frame)
72 | if insert_index == 0 or insert_index == len(whole_frames):
73 | continue
74 | selected_data = tracking_data[mask]
75 | prev_frame = selected_data[insert_index-1, 0]
76 | next_frame = selected_data[insert_index, 0]
77 | # tolerate some occlusion?
78 | if next_frame - prev_frame > 10*frame_gap:
79 | continue
80 | prev_data = selected_data[insert_index-1, 2:]
81 | next_data = selected_data[insert_index, 2:]
82 |
83 | ratio = 1.0 * (missing_frame - prev_frame) / (next_frame - prev_frame)
84 | cur_data = prev_data + (next_data - prev_data) * ratio
85 | cur_data = np.around(cur_data, decimals=2)
86 | missing_data = [missing_frame, obj_index] + cur_data.tolist()
87 | tracking_data_list.append(missing_data)
88 |
89 | tracking_data_list = sorted(tracking_data_list, key=lambda x: (x[0], x[1]))
90 | tracking_data = np.asarray(tracking_data_list)
91 | return tracking_data
92 |
93 |
94 | # 3: drop tracks that appear in fewer than 2 frames
95 | def filter_short_objs(tracking_data):
96 | # print tracking_data.shape
97 | if tracking_data.shape[0] == 0:
98 | return tracking_data
99 |     obj_indices = tracking_data[:, 1].astype(np.int64)  # np.int was removed in recent NumPy
100 | obj_ids = set(obj_indices.tolist())
101 | filter_objs = set()
102 |
103 | for obj_index in obj_ids:
104 | mask = obj_indices == obj_index
105 | num_frames = np.sum(mask)
106 | if num_frames < 2:
107 | filter_objs.add(obj_index)
108 |
109 | tracking_data_list = tracking_data.tolist()
110 | tracking_data_list = [tracklet for tracklet in tracking_data_list if int(tracklet[1]) not in filter_objs]
111 | tracking_data_list = sorted(tracking_data_list, key=lambda x: (x[0], x[1]))
112 | tracking_data = np.asarray(tracking_data_list)
113 | return tracking_data
114 |
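A hedged toy example of the interpolation and filtering helpers above (module path assumed from the repository layout):

```python
import numpy as np
from deep_sort.utils import linear_inter_bbox, filter_short_objs

# rows: [frame_id, track_id, x, y, w, h]
data = np.array([
    [0, 1, 10.0, 10.0, 20.0, 40.0],
    [1, 1, 12.0, 10.0, 20.0, 40.0],
    [3, 1, 16.0, 10.0, 20.0, 40.0],  # frame 2 is missing for track 1
    [5, 2, 50.0, 50.0, 10.0, 10.0],  # track 2 appears in only one frame
])
filled = linear_inter_bbox(data, frame_gap=1)  # inserts a row for frame 2 with x interpolated to 14.0
cleaned = filter_short_objs(filled)            # drops track 2 (fewer than 2 frames)
```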
--------------------------------------------------------------------------------
/deformable_helper.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # helper function for deformable conv
3 | import tensorflow as tf
4 |
5 | def _to_bc_h_w(x, x_shape):
6 | """(b, h, w, c) -> (b*c, h, w)"""
7 | x = tf.transpose(x, [0, 3, 1, 2])
8 | x = tf.reshape(x, (-1, x_shape[1], x_shape[2]))
9 | return x
10 |
11 | def _to_b_h_w_n_c(x, x_shape):
12 | """(b*c, h, w, n) -> (b, h, w, n, c)"""
13 | x = tf.reshape(x, (-1, x_shape[4], x_shape[1], x_shape[2], x_shape[3]))
14 | x = tf.transpose(x, [0, 2, 3, 4, 1])
15 | return x
16 |
17 | def tf_flatten(a):
18 | """Flatten tensor"""
19 | return tf.reshape(a, [-1])
20 |
21 | def _get_vals_by_coords(inputs, coords, idx, out_shape):
22 | indices = tf.stack(
23 | [idx, tf_flatten(coords[:, :, :, :, 0]),
24 | tf_flatten(coords[:, :, :, :, 1])], axis=-1
25 | )
26 | vals = tf.gather_nd(inputs, indices)
27 | vals = tf.reshape(vals, out_shape)
28 | return vals
29 |
30 | def _tf_repeat(a, repeats):
31 | """Tensorflow version of np.repeat for 1D"""
32 | # https://github.com/tensorflow/tensorflow/issues/8521
33 |
34 | if len(a.get_shape()) != 1:
35 | raise AssertionError("This is not a 1D Tensor")
36 |
37 | a = tf.expand_dims(a, -1)
38 | a = tf.tile(a, [1, repeats])
39 | a = tf_flatten(a)
40 | return a
41 |
42 | def _tf_batch_map_coordinates(inputs, coords):
43 | """Batch version of tf_map_coordinates
44 |
45 | Only supports 2D feature maps
46 |
47 | Parameters
48 | ----------
49 | inputs : ``tf.Tensor``
50 | shape = (b*c, h, w)
51 | coords : ``tf.Tensor``
52 | shape = (b*c, h, w, n, 2)
53 |
54 | Returns
55 | -------
56 | ``tf.Tensor``
57 |         A Tensor with shape (b*c, h, w, n)
58 |
59 | """
60 | input_shape = inputs.get_shape()
61 | coords_shape = coords.get_shape()
62 | batch_channel = tf.shape(inputs)[0]
63 | input_h = tf.shape(inputs)[1]
64 | input_w = tf.shape(inputs)[2]
65 | kernel_n = int(coords_shape[3])
66 | n_coords = input_h * input_w * kernel_n
67 |
68 | coords_lt = tf.cast(tf.floor(coords), 'int32')
69 | coords_rb = tf.cast(tf.ceil(coords), 'int32')
70 | coords_lb = tf.stack([coords_lt[:, :, :, :, 0], coords_rb[:, :, :, :, 1]], axis=-1)
71 | coords_rt = tf.stack([coords_rb[:, :, :, :, 0], coords_lt[:, :, :, :, 1]], axis=-1)
72 |
73 | idx = _tf_repeat(tf.range(batch_channel), n_coords)
74 |
75 | vals_lt = _get_vals_by_coords(inputs, coords_lt, idx, (batch_channel, input_h, input_w, kernel_n))
76 | vals_rb = _get_vals_by_coords(inputs, coords_rb, idx, (batch_channel, input_h, input_w, kernel_n))
77 | vals_lb = _get_vals_by_coords(inputs, coords_lb, idx, (batch_channel, input_h, input_w, kernel_n))
78 | vals_rt = _get_vals_by_coords(inputs, coords_rt, idx, (batch_channel, input_h, input_w, kernel_n))
79 |
80 | coords_offset_lt = coords - tf.cast(coords_lt, 'float32')
81 |
82 | vals_t = vals_lt + (vals_rt - vals_lt) * coords_offset_lt[:, :, :, :, 0]
83 | vals_b = vals_lb + (vals_rb - vals_lb) * coords_offset_lt[:, :, :, :, 0]
84 | mapped_vals = vals_t + (vals_b - vals_t) * coords_offset_lt[:, :, :, :, 1]
85 |
86 | return mapped_vals
87 |
88 | def _tf_batch_map_offsets(inputs, offsets, grid_offset):
89 | """Batch map offsets into input
90 |
91 | Parameters
92 | ------------
93 | inputs : ``tf.Tensor``
94 | shape = (b, h, w, c)
95 | offsets: ``tf.Tensor``
96 | shape = (b, h, w, 2*n)
97 |     grid_offset: ``tf.Tensor``
98 | Offset grids shape = (h, w, n, 2)
99 |
100 | Returns
101 | -------
102 | ``tf.Tensor``
103 |         A Tensor with shape (b, h, w, n, c)
104 |
105 | """
106 | input_shape = inputs.get_shape()
107 | batch_size = tf.shape(inputs)[0]
108 | kernel_n = int(int(offsets.get_shape()[3]) / 2)
109 | input_h = tf.shape(inputs)[1]
110 | input_w = tf.shape(inputs)[2]
111 | channel = input_shape[3]
112 |
113 | # inputs (b, h, w, c) --> (b*c, h, w)
114 | inputs = _to_bc_h_w(inputs, tf.shape(inputs))
115 |
116 | # offsets (b, h, w, 2*n) --> (b, h, w, n, 2)
117 | offsets = tf.reshape(offsets, (batch_size, input_h, input_w, kernel_n, 2))
118 | # offsets (b, h, w, n, 2) --> (b*c, h, w, n, 2)
119 | # offsets = tf.tile(offsets, [channel, 1, 1, 1, 1])
120 |
121 | coords = tf.expand_dims(grid_offset, 0) # grid_offset --> (1, h, w, n, 2)
122 | coords = tf.tile(coords, [batch_size, 1, 1, 1, 1]) + offsets # grid_offset --> (b, h, w, n, 2)
123 |
124 | # clip out of bound
125 | coords = tf.stack(
126 | [
127 | tf.clip_by_value(coords[:, :, :, :, 0], 0.0, tf.cast(input_h - 1, 'float32')),
128 | tf.clip_by_value(coords[:, :, :, :, 1], 0.0, tf.cast(input_w - 1, 'float32'))
129 | ], axis=-1
130 | )
131 | coords = tf.tile(coords, [channel, 1, 1, 1, 1])
132 |
133 | mapped_vals = _tf_batch_map_coordinates(inputs, coords)
134 | # (b*c, h, w, n) --> (b, h, w, n, c)
135 | mapped_vals = _to_b_h_w_n_c(mapped_vals, [batch_size, input_h, input_w, kernel_n, channel])
136 |
137 | return mapped_vals
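A hedged sanity-check sketch for the small helpers above (assumes a TF1 environment, as the rest of this repository does; not part of the file):

```python
import tensorflow as tf
from deformable_helper import tf_flatten, _tf_repeat

a = _tf_repeat(tf.constant([1, 2, 3]), 2)  # each element repeated twice
b = tf_flatten(tf.ones([2, 3]))            # flattened to shape (6,)
with tf.Session() as sess:
    print(sess.run(a))        # [1 1 2 2 3 3]
    print(sess.run(b).shape)  # (6,)
```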
--------------------------------------------------------------------------------
/diva_io/README.md:
--------------------------------------------------------------------------------
1 | # DIVA IO Package
2 |
3 | Version 0.3
4 |
5 | Author: Lijun Yu
6 |
7 | Email: lijun@lj-y.com
8 |
9 | IO interfaces for the [DIVA](https://www.iarpa.gov/index.php/research-programs/diva) project.
10 |
11 | ## Version History
12 |
13 | * 0.3
14 |   * Optimized random access and missing-frame fixing.
15 | * Robustness improvement.
16 | * Speed test.
17 | * 0.2 (Deprecated)
18 | * Real random access in video loader.
19 | * Add annotation converter.
20 | * Warning control option.
21 | * 0.1
22 | * Initial release of video loader.
23 |
24 | ## Installation
25 |
26 | ### Integration
27 |
28 | To use as a submodule in your git project, run
29 |
30 | ```sh
31 | git submodule add https://github.com/Lijun-Yu/diva_io.git
32 | ```
33 |
34 | ### Requirements
35 |
36 | Environment requirements are listed in [environment.yml](environment.yml).
37 | For the `av` package, I recommend you install it via `conda` by
38 |
39 | ```sh
40 | conda install av -c conda-forge
41 | ```
42 |
43 | as building from `pip` would require a lot of [dependencies](http://docs.mikeboers.com/pyav/7.0.0/overview/installation.html#dependencies).
44 |
45 | ## Video Loader
46 |
47 | A robust video loader that deals with missing frames in the [MEVA dataset](http://mevadata.org).
48 |
49 | This video loader is developed based on the [`PyAV`](https://github.com/mikeboers/PyAV) package.
50 | The [`pims`](https://github.com/soft-matter/pims) package was also a good reference despite its compatibility issue with current `PyAV`.
51 |
52 | For videos in MEVA, using `cv2.VideoCapture` results in wrong frame ids because it never counts the missing frames.
53 | If you are using MEVA, I suggest you switch to this video loader as soon as possible.
54 |
55 | ### Replace `cv2.VideoCapture`
56 |
57 | According to my tests, this video loader returns exactly the same frames as `cv2.VideoCapture` unless a missing frame or decoding error occurs.
58 | To replace the `cv2.VideoCapture` objects in legacy code, simply change from
59 |
60 | ```python
61 | import cv2
62 | cap = cv2.VideoCapture(video_path)
63 | ```
64 |
65 | to
66 |
67 | ```python
68 | from diva_io.video import VideoReader
69 | cap = VideoReader(video_path)
70 | ```
71 |
72 | `VideoReader.read` follows the schema of `cv2.VideoCapture.read` but automatically inserts the missing frames while reading the video.
73 |
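As a rough sketch (assuming `read()` mirrors OpenCV's `(ret, frame)` return, per the sentence above), a sequential read loop then looks like:

```python
ret, frame = cap.read()
while ret:
    # ... Do something with the frame (missing frames are filled in automatically)
    ret, frame = cap.read()
```
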
74 | ### Iterator Interface
75 |
76 | ```python
77 | video = VideoReader(video_path)
78 | for frame in video:
79 | # frame is a diva_io.video.frame.Frame object
80 | image = frame.numpy()
81 | # image is an uint8 array in a shape of (height, width, channel[BGR])
82 | # ... Do something with the image
83 | ```
84 |
85 | ### Random Access
86 |
87 | Random access of a frame requires decoding from the nearest key frame (approximately every 60 frames for MEVA).
88 | On average, this introduces a roughly constant overhead of 0.1 seconds, which is much faster than iterating from the beginning.
89 |
90 | ```python
91 | start_frame_id = 1500
92 | length = 100
93 | video.seek(start_frame_id)
94 | for frame in video.get_iter(length):
95 | image = frame.numpy()
96 | # ... Do something with the image
97 | ```
98 |
99 | ### Video Properties
100 |
101 | ```python
102 | video.width # cap.get(cv2.CAP_PROP_FRAME_WIDTH)
103 | video.height # cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
104 | video.fps # cap.get(cv2.CAP_PROP_FPS)
105 | video.length # cap.get(cv2.CAP_PROP_FRAME_COUNT)
106 | ```
107 |
108 | ### Other Interfaces
109 |
110 | For other usages, please see the comments in [video/reader.py](video/reader.py).
111 |
112 | ### Speed
113 |
114 | See [speed.md](docs/speed.md).
115 |
116 | ## Annotation
117 |
118 | An annotation loader and converter for Kitware YML format in [meva-data-repo](https://gitlab.kitware.com/meva/meva-data-repo).
119 |
120 | Clone the meva-data-repo and set
121 |
122 | ```python
123 | annotation_dir = 'path/to/meva-data-repo/annotation/DIVA-phase-2/MEVA/meva-annotations'
124 | ```
125 |
126 | ### Convert Annotation
127 |
128 | This converts annotations from the Kitware YML format to the ActEV Scorer JSON format.
129 | Run the following command in a shell outside the repo's directory:
130 |
131 | ```sh
132 | python -m diva_io.annotation.converter
133 | ```
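Per `annotation/converter.py`, the converter takes the annotation directory and an output directory as positional arguments, e.g. `python -m diva_io.annotation.converter path/to/meva-annotations path/to/output` (paths are placeholders).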
134 |
135 | ### Read Annotation
136 |
137 | ```python
138 | from diva_io.annotation import KitwareAnnotation
139 | video_name = '2018-03-11.11-15-04.11-20-04.school.G300'
140 | annotation = KitwareAnnotation(video_name, annotation_dir)
141 | # deal with annotation.raw_data
142 | ```
143 |
--------------------------------------------------------------------------------
/diva_io/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Lijun Yu'
2 |
--------------------------------------------------------------------------------
/diva_io/annotation/__init__.py:
--------------------------------------------------------------------------------
1 | from .kf1 import KitwareAnnotation
2 |
--------------------------------------------------------------------------------
/diva_io/annotation/converter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import os.path as osp
5 | from progressbar import progressbar
6 | from concurrent.futures import ProcessPoolExecutor
7 | from ..utils import get_logger
8 | from .kf1 import KitwareAnnotation
9 |
10 |
11 | def _get_video_list(annotation_dir):
12 | path = osp.join(annotation_dir, 'list-of-annotated-meva-clips.txt')
13 | with open(path) as f:
14 | video_list = [l.strip() for l in f][2:]
15 | return video_list
16 |
17 |
18 | def _worker(job):
19 | video_name, annotation_dir = job
20 | annotation = KitwareAnnotation(video_name, annotation_dir)
21 | return annotation.get_activities_official()
22 |
23 |
24 | def _get_official_format(video_list, annotation_dir):
25 | jobs = [(video_name, annotation_dir) for video_name in video_list]
26 | pool = ProcessPoolExecutor()
27 | activities = []
28 | for result in progressbar(pool.map(_worker, jobs)):
29 | activities.extend(result)
30 | reference = {'filesProcessed': video_list, 'activities': activities}
31 | file_index = {video_name: {'framerate': 30.0, 'selected': {0: 1, 9000: 0}}
32 | for video_name in video_list}
33 | return reference, file_index
34 |
35 |
36 | def _write_files(data_dict, output_dir):
37 | os.makedirs(output_dir, exist_ok=True)
38 | logger = get_logger(__name__)
39 | for filename, data in data_dict.items():
40 | path = osp.join(output_dir, filename + '.json')
41 | if osp.exists(path):
42 | logger.warning('Overwriting file %s', path)
43 | with open(path, 'w') as f:
44 | json.dump(data, f)
45 |
46 |
47 | def convert_annotation(annotation_dir, output_dir):
48 | video_list = _get_video_list(annotation_dir)
49 | reference, file_index = _get_official_format(video_list, annotation_dir)
50 | data_dict = {'reference': reference, 'file-index': file_index}
51 | _write_files(data_dict, output_dir)
52 |
53 |
54 | def main():
55 | parser = argparse.ArgumentParser(
56 | 'Annotation Converter for KF1, from Kitware YML format to '
57 | 'ActEV Scorer JSON format.')
58 | parser.add_argument('annotation_dir')
59 | parser.add_argument('output_dir')
60 | args = parser.parse_args()
61 | convert_annotation(args.annotation_dir, args.output_dir)
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
/diva_io/annotation/kf1.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os.path as osp
3 | from collections import defaultdict
4 |
5 |
6 | FIELDS = ['activities', 'geom', 'types']
7 |
8 |
9 | class KitwareAnnotation(object):
10 |
11 | def __init__(self, video_name: str, annotation_dir: str):
12 | # Please explore the structure of raw_data yourself
13 | self.video_name = video_name
14 | self.raw_data = self._load_raw_data(video_name, annotation_dir)
15 |
16 | def _split_meta(self, contents, key):
17 | meta = []
18 | i = 0
19 | while i < len(contents) and 'meta' in contents[i]:
20 | assert key not in contents[i]
21 | meta.append(contents[i]['meta'])
22 | i += 1
23 | data = [content[key] for content in contents[i:]]
24 | return meta, data
25 |
26 | def _load_file(self, video_name, annotation_dir, field):
27 | date, time_1, time_2 = video_name.split('.')[:3]
28 | for time in [time_1, time_2]:
29 | path = osp.join(annotation_dir, date, time[:2], '%s.%s.yml' % (
30 | video_name, field))
31 | if not osp.exists(path):
32 | continue
33 | with open(path) as f:
34 | contents = yaml.load(f, Loader=yaml.FullLoader)
35 | return contents
36 | path = osp.join(annotation_dir, date, time_1[:2], '%s.%s.yml' % (
37 | video_name, field))
38 | raise FileNotFoundError(path)
39 |
40 | def _load_raw_data(self, video_name, annotation_dir):
41 | raw_data = {'meta': {}}
42 | for field in FIELDS:
43 | contents = self._load_file(video_name, annotation_dir, field)
44 | key = field if field != 'activities' else 'act'
45 | raw_data['meta'][field], raw_data[field] = self._split_meta(
46 | contents, key)
47 | objs = defaultdict(dict)
48 | for obj in raw_data['geom']:
49 | obj['g0'] = [int(x) for x in obj['g0'].split()]
50 | objs[obj['id1']][obj['ts0']] = obj
51 | for obj in raw_data['types']:
52 | objs[obj['id1']]['type'] = [*obj['cset3'].keys()][0]
53 | for act in raw_data['activities']:
54 | for actor in act.get('actors', []):
55 | obj = objs[actor['id1']]
56 | geoms = []
57 | for ts in actor['timespan']:
58 | start, end = ts['tsr0']
59 | for time in range(start, end + 1):
60 | geoms.append(obj[time])
61 | actor['geoms'] = geoms
62 | actor['type'] = obj['type']
63 | return raw_data
64 |
65 | def get_activities_official(self):
66 | activities = []
67 | for act in self.raw_data['activities']:
68 | act_id = act['id2']
69 | act_type = [*act['act2'].keys()][0]
70 | if act_type.startswith('empty'):
71 | continue
72 | start, end = act['timespan'][0]['tsr0']
73 | objects = []
74 | for actor in act['actors']:
75 | actor_id = actor['id1']
76 | bbox_history = {}
77 | for geom in actor['geoms']:
78 | frame_id = geom['ts0']
79 | x1, y1, x2, y2 = geom['g0']
80 | bbox_history[frame_id] = {
81 | 'presenceConf': 1,
82 | 'boundingBox': {
83 | 'x': min(x1, x2), 'y': min(y1, y2),
84 | 'w': abs(x2 - x1), 'h': abs(y2 - y1)}}
85 | for frame_id in range(start, end + 1):
86 | if frame_id not in bbox_history:
87 | bbox_history[frame_id] = {}
88 | obj = {'objectType': 'Vehicle', 'objectID': actor_id,
89 | 'localization': {self.video_name: bbox_history}}
90 | objects.append(obj)
91 | activity = {
92 | 'activity': act_type, 'activityID': act_id,
93 | 'presenceConf': 1, 'alertFrame': start,
94 | 'localization': {self.video_name: {start: 1, end + 1: 0}},
95 | 'objects': objects}
96 | activities.append(activity)
97 | return activities
98 |
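For reference, a hedged sketch of converting a single video's annotation into the ActEV Scorer activity format (the video name follows the README example; the path is a placeholder):

```python
from diva_io.annotation import KitwareAnnotation

annotation_dir = 'path/to/meva-data-repo/annotation/DIVA-phase-2/MEVA/meva-annotations'
annotation = KitwareAnnotation('2018-03-11.11-15-04.11-20-04.school.G300', annotation_dir)
activities = annotation.get_activities_official()
print(len(activities))  # each entry is a dict with 'activity', 'activityID', 'objects', ...
```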
--------------------------------------------------------------------------------
/diva_io/docs/speed.md:
--------------------------------------------------------------------------------
1 | # Speed of diva_io.video.VideoReader
2 |
3 | Test performed by [video/speed_test.sh](../video/speed_test.sh).
4 |
5 | ```sh
6 | ./video/speed_test.sh
7 | ```
8 |
9 | ## Overall Performance
10 |
11 | Loading all frames of 7 videos from the [MEVA dataset](http://mevadata.org). Each video is 5-min long and 1080p at 30 fps.
12 |
13 | | | `diva_io.video. VideoReader (fix_missing=True)` | `diva_io.video. VideoReader (fix_missing=False)` | `moviepy.editor .VideoFileClip` | `cv2.VideoCapture` |
14 | |:---------------:|:----------------------------------------------:|:-----------------------------------------------:|:-------------------------------:|:------------------:|
15 | | User Time | 338.12s | 329.00s | 904.09s | 844.35s |
16 | | System Time | 0.80s | 0.60s | 317.14s | 6.44s |
17 | | CPU Utilization | 99% | 99% | 293% | 264% |
18 | | Total Time | 338.98s | 329.60s | 416.31s | 321.06s |
19 |
20 | ## Detailed Results
21 |
22 | | Video Name | Video Description | `diva_io.video .VideoReader (fix_missing=True)` | `diva_io.video .VideoReader (fix_missing=False)` | `moviepy.editor .VideoFileClip` | `cv2.VideoCapture` |
23 | |:----------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------:|:------------------------------------------------:|:-------------------------------:|:------------------:|
24 | | 2018-03-11.16-30-08.16-35-08.hospital.G436.avi | No missing | 0:45 | 0:44 | 1:00 | 0:26 |
25 | | 2018-03-07.16-55-06.17-00-06.school.G336.avi | Missing 104-109, 2294 | 0:55 | 0:53 | 0:59 | 0:26 |
26 | | 2018-03-11.11-25-01.11-30-01.school.G424.avi | Missing 7391-7499 | 0:38 | 0:37 | 0:58 | 0:26 |
27 | | 2018-03-11.16-25-00.16-30-00.school.G639.avi | Bidirectional frames, missing 1, 4 | 0:55 | 0:53 | 0:59 | 0:27 |
28 | | 2018-03-11.11-35-00.11-40-00.school.G299.avi | Packet id and frame id unsynchronized, missing 5789-5797 | 0:50 | 0:49 | 0:58 | 1:42 |
29 | | 2018-03-11.11-35-00.11-40-00.school.G330.avi | Packet id and frame id unsynchronized, missing 5755-5761 | 0:50 | 0:49 | 0:58 | 0:39 |
30 | | 2018-03-12.10-05-00.10-10-00.hospital.G436.avi | First packet fail | 0:41 | 0:41 | 0:59 | 1:11 |
31 |
--------------------------------------------------------------------------------
/diva_io/environment.yml:
--------------------------------------------------------------------------------
1 | name: diva_io
2 | channels:
3 | - pkgs/main
4 | - conda-forge
5 | dependencies:
6 | - python
7 | - numpy
8 | - av
9 |
--------------------------------------------------------------------------------
/diva_io/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .log import get_logger
2 |
--------------------------------------------------------------------------------
/diva_io/utils/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def get_logger(name, level=logging.INFO, log_file=None):
5 | logger = logging.getLogger(name)
6 | logger.setLevel(level)
7 | formatter = logging.Formatter(
8 | '%(asctime)s %(name)s %(levelname)s %(message)s')
9 | handlers = [logging.StreamHandler()]
10 | if log_file is not None:
11 | handlers.append(logging.FileHandler(log_file))
12 | for handler in handlers:
13 | handler.setLevel(level)
14 | handler.setFormatter(formatter)
15 | logger.addHandler(handler)
16 | return logger
17 |
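A hedged usage sketch (the log file name is a placeholder):

```python
from diva_io.utils import get_logger

logger = get_logger(__name__, log_file='diva_io.log')  # logs to console and file with the same format
logger.info('video loaded')
```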
--------------------------------------------------------------------------------
/diva_io/video/__init__.py:
--------------------------------------------------------------------------------
1 | from .reader import VideoReader
2 |
--------------------------------------------------------------------------------
/diva_io/video/frame.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from av import VideoFrame
3 |
4 |
5 | class Frame(object):
6 |
7 |     def __init__(self, frame, fix_missing_offset=0):
8 | """Frame wrapper of av.VideoFrame.
9 |
10 | Parameters
11 | ----------
12 | frame : av.VideoFrame
13 | VideoFrame object to be wrapped.
14 | fix_missing_offset : int, optional
15 | Frame id offset to fix a missing frame, by default 0
16 | """
17 | assert isinstance(frame, VideoFrame)
18 | self.frame = frame
19 | self.fix_missing_offset = fix_missing_offset
20 |
21 | @property
22 | def frame_id(self):
23 | """Frame id for external use, including fixing for a missing frame.
24 | """
25 | return self.frame_index_display + self.fix_missing_offset
26 |
27 | def image(self):
28 | """Get PIL Image for visualization in jupyter.
29 |
30 | Returns
31 | -------
32 | PIL.Image
33 | Image for visualization in jupyter.
34 | """
35 | return self.frame.to_image()
36 |
37 |     def numpy(self, format='bgr24', width=None,
38 |               height=None):
39 | """Get numpy array of the frame in the specified format.
40 |
41 | Parameters
42 | ----------
43 | format : str, optional
44 | Format parameter of av.VideoFrame.reformat(), by default 'bgr24'.
45 | width : int, optional
46 | Desired width of the frame, by default None
47 | height : int, optional
48 | Desired height of the frame, by default None
49 |
50 | Returns
51 | -------
52 | np.ndarray
53 | Numpy array of the frame.
54 | """
55 | return self.frame.to_ndarray(width=width, height=height, format=format)
56 |
57 | @property
58 | def frame_index_display(self):
59 | """The correct frame index for display, 0 based.
60 |
61 | Returns
62 | -------
63 | int
64 | Frame index for display.
65 | """
66 | return self.frame.pts - 1
67 |
68 | @property
69 | def frame_index_store(self):
70 | """The frame index as stored in the video, 0 based.
71 | If you used cv2.VideoCapture.read() to read a video sequentially, this
72 | is the index you would get.
73 |
74 | Returns
75 | -------
76 | int
77 | Frame index as stored.
78 | """
79 | return self.frame.index
80 |
81 | def __repr__(self):
82 | return '<%s contains %s>' % (
83 | repr(self.__class__)[8:-2], repr(self.frame))
84 |
85 | def __getattr__(self, name):
86 | return getattr(self.frame, name)
87 |
--------------------------------------------------------------------------------
/diva_io/video/speed_test.sh:
--------------------------------------------------------------------------------
1 | #! /bin/zsh
2 |
3 | video_dir=$1
4 | cd $(pwd)/$(dirname $0)/../..
5 |
6 | echo "diva_io.video.VideoReader(fix_missing=True)"
7 | time python -c "from diva_io.video.test import speed_test_divaio; speed_test_divaio(\"$video_dir\", True)"
8 |
9 | echo "diva_io.video.VideoReader(fix_missing=False)"
10 | time python -c "from diva_io.video.test import speed_test_divaio; speed_test_divaio(\"$video_dir\", False)"
11 |
12 | echo "moviepy.editor.VideoFileClip"
13 | time python -c "from diva_io.video.test import speed_test_moviepy; speed_test_moviepy(\"$video_dir\")"
14 |
15 | echo "cv2.VideoCapture"
16 | time python -c "from diva_io.video.test import speed_test_opencv; speed_test_opencv(\"$video_dir\")"
17 |
--------------------------------------------------------------------------------
/diva_io/video/test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import os.path as osp
4 | from progressbar import ProgressBar
5 | from .reader import VideoReader
6 |
7 | VIDEO_LIST = [
8 | '2018-03-11.16-30-08.16-35-08.hospital.G436.avi', # no missing
9 | '2018-03-07.16-55-06.17-00-06.school.G336.avi', # have missing
10 | '2018-03-11.11-25-01.11-30-01.school.G424.avi',
11 | '2018-03-11.16-25-00.16-30-00.school.G639.avi', # bidirectional
12 | '2018-03-11.11-35-00.11-40-00.school.G299.avi', # frame id misorder
13 | '2018-03-11.11-35-00.11-40-00.school.G330.avi',
14 | '2018-03-12.10-05-00.10-10-00.hospital.G436.avi' # first frame fail
15 | ]
16 |
17 |
18 | def integrity_test(video_list, video_dir, random_access_point=(5790, 100)):
19 | print('No fix missing')
20 | for video_name in video_list:
21 | print('\t', video_name, flush=True)
22 | bar = ProgressBar().start()
23 | v = VideoReader(video_name, video_dir, fix_missing=False)
24 | for i, f in bar(enumerate(v)):
25 | pass
26 |
27 | print('Fix missing with random access')
28 | start_frame_id, length = random_access_point
29 | for video_name in video_list:
30 | print('\t', video_name, flush=True)
31 | bar = ProgressBar().start()
32 | v = VideoReader(video_name, video_dir)
33 | for i, f in bar(enumerate(v)):
34 | assert f.frame_id == i
35 | bar = ProgressBar().start()
36 | v = VideoReader(video_name, video_dir)
37 | v.seek(start_frame_id)
38 | for i, frame in bar(enumerate(v.get_iter(length))):
39 | assert frame.frame_id == start_frame_id + i
40 |
41 |
42 | def speed_test_opencv(video_dir, video_list=VIDEO_LIST):
43 | import cv2
44 | for video_name in video_list:
45 | print('\t', video_name, flush=True)
46 | bar = ProgressBar().start()
47 | cap = cv2.VideoCapture(osp.join(video_dir, video_name))
48 | for _ in bar(range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))):
49 | cap.read()
50 |
51 |
52 | def speed_test_moviepy(video_dir, video_list=VIDEO_LIST):
53 | from moviepy.editor import VideoFileClip
54 | for video_name in video_list:
55 | print('\t', video_name, flush=True)
56 | bar = ProgressBar().start()
57 | clip = VideoFileClip(osp.join(video_dir, video_name))
58 | for i in bar(range(int(clip.duration * clip.fps))):
59 | clip.get_frame(i / clip.fps)
60 |
61 |
62 | def speed_test_divaio(video_dir, fix_missing, video_list=VIDEO_LIST):
63 | for video_name in video_list:
64 | print('\t', video_name, flush=True)
65 | bar = ProgressBar().start()
66 | video = VideoReader(video_name, video_dir, fix_missing=fix_missing)
67 | for _ in bar(range(video.length)):
68 | video.read()
69 |
70 |
71 | if __name__ == "__main__":
72 | video_dir = sys.argv[1]
73 | integrity_test(VIDEO_LIST, video_dir)
74 |
--------------------------------------------------------------------------------
/efficientdet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/efficientdet/__init__.py
--------------------------------------------------------------------------------
/efficientdet/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
--------------------------------------------------------------------------------
/efficientdet/backbone/backbone_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Backbone network factory."""
16 |
17 | from . import efficientnet_builder
18 | from . import efficientnet_lite_builder
19 |
20 |
21 | def get_model_builder(model_name):
22 | """Get the model_builder module for a given model name."""
23 | if model_name.startswith('efficientnet-lite'):
24 | return efficientnet_lite_builder
25 | elif model_name.startswith('efficientnet-b'):
26 | return efficientnet_builder
27 | else:
28 | raise ValueError('Unknown model name {}'.format(model_name))
29 |
--------------------------------------------------------------------------------
/efficientdet/backbone/efficientnet_builder_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tests for efficientnet_builder."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import numpy as np
22 | import tensorflow.compat.v1 as tf
23 |
24 | from backbone import efficientnet_builder
25 |
26 |
27 | class EfficientnetBuilderTest(tf.test.TestCase):
28 |
29 | def _test_model_params(self,
30 | model_name,
31 | input_size,
32 | expected_params,
33 | override_params=None,
34 | features_only=False,
35 | pooled_features_only=False):
36 | images = tf.zeros((1, input_size, input_size, 3), dtype=tf.float32)
37 | efficientnet_builder.build_model(
38 | images,
39 | model_name=model_name,
40 | override_params=override_params,
41 | training=True,
42 | features_only=features_only,
43 | pooled_features_only=pooled_features_only)
44 | num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
45 | self.assertEqual(num_params, expected_params)
46 |
47 | def test_efficientnet_b0(self):
48 | self._test_model_params('efficientnet-b0', 224, expected_params=5288548)
49 |
50 | def test_efficientnet_b1(self):
51 | self._test_model_params('efficientnet-b1', 240, expected_params=7794184)
52 |
53 | def test_efficientnet_b2(self):
54 | self._test_model_params('efficientnet-b2', 260, expected_params=9109994)
55 |
56 | def test_efficientnet_b3(self):
57 | self._test_model_params('efficientnet-b3', 300, expected_params=12233232)
58 |
59 | def test_efficientnet_b4(self):
60 | self._test_model_params('efficientnet-b4', 380, expected_params=19341616)
61 |
62 | def test_efficientnet_b5(self):
63 | self._test_model_params('efficientnet-b5', 456, expected_params=30389784)
64 |
65 | def test_efficientnet_b6(self):
66 | self._test_model_params('efficientnet-b6', 528, expected_params=43040704)
67 |
68 | def test_efficientnet_b7(self):
69 | self._test_model_params('efficientnet-b7', 600, expected_params=66347960)
70 |
71 | def test_efficientnet_b0_with_customized_num_classes(self):
72 | self._test_model_params(
73 | 'efficientnet-b0',
74 | 224,
75 | expected_params=4135648,
76 | override_params={'num_classes': 100})
77 |
78 | def test_efficientnet_b0_with_features_only(self):
79 | self._test_model_params(
80 | 'efficientnet-b0', 224, features_only=True, expected_params=3595388)
81 |
82 | def test_efficientnet_b0_with_pooled_features_only(self):
83 | self._test_model_params(
84 | 'efficientnet-b0',
85 | 224,
86 | pooled_features_only=True,
87 | expected_params=4007548)
88 |
89 | def test_efficientnet_b0_fails_if_both_features_requested(self):
90 | with self.assertRaises(AssertionError):
91 | efficientnet_builder.build_model(
92 | None,
93 | model_name='efficientnet-b0',
94 | training=True,
95 | features_only=True,
96 | pooled_features_only=True)
97 |
98 | def test_efficientnet_b0_base(self):
99 | # Creates a base model using the model configuration.
100 | images = tf.zeros((1, 224, 224, 3), dtype=tf.float32)
101 | _, endpoints = efficientnet_builder.build_model_base(
102 | images, model_name='efficientnet-b0', training=True)
103 |
104 | # reduction_1 to reduction_5 should be in endpoints
105 | self.assertIn('reduction_1', endpoints)
106 | self.assertIn('reduction_5', endpoints)
107 | # reduction_5 should be the last one: no reduction_6.
108 | self.assertNotIn('reduction_6', endpoints)
109 |
110 |
111 | if __name__ == '__main__':
112 |   # Disable eager so that tf.profile works for #params/#flops.
113 | tf.disable_eager_execution()
114 | tf.test.main()
115 |
--------------------------------------------------------------------------------
/efficientdet/backbone/efficientnet_lite_builder_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tests for efficientnet_lite_builder."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import numpy as np
22 | import tensorflow.compat.v1 as tf
23 |
24 | from backbone import efficientnet_lite_builder
25 |
26 |
27 | class EfficientnetBuilderTest(tf.test.TestCase):
28 |
29 | def _test_model_params(self,
30 | model_name,
31 | input_size,
32 | expected_params,
33 | override_params=None,
34 | features_only=False,
35 | pooled_features_only=False):
36 | images = tf.zeros((1, input_size, input_size, 3), dtype=tf.float32)
37 | efficientnet_lite_builder.build_model(
38 | images,
39 | model_name=model_name,
40 | override_params=override_params,
41 | training=True,
42 | features_only=features_only,
43 | pooled_features_only=pooled_features_only)
44 | num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
45 |
46 | self.assertEqual(num_params, expected_params)
47 |
48 | def test_efficientnet_b0(self):
49 | self._test_model_params(
50 | 'efficientnet-lite0', 224, expected_params=4652008)
51 |
52 | def test_efficientnet_b1(self):
53 | self._test_model_params(
54 | 'efficientnet-lite1', 240, expected_params=5416680)
55 |
56 | def test_efficientnet_b2(self):
57 | self._test_model_params(
58 | 'efficientnet-lite2', 260, expected_params=6092072)
59 |
60 | def test_efficientnet_b3(self):
61 | self._test_model_params(
62 | 'efficientnet-lite3', 280, expected_params=8197096)
63 |
64 | def test_efficientnet_b4(self):
65 | self._test_model_params(
66 | 'efficientnet-lite4', 300, expected_params=13006568)
67 |
68 |
69 | if __name__ == '__main__':
70 |   # Disable eager so that tf.profile works for #params/#flops.
71 | tf.disable_eager_execution()
72 | tf.test.main()
73 |
--------------------------------------------------------------------------------
/efficientdet/object_detection/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | # Object detection data loaders and libraries are mostly based on RetinaNet:
16 | # https://github.com/tensorflow/tpu/tree/master/models/official/retinanet
17 |
--------------------------------------------------------------------------------
/efficientdet/object_detection/box_coder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Base box coder.
16 |
17 | Box coders convert between coordinate frames, namely image-centric
18 | (with (0,0) on the top left of image) and anchor-centric (with (0,0) being
19 | defined by a specific anchor).
20 |
21 | Users of a BoxCoder can call two methods:
22 | encode: which encodes a box with respect to a given anchor
23 | (or rather, a tensor of boxes wrt a corresponding tensor of anchors) and
24 | decode: which inverts this encoding with a decode operation.
25 | In both cases, the arguments are assumed to be in 1-1 correspondence already;
26 | it is not the job of a BoxCoder to perform matching.
27 | """
28 | from abc import ABCMeta
29 | from abc import abstractmethod
30 | from abc import abstractproperty
31 |
32 | import tensorflow.compat.v1 as tf
33 |
34 |
35 | # Box coder types.
36 | FASTER_RCNN = 'faster_rcnn'
37 | KEYPOINT = 'keypoint'
38 | MEAN_STDDEV = 'mean_stddev'
39 | SQUARE = 'square'
40 |
41 |
42 | class BoxCoder(object):
43 | """Abstract base class for box coder."""
44 | __metaclass__ = ABCMeta
45 |
46 | @abstractproperty
47 | def code_size(self):
48 | """Return the size of each code.
49 |
50 | This number is a constant and should agree with the output of the `encode`
51 | op (e.g. if rel_codes is the output of self.encode(...), then it should have
52 | shape [N, code_size()]). This abstractproperty should be overridden by
53 | implementations.
54 |
55 | Returns:
56 | an integer constant
57 | """
58 | pass
59 |
60 | def encode(self, boxes, anchors):
61 | """Encode a box list relative to an anchor collection.
62 |
63 | Args:
64 | boxes: BoxList holding N boxes to be encoded
65 | anchors: BoxList of N anchors
66 |
67 | Returns:
68 | a tensor representing N relative-encoded boxes
69 | """
70 | with tf.name_scope('Encode'):
71 | return self._encode(boxes, anchors)
72 |
73 | def decode(self, rel_codes, anchors):
74 | """Decode boxes that are encoded relative to an anchor collection.
75 |
76 | Args:
77 | rel_codes: a tensor representing N relative-encoded boxes
78 | anchors: BoxList of anchors
79 |
80 | Returns:
81 | boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
82 | with corners y_min, x_min, y_max, x_max)
83 | """
84 | with tf.name_scope('Decode'):
85 | return self._decode(rel_codes, anchors)
86 |
87 | @abstractmethod
88 | def _encode(self, boxes, anchors):
89 | """Method to be overridden by implementations.
90 |
91 | Args:
92 | boxes: BoxList holding N boxes to be encoded
93 | anchors: BoxList of N anchors
94 |
95 | Returns:
96 | a tensor representing N relative-encoded boxes
97 | """
98 | pass
99 |
100 | @abstractmethod
101 | def _decode(self, rel_codes, anchors):
102 | """Method to be overridden by implementations.
103 |
104 | Args:
105 | rel_codes: a tensor representing N relative-encoded boxes
106 | anchors: BoxList of anchors
107 |
108 | Returns:
109 | boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
110 | with corners y_min, x_min, y_max, x_max)
111 | """
112 | pass
113 |
114 |
115 | def batch_decode(encoded_boxes, box_coder, anchors):
116 | """Decode a batch of encoded boxes.
117 |
118 | This op takes a batch of encoded bounding boxes and transforms
119 | them to a batch of bounding boxes specified by their corners in
120 | the order of [y_min, x_min, y_max, x_max].
121 |
122 | Args:
123 | encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
124 | code_size] representing the location of the objects.
125 | box_coder: a BoxCoder object.
126 | anchors: a BoxList of anchors used to encode `encoded_boxes`.
127 |
128 | Returns:
129 | decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
130 | coder_size] representing the corners of the objects in the order
131 | of [y_min, x_min, y_max, x_max].
132 |
133 | Raises:
134 | ValueError: if batch sizes of the inputs are inconsistent, or if
135 | the number of anchors inferred from encoded_boxes and anchors are
136 | inconsistent.
137 | """
138 | encoded_boxes.get_shape().assert_has_rank(3)
139 | if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static():
140 | raise ValueError('The number of anchors inferred from encoded_boxes'
141 | ' and anchors are inconsistent: shape[1] of encoded_boxes'
142 | ' %s should be equal to the number of anchors: %s.' %
143 | (encoded_boxes.get_shape()[1].value,
144 | anchors.num_boxes_static()))
145 |
146 | decoded_boxes = tf.stack([
147 | box_coder.decode(boxes, anchors).get()
148 | for boxes in tf.unstack(encoded_boxes)
149 | ])
150 | return decoded_boxes
151 |
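A minimal usage sketch of the BoxCoder contract above, under the TF 1.x graph-mode setup this repo targets and assuming the repo root is on PYTHONPATH. `IdentityBoxCoder` is a hypothetical toy subclass, written here only to illustrate the encode/decode interface; it is not part of the repo.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import box_coder, box_list

    class IdentityBoxCoder(box_coder.BoxCoder):
      """Hypothetical toy coder: the codes are the box corners themselves."""

      @property
      def code_size(self):
        return 4

      def _encode(self, boxes, anchors):
        return boxes.get()                  # [N, 4] corners used directly as codes

      def _decode(self, rel_codes, anchors):
        return box_list.BoxList(rel_codes)

    anchors = box_list.BoxList(tf.constant([[0., 0., 1., 1.]], tf.float32))
    boxes = box_list.BoxList(tf.constant([[0., 0., .5, .5]], tf.float32))
    coder = IdentityBoxCoder()
    codes = coder.encode(boxes, anchors)    # runs _encode under the 'Encode' name scope
    decoded = coder.decode(codes, anchors)  # BoxList holding the same corners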
--------------------------------------------------------------------------------
/efficientdet/object_detection/box_list.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Bounding Box List definition.
16 |
17 | BoxList represents a list of bounding boxes as tensorflow
18 | tensors, where each bounding box is represented as a row of 4 numbers,
19 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes
20 | within a given list correspond to a single image. See also
21 | box_list_ops.py for common box related operations (such as area, iou, etc).
22 |
23 | Optionally, users can add additional related fields (such as weights).
24 | We assume the following things to be true about fields:
25 | * they correspond to boxes in the box_list along the 0th dimension
26 | * they have inferable rank at graph construction time
27 | * all dimensions except for possibly the 0th can be inferred
28 | (i.e., not None) at graph construction time.
29 |
30 | Some other notes:
31 | * Following tensorflow conventions, we use height, width ordering,
32 | and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering
33 | * Tensors are always provided as (flat) [N, 4] tensors.
34 | """
35 |
36 | import tensorflow.compat.v1 as tf
37 |
38 |
39 | class BoxList(object):
40 | """Box collection."""
41 |
42 | def __init__(self, boxes):
43 | """Constructs box collection.
44 |
45 | Args:
46 | boxes: a tensor of shape [N, 4] representing box corners
47 |
48 | Raises:
49 | ValueError: if invalid dimensions for bbox data or if bbox data is not in
50 | float32 format.
51 | """
52 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
53 | raise ValueError('Invalid dimensions for box data.')
54 | if boxes.dtype != tf.float32:
55 | raise ValueError('Invalid tensor type: should be tf.float32')
56 | self.data = {'boxes': boxes}
57 |
58 | def num_boxes(self):
59 | """Returns number of boxes held in collection.
60 |
61 | Returns:
62 | a tensor representing the number of boxes held in the collection.
63 | """
64 | return tf.shape(self.data['boxes'])[0]
65 |
66 | def num_boxes_static(self):
67 | """Returns number of boxes held in collection.
68 |
69 | This number is inferred at graph construction time rather than run-time.
70 |
71 | Returns:
72 | Number of boxes held in collection (integer) or None if this is not
73 | inferable at graph construction time.
74 | """
75 | return self.data['boxes'].get_shape().as_list()[0]
76 |
77 | def get_all_fields(self):
78 | """Returns all fields."""
79 | return self.data.keys()
80 |
81 | def get_extra_fields(self):
82 | """Returns all non-box fields (i.e., everything not named 'boxes')."""
83 | return [k for k in self.data.keys() if k != 'boxes']
84 |
85 | def add_field(self, field, field_data):
86 | """Add field to box list.
87 |
88 | This method can be used to add related box data such as
89 | weights/labels, etc.
90 |
91 | Args:
92 | field: a string key to access the data via `get`
93 | field_data: a tensor containing the data to store in the BoxList
94 | """
95 | self.data[field] = field_data
96 |
97 | def has_field(self, field):
98 | return field in self.data
99 |
100 | def get(self):
101 | """Convenience function for accessing box coordinates.
102 |
103 | Returns:
104 | a tensor with shape [N, 4] representing box coordinates.
105 | """
106 | return self.get_field('boxes')
107 |
108 | def set(self, boxes):
109 | """Convenience function for setting box coordinates.
110 |
111 | Args:
112 | boxes: a tensor of shape [N, 4] representing box corners
113 |
114 | Raises:
115 | ValueError: if invalid dimensions for bbox data
116 | """
117 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
118 | raise ValueError('Invalid dimensions for box data.')
119 | self.data['boxes'] = boxes
120 |
121 | def get_field(self, field):
122 | """Accesses a box collection and associated fields.
123 |
124 |     This function returns the tensor stored under the specified field of the
125 |     box collection.
126 |
127 | Args:
128 |       field: a string parameter specifying
129 |         the field to be accessed.
130 |
131 | Returns:
132 | a tensor representing the box collection or an associated field.
133 |
134 | Raises:
135 | ValueError: if invalid field
136 | """
137 | if not self.has_field(field):
138 | raise ValueError('field ' + str(field) + ' does not exist')
139 | return self.data[field]
140 |
141 | def set_field(self, field, value):
142 | """Sets the value of a field.
143 |
144 | Updates the field of a box_list with a given value.
145 |
146 | Args:
147 | field: (string) name of the field to set value.
148 | value: the value to assign to the field.
149 |
150 | Raises:
151 | ValueError: if the box_list does not have specified field.
152 | """
153 | if not self.has_field(field):
154 | raise ValueError('field %s does not exist' % field)
155 | self.data[field] = value
156 |
157 | def get_center_coordinates_and_sizes(self, scope=None):
158 | """Computes the center coordinates, height and width of the boxes.
159 |
160 | Args:
161 | scope: name scope of the function.
162 |
163 | Returns:
164 | a list of 4 1-D tensors [ycenter, xcenter, height, width].
165 | """
166 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'):
167 | box_corners = self.get()
168 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners))
169 | width = xmax - xmin
170 | height = ymax - ymin
171 | ycenter = ymin + height / 2.
172 | xcenter = xmin + width / 2.
173 | return [ycenter, xcenter, height, width]
174 |
175 | def transpose_coordinates(self, scope=None):
176 | """Transpose the coordinate representation in a boxlist.
177 |
178 | Args:
179 | scope: name scope of the function.
180 | """
181 | with tf.name_scope(scope, 'transpose_coordinates'):
182 | y_min, x_min, y_max, x_max = tf.split(
183 | value=self.get(), num_or_size_splits=4, axis=1)
184 | self.set(tf.concat([x_min, y_min, x_max, y_max], 1))
185 |
186 | def as_tensor_dict(self, fields=None):
187 | """Retrieves specified fields as a dictionary of tensors.
188 |
189 | Args:
190 | fields: (optional) list of fields to return in the dictionary.
191 | If None (default), all fields are returned.
192 |
193 | Returns:
194 | tensor_dict: A dictionary of tensors specified by fields.
195 |
196 | Raises:
197 | ValueError: if specified field is not contained in boxlist.
198 | """
199 | tensor_dict = {}
200 | if fields is None:
201 | fields = self.get_all_fields()
202 | for field in fields:
203 | if not self.has_field(field):
204 | raise ValueError('boxlist must contain all specified fields')
205 | tensor_dict[field] = self.get_field(field)
206 | return tensor_dict
207 |
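A short usage sketch of the BoxList API above; the box values are made up and the import assumes the repo root is on PYTHONPATH.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import box_list

    boxes = tf.constant([[0., 0., 10., 20.],
                         [5., 5., 15., 25.]], dtype=tf.float32)  # [N, 4] ymin, xmin, ymax, xmax
    blist = box_list.BoxList(boxes)
    blist.add_field('scores', tf.constant([0.9, 0.3], tf.float32))

    print(blist.num_boxes_static())       # 2, known at graph-construction time
    ycenter, xcenter, h, w = blist.get_center_coordinates_and_sizes()
    scores = blist.get_field('scores')    # raises ValueError for unknown fields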
--------------------------------------------------------------------------------
/efficientdet/object_detection/faster_rcnn_box_coder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Faster RCNN box coder.
16 |
17 | Faster RCNN box coder follows the coding schema described below:
18 | ty = (y - ya) / ha
19 | tx = (x - xa) / wa
20 | th = log(h / ha)
21 | tw = log(w / wa)
22 | where x, y, w, h denote the box's center coordinates, width and height
23 | respectively. Similarly, xa, ya, wa, ha denote the anchor's center
24 | coordinates, width and height. tx, ty, tw and th denote the anchor-encoded
25 | center, width and height respectively.
26 |
27 | See http://arxiv.org/abs/1506.01497 for details.
28 | """
29 |
30 | import tensorflow.compat.v1 as tf
31 |
32 | from . import box_coder
33 | from . import box_list
34 |
35 | EPSILON = 1e-8
36 |
37 |
38 | class FasterRcnnBoxCoder(box_coder.BoxCoder):
39 | """Faster RCNN box coder."""
40 |
41 | def __init__(self, scale_factors=None):
42 | """Constructor for FasterRcnnBoxCoder.
43 |
44 | Args:
45 | scale_factors: List of 4 positive scalars to scale ty, tx, th and tw.
46 | If set to None, does not perform scaling. For Faster RCNN,
47 | the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0].
48 | """
49 | if scale_factors:
50 | assert len(scale_factors) == 4
51 | for scalar in scale_factors:
52 | assert scalar > 0
53 | self._scale_factors = scale_factors
54 |
55 | @property
56 | def code_size(self):
57 | return 4
58 |
59 | def _encode(self, boxes, anchors):
60 | """Encode a box collection with respect to anchor collection.
61 |
62 | Args:
63 | boxes: BoxList holding N boxes to be encoded.
64 | anchors: BoxList of anchors.
65 |
66 | Returns:
67 | a tensor representing N anchor-encoded boxes of the format
68 | [ty, tx, th, tw].
69 | """
70 | # Convert anchors to the center coordinate representation.
71 | ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
72 | ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes()
73 | # Avoid NaN in division and log below.
74 | ha += EPSILON
75 | wa += EPSILON
76 | h += EPSILON
77 | w += EPSILON
78 |
79 | tx = (xcenter - xcenter_a) / wa
80 | ty = (ycenter - ycenter_a) / ha
81 | tw = tf.log(w / wa)
82 | th = tf.log(h / ha)
83 | # Scales location targets as used in paper for joint training.
84 | if self._scale_factors:
85 | ty *= self._scale_factors[0]
86 | tx *= self._scale_factors[1]
87 | th *= self._scale_factors[2]
88 | tw *= self._scale_factors[3]
89 | return tf.transpose(tf.stack([ty, tx, th, tw]))
90 |
91 | def _decode(self, rel_codes, anchors):
92 | """Decode relative codes to boxes.
93 |
94 | Args:
95 | rel_codes: a tensor representing N anchor-encoded boxes.
96 | anchors: BoxList of anchors.
97 |
98 | Returns:
99 | boxes: BoxList holding N bounding boxes.
100 | """
101 | ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
102 |
103 | ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))
104 | if self._scale_factors:
105 | ty /= self._scale_factors[0]
106 | tx /= self._scale_factors[1]
107 | th /= self._scale_factors[2]
108 | tw /= self._scale_factors[3]
109 | w = tf.exp(tw) * wa
110 | h = tf.exp(th) * ha
111 | ycenter = ty * ha + ycenter_a
112 | xcenter = tx * wa + xcenter_a
113 | ymin = ycenter - h / 2.
114 | xmin = xcenter - w / 2.
115 | ymax = ycenter + h / 2.
116 | xmax = xcenter + w / 2.
117 | return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax])))
118 |
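A hedged round-trip sketch for FasterRcnnBoxCoder above; the boxes, anchor and scale factors are illustrative only, and the imports assume the repo root is on PYTHONPATH.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import box_list, faster_rcnn_box_coder

    anchors = box_list.BoxList(tf.constant([[0., 0., 10., 10.]], tf.float32))
    boxes = box_list.BoxList(tf.constant([[1., 1., 9., 9.]], tf.float32))
    coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(scale_factors=[10., 10., 5., 5.])

    rel = coder.encode(boxes, anchors)    # [1, 4] regression targets [ty, tx, th, tw]
    back = coder.decode(rel, anchors)     # BoxList whose corners are ~[1., 1., 9., 9.]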
--------------------------------------------------------------------------------
/efficientdet/object_detection/region_similarity_calculator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Region Similarity Calculators for BoxLists.
16 |
17 | Region Similarity Calculators compare a pairwise measure of similarity
18 | between the boxes in two BoxLists.
19 | """
20 | from abc import ABCMeta
21 | from abc import abstractmethod
22 |
23 | import tensorflow.compat.v1 as tf
24 |
25 |
26 | def area(boxlist, scope=None):
27 | """Computes area of boxes.
28 |
29 | Args:
30 | boxlist: BoxList holding N boxes
31 | scope: name scope.
32 |
33 | Returns:
34 | a tensor with shape [N] representing box areas.
35 | """
36 | with tf.name_scope(scope, 'Area'):
37 | y_min, x_min, y_max, x_max = tf.split(
38 | value=boxlist.get(), num_or_size_splits=4, axis=1)
39 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
40 |
41 |
42 | def intersection(boxlist1, boxlist2, scope=None):
43 | """Compute pairwise intersection areas between boxes.
44 |
45 | Args:
46 | boxlist1: BoxList holding N boxes
47 | boxlist2: BoxList holding M boxes
48 | scope: name scope.
49 |
50 | Returns:
51 | a tensor with shape [N, M] representing pairwise intersections
52 | """
53 | with tf.name_scope(scope, 'Intersection'):
54 | y_min1, x_min1, y_max1, x_max1 = tf.split(
55 | value=boxlist1.get(), num_or_size_splits=4, axis=1)
56 | y_min2, x_min2, y_max2, x_max2 = tf.split(
57 | value=boxlist2.get(), num_or_size_splits=4, axis=1)
58 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
59 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
60 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
61 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
62 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
63 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
64 | return intersect_heights * intersect_widths
65 |
66 |
67 | def iou(boxlist1, boxlist2, scope=None):
68 | """Computes pairwise intersection-over-union between box collections.
69 |
70 | Args:
71 | boxlist1: BoxList holding N boxes
72 | boxlist2: BoxList holding M boxes
73 | scope: name scope.
74 |
75 | Returns:
76 | a tensor with shape [N, M] representing pairwise iou scores.
77 | """
78 | with tf.name_scope(scope, 'IOU'):
79 | intersections = intersection(boxlist1, boxlist2)
80 | areas1 = area(boxlist1)
81 | areas2 = area(boxlist2)
82 | unions = (
83 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
84 | return tf.where(
85 | tf.equal(intersections, 0.0),
86 | tf.zeros_like(intersections), tf.truediv(intersections, unions))
87 |
88 |
89 | class RegionSimilarityCalculator(object):
90 | """Abstract base class for region similarity calculator."""
91 | __metaclass__ = ABCMeta
92 |
93 | def compare(self, boxlist1, boxlist2, scope=None):
94 | """Computes matrix of pairwise similarity between BoxLists.
95 |
96 | This op (to be overridden) computes a measure of pairwise similarity between
97 | the boxes in the given BoxLists. Higher values indicate more similarity.
98 |
99 | Note that this method simply measures similarity and does not explicitly
100 | perform a matching.
101 |
102 | Args:
103 | boxlist1: BoxList holding N boxes.
104 | boxlist2: BoxList holding M boxes.
105 | scope: Op scope name. Defaults to 'Compare' if None.
106 |
107 | Returns:
108 | a (float32) tensor of shape [N, M] with pairwise similarity score.
109 | """
110 | with tf.name_scope(scope, 'Compare', [boxlist1, boxlist2]) as scope:
111 | return self._compare(boxlist1, boxlist2)
112 |
113 | @abstractmethod
114 | def _compare(self, boxlist1, boxlist2):
115 | pass
116 |
117 |
118 | class IouSimilarity(RegionSimilarityCalculator):
119 | """Class to compute similarity based on Intersection over Union (IOU) metric.
120 |
121 | This class computes pairwise similarity between two BoxLists based on IOU.
122 | """
123 |
124 | def _compare(self, boxlist1, boxlist2):
125 | """Compute pairwise IOU similarity between the two BoxLists.
126 |
127 | Args:
128 | boxlist1: BoxList holding N boxes.
129 | boxlist2: BoxList holding M boxes.
130 |
131 | Returns:
132 | A tensor with shape [N, M] representing pairwise iou scores.
133 | """
134 | return iou(boxlist1, boxlist2)
135 |
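A small sketch of the IoU helpers above on two hand-made BoxLists, under the TF 1.x graph-mode setup this repo targets.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import box_list, region_similarity_calculator

    a = box_list.BoxList(tf.constant([[0., 0., 2., 2.]], tf.float32))
    b = box_list.BoxList(tf.constant([[1., 1., 3., 3.],
                                      [4., 4., 5., 5.]], tf.float32))
    iou_matrix = region_similarity_calculator.IouSimilarity().compare(a, b)
    # shape [1, 2]: first entry is 1/7 (intersection 1, union 4 + 4 - 1), second is 0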
--------------------------------------------------------------------------------
/efficientdet/object_detection/shape_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Utils used to manipulate tensor shapes."""
16 |
17 | import tensorflow.compat.v1 as tf
18 |
19 |
20 | def assert_shape_equal(shape_a, shape_b):
21 | """Asserts that shape_a and shape_b are equal.
22 |
23 | If the shapes are static, raises a ValueError when the shapes
24 | mismatch.
25 |
26 | If the shapes are dynamic, raises a tf InvalidArgumentError when the shapes
27 | mismatch.
28 |
29 | Args:
30 | shape_a: a list containing shape of the first tensor.
31 | shape_b: a list containing shape of the second tensor.
32 |
33 | Returns:
34 | Either a tf.no_op() when shapes are all static and a tf.assert_equal() op
35 | when the shapes are dynamic.
36 |
37 | Raises:
38 | ValueError: When shapes are both static and unequal.
39 | """
40 | if (all(isinstance(dim, int) for dim in shape_a) and
41 | all(isinstance(dim, int) for dim in shape_b)):
42 | if shape_a != shape_b:
43 | raise ValueError('Unequal shapes {}, {}'.format(shape_a, shape_b))
44 | else: return tf.no_op()
45 | else:
46 | return tf.assert_equal(shape_a, shape_b)
47 |
48 |
49 | def combined_static_and_dynamic_shape(tensor):
50 | """Returns a list containing static and dynamic values for the dimensions.
51 |
52 | Returns a list of static and dynamic values for shape dimensions. This is
53 | useful to preserve static shapes when available in reshape operation.
54 |
55 | Args:
56 | tensor: A tensor of any type.
57 |
58 | Returns:
59 | A list of size tensor.shape.ndims containing integers or a scalar tensor.
60 | """
61 | static_tensor_shape = tensor.shape.as_list()
62 | dynamic_tensor_shape = tf.shape(tensor)
63 | combined_shape = []
64 | for index, dim in enumerate(static_tensor_shape):
65 | if dim is not None:
66 | combined_shape.append(dim)
67 | else:
68 | combined_shape.append(dynamic_tensor_shape[index])
69 | return combined_shape
70 |
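A sketch of the typical use of combined_static_and_dynamic_shape above, assuming TF1-style graph mode (tf.placeholder): it lets a reshape keep the statically known dimensions while the batch dimension stays dynamic.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import shape_utils

    x = tf.placeholder(tf.float32, [None, 7, 7, 64])   # batch size unknown statically
    shape = shape_utils.combined_static_and_dynamic_shape(x)
    # shape == [<scalar int32 tensor>, 7, 7, 64]
    flat = tf.reshape(x, [shape[0], shape[1] * shape[2] * shape[3]])  # static 3136 preserved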
--------------------------------------------------------------------------------
/efficientdet/object_detection/tf_example_decoder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Research. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tensorflow Example proto decoder for object detection.
16 |
17 | A decoder to decode string tensors containing serialized tensorflow.Example
18 | protos for object detection.
19 | """
20 |
21 | import tensorflow.compat.v1 as tf
22 |
23 |
24 | def _get_source_id_from_encoded_image(parsed_tensors):
25 | return tf.strings.as_string(
26 | tf.strings.to_hash_bucket_fast(parsed_tensors['image/encoded'],
27 | 2**63 - 1))
28 |
29 |
30 | class TfExampleDecoder(object):
31 | """Tensorflow Example proto decoder."""
32 |
33 | def __init__(self, include_mask=False, regenerate_source_id=False):
34 | self._include_mask = include_mask
35 | self._regenerate_source_id = regenerate_source_id
36 | self._keys_to_features = {
37 | 'image/encoded': tf.FixedLenFeature((), tf.string),
38 | 'image/source_id': tf.FixedLenFeature((), tf.string, ''),
39 | 'image/height': tf.FixedLenFeature((), tf.int64, -1),
40 | 'image/width': tf.FixedLenFeature((), tf.int64, -1),
41 | 'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
42 | 'image/object/bbox/xmax': tf.VarLenFeature(tf.float32),
43 | 'image/object/bbox/ymin': tf.VarLenFeature(tf.float32),
44 | 'image/object/bbox/ymax': tf.VarLenFeature(tf.float32),
45 | 'image/object/class/label': tf.VarLenFeature(tf.int64),
46 | 'image/object/area': tf.VarLenFeature(tf.float32),
47 | 'image/object/is_crowd': tf.VarLenFeature(tf.int64),
48 | }
49 | if include_mask:
50 | self._keys_to_features.update({
51 | 'image/object/mask':
52 | tf.VarLenFeature(tf.string),
53 | })
54 |
55 | def _decode_image(self, parsed_tensors):
56 | """Decodes the image and set its static shape."""
57 | image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
58 | image.set_shape([None, None, 3])
59 | return image
60 |
61 | def _decode_boxes(self, parsed_tensors):
62 | """Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
63 | xmin = parsed_tensors['image/object/bbox/xmin']
64 | xmax = parsed_tensors['image/object/bbox/xmax']
65 | ymin = parsed_tensors['image/object/bbox/ymin']
66 | ymax = parsed_tensors['image/object/bbox/ymax']
67 | return tf.stack([ymin, xmin, ymax, xmax], axis=-1)
68 |
69 | def _decode_masks(self, parsed_tensors):
70 | """Decode a set of PNG masks to the tf.float32 tensors."""
71 | def _decode_png_mask(png_bytes):
72 | mask = tf.squeeze(
73 | tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
74 | mask = tf.cast(mask, dtype=tf.float32)
75 | mask.set_shape([None, None])
76 | return mask
77 |
78 | height = parsed_tensors['image/height']
79 | width = parsed_tensors['image/width']
80 | masks = parsed_tensors['image/object/mask']
81 | return tf.cond(
82 | tf.greater(tf.size(masks), 0),
83 | lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
84 | lambda: tf.zeros([0, height, width], dtype=tf.float32))
85 |
86 | def _decode_areas(self, parsed_tensors):
87 | xmin = parsed_tensors['image/object/bbox/xmin']
88 | xmax = parsed_tensors['image/object/bbox/xmax']
89 | ymin = parsed_tensors['image/object/bbox/ymin']
90 | ymax = parsed_tensors['image/object/bbox/ymax']
91 | return tf.cond(
92 | tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
93 | lambda: parsed_tensors['image/object/area'],
94 | lambda: (xmax - xmin) * (ymax - ymin))
95 |
96 | def decode(self, serialized_example):
97 | """Decode the serialized example.
98 |
99 | Args:
100 | serialized_example: a single serialized tf.Example string.
101 |
102 | Returns:
103 | decoded_tensors: a dictionary of tensors with the following fields:
104 | - image: a uint8 tensor of shape [None, None, 3].
105 | - source_id: a string scalar tensor.
106 | - height: an integer scalar tensor.
107 | - width: an integer scalar tensor.
108 | - groundtruth_classes: a int64 tensor of shape [None].
109 | - groundtruth_is_crowd: a bool tensor of shape [None].
110 | - groundtruth_area: a float32 tensor of shape [None].
111 | - groundtruth_boxes: a float32 tensor of shape [None, 4].
112 | - groundtruth_instance_masks: a float32 tensor of shape
113 | [None, None, None].
114 | - groundtruth_instance_masks_png: a string tensor of shape [None].
115 | """
116 | parsed_tensors = tf.io.parse_single_example(
117 | serialized_example, self._keys_to_features)
118 | for k in parsed_tensors:
119 | if isinstance(parsed_tensors[k], tf.SparseTensor):
120 | if parsed_tensors[k].dtype == tf.string:
121 | parsed_tensors[k] = tf.sparse_tensor_to_dense(
122 | parsed_tensors[k], default_value='')
123 | else:
124 | parsed_tensors[k] = tf.sparse_tensor_to_dense(
125 | parsed_tensors[k], default_value=0)
126 |
127 | image = self._decode_image(parsed_tensors)
128 | boxes = self._decode_boxes(parsed_tensors)
129 | areas = self._decode_areas(parsed_tensors)
130 |
131 | decode_image_shape = tf.logical_or(
132 | tf.equal(parsed_tensors['image/height'], -1),
133 | tf.equal(parsed_tensors['image/width'], -1))
134 | image_shape = tf.cast(tf.shape(image), dtype=tf.int64)
135 |
136 | parsed_tensors['image/height'] = tf.where(decode_image_shape,
137 | image_shape[0],
138 | parsed_tensors['image/height'])
139 | parsed_tensors['image/width'] = tf.where(decode_image_shape, image_shape[1],
140 | parsed_tensors['image/width'])
141 |
142 | is_crowds = tf.cond(
143 | tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
144 | lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
145 | lambda: tf.zeros_like(parsed_tensors['image/object/class/label'], dtype=tf.bool)) # pylint: disable=line-too-long
146 | if self._regenerate_source_id:
147 | source_id = _get_source_id_from_encoded_image(parsed_tensors)
148 | else:
149 | source_id = tf.cond(
150 | tf.greater(tf.strings.length(parsed_tensors['image/source_id']),
151 | 0), lambda: parsed_tensors['image/source_id'],
152 | lambda: _get_source_id_from_encoded_image(parsed_tensors))
153 | if self._include_mask:
154 | masks = self._decode_masks(parsed_tensors)
155 |
156 | decoded_tensors = {
157 | 'image': image,
158 | 'source_id': source_id,
159 | 'height': parsed_tensors['image/height'],
160 | 'width': parsed_tensors['image/width'],
161 | 'groundtruth_classes': parsed_tensors['image/object/class/label'],
162 | 'groundtruth_is_crowd': is_crowds,
163 | 'groundtruth_area': areas,
164 | 'groundtruth_boxes': boxes,
165 | }
166 | if self._include_mask:
167 | decoded_tensors.update({
168 | 'groundtruth_instance_masks': masks,
169 | 'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
170 | })
171 | return decoded_tensors
172 |
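A hedged sketch of wiring TfExampleDecoder above into a tf.data pipeline; "train.tfrecord" is a placeholder path, not a file shipped with the repo.

    import tensorflow.compat.v1 as tf
    from efficientdet.object_detection import tf_example_decoder

    decoder = tf_example_decoder.TfExampleDecoder(include_mask=False)
    dataset = tf.data.TFRecordDataset("train.tfrecord")   # placeholder file name
    dataset = dataset.map(decoder.decode)
    # each element is a dict with 'image', 'groundtruth_boxes', 'groundtruth_classes', ...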
--------------------------------------------------------------------------------
/generate_anchors.py:
--------------------------------------------------------------------------------
1 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
2 |
3 | # --------------------------------------------------------
4 | # Faster R-CNN
5 | # Copyright (c) 2015 Microsoft
6 | # Licensed under The MIT License [see LICENSE for details]
7 | # Written by Ross Girshick and Sean Bell
8 | # --------------------------------------------------------
9 |
10 | from six.moves import range
11 | import numpy as np
12 |
13 | # Verify that we compute the same anchors as Shaoqing's matlab implementation:
14 | #
15 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
16 | # >> anchors
17 | #
18 | # anchors =
19 | #
20 | # -83 -39 100 56
21 | # -175 -87 192 104
22 | # -359 -183 376 200
23 | # -55 -55 72 72
24 | # -119 -119 136 136
25 | # -247 -247 264 264
26 | # -35 -79 52 96
27 | # -79 -167 96 184
28 | # -167 -343 184 360
29 |
30 | #array([[ -83., -39., 100., 56.],
31 | # [-175., -87., 192., 104.],
32 | # [-359., -183., 376., 200.],
33 | # [ -55., -55., 72., 72.],
34 | # [-119., -119., 136., 136.],
35 | # [-247., -247., 264., 264.],
36 | # [ -35., -79., 52., 96.],
37 | # [ -79., -167., 96., 184.],
38 | # [-167., -343., 184., 360.]])
39 | # base_size -> anchor_stride=16,
40 | # scales -> scales=np.array((32, 64, 128, 256, 512), dtype=np.float) / 16,
41 | # generate anchor for one position
42 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
43 | scales=2**np.arange(3, 6)):
44 | """
45 | Generate anchor (reference) windows by enumerating aspect ratios X
46 | scales wrt a reference (0, 0, 15, 15) window.
47 | """
48 | # anchor box, 0-indexed, x1,y1,x2,y2
49 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1
50 | # with the same center, same size, -> [0.5,1.0,2.0] boxes
51 | # [[0,0,15,15],[0,0,22,11.],..]
52 | ratio_anchors = _ratio_enum(base_anchor, ratios)
53 | # -> [[0,0,31,31],....]
54 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
55 | for i in range(ratio_anchors.shape[0])])
56 | return anchors
57 |
58 | def _whctrs(anchor): # x1,y1,x2,y2: (0,0,15,15) -> (16,16,8,8)
59 | """
60 | Return width, height, x center, and y center for an anchor (window).
61 | """
62 |
63 | w = anchor[2] - anchor[0] + 1
64 | h = anchor[3] - anchor[1] + 1
65 | x_ctr = anchor[0] + 0.5 * (w - 1)
66 | y_ctr = anchor[1] + 0.5 * (h - 1)
67 | return w, h, x_ctr, y_ctr
68 |
69 | def _mkanchors(ws, hs, x_ctr, y_ctr):
70 | """
71 | Given a vector of widths (ws) and heights (hs) around a center
72 | (x_ctr, y_ctr), output a set of anchors (windows).
73 | """
74 |
75 | ws = ws[:, np.newaxis] # [k] -> [k,1]
76 | hs = hs[:, np.newaxis]
77 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
78 | y_ctr - 0.5 * (hs - 1),
79 | x_ctr + 0.5 * (ws - 1),
80 | y_ctr + 0.5 * (hs - 1)))
81 | return anchors
82 |
83 | def _ratio_enum(anchor, ratios):
84 | """
85 | Enumerate a set of anchors for each aspect ratio wrt an anchor.
86 | """
87 |
88 | w, h, x_ctr, y_ctr = _whctrs(anchor) # 0,0,15,15 -> # 16,16, 8,8,
89 | size = w * h # 16 * 16 = 256
90 | # given the same size, get the box with different ratio
91 | size_ratios = size / ratios # ratios: [0.5,1,2] -> [512,256,128]
92 |   ws = np.round(np.sqrt(size_ratios)) # np.round to an int -> [sqrt(512), 16, sqrt(128)]
93 | hs = np.round(ws * ratios) # [sqrt(512)*0.5, 16 * 1, sqrt(128)*2]
94 | # ws*hs == w*h
95 | # get anchors with the same x,y,center
96 | # a list of [x1,y1,x2,y2]
97 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
98 | return anchors
99 |
100 | def _scale_enum(anchor, scales):
101 | """
102 | Enumerate a set of anchors for each scale wrt an anchor.
103 | """
104 |
105 | w, h, x_ctr, y_ctr = _whctrs(anchor)
106 | ws = w * scales
107 | hs = h * scales
108 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
109 | return anchors
110 |
111 | if __name__ == '__main__':
112 | #import time
113 | #t = time.time()
114 | #a = generate_anchors()
115 | #print(time.time() - t)
116 | #print(a)
117 | #from IPython import embed; embed()
118 |
119 | print(generate_anchors(
120 | 16, scales=np.asarray((2, 4, 8, 16, 32), 'float32'),
121 | ratios=[0.5,1,2]))
122 |
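A quick sanity-check sketch of generate_anchors() with its default arguments (run from the repo root).

    from generate_anchors import generate_anchors

    anchors = generate_anchors()   # base_size=16, ratios=[0.5, 1, 2], scales=[8, 16, 32]
    print(anchors.shape)           # (9, 4): 3 ratios x 3 scales, rows are [x1, y1, x2, y2]
    # rows 0-2 are ratio 0.5 at scales 8/16/32, rows 3-5 ratio 1, rows 6-8 ratio 2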
--------------------------------------------------------------------------------
/generate_util_graph.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # generate cpu/gpu util graph based on the json data
3 |
4 | import argparse
5 | import sys
6 | import os
7 | import json
8 | import matplotlib
9 | matplotlib.use('Agg')
10 | from matplotlib import pyplot as plt
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("logs")
14 | parser.add_argument("output_png")
15 |
16 | if __name__ == "__main__":
17 | args = parser.parse_args()
18 |
19 | with open(args.logs, "r") as f:
20 | data = json.load(f)
21 |
22 | # timing as the timestamp as the x axis, and others as y axis
23 | # timestamp to local time in seconds
24 | start_time = data["timing"][0]
25 | timings = [round(o - start_time, 1) for o in data["timing"]]
26 |
27 | # cpu and gpu util
28 | cpu_util = [round(o, 1) for o in data["cpu_utilization"]]
29 | gpu_util = [round(o, 1) for o in data["gpu_utilization"]]
30 |
31 | # gpu mem and ram, in MB
32 | ram_used = [round(o, 1) for o in data["ram_used"]]
33 | gpu_mem = [round(o, 1) for o in data["gpu_memory"]]
34 |
35 | # plot!
36 | plt.figure(figsize=(10, 6))
37 | # cpu util
38 | plt.subplot(221)
39 | plt.plot(timings, cpu_util, "g-")
40 | plt.title("cpu util %")
41 | plt.xlabel("seconds")
42 | plt.grid(True)
43 |
44 | plt.subplot(222)
45 | plt.plot(timings, ram_used, "g-")
46 | plt.title("ram used (MB)")
47 | plt.xlabel("seconds")
48 | plt.grid(True)
49 |
50 | plt.subplot(223)
51 | plt.plot(timings, gpu_util, "b-")
52 | plt.title("gpu util %")
53 | plt.xlabel("seconds")
54 | plt.grid(True)
55 |
56 | plt.subplot(224)
57 | plt.plot(timings, gpu_mem, "b-")
58 | plt.title("GPU mem (MB)")
59 | plt.xlabel("seconds")
60 | plt.grid(True)
61 |
62 | plt.subplots_adjust(hspace=0.5, wspace=0.3)
63 |
64 | plt.savefig(args.output_png, dpi=400)
65 |
66 |
67 |
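A hedged sketch of the JSON layout this script expects: parallel lists under the keys read above ("timing", "cpu_utilization", "gpu_utilization", "ram_used", "gpu_memory"). The values and file names below are made up.

    import json

    logs = {
        "timing": [1590000000.0, 1590000001.0, 1590000002.0],  # unix timestamps (seconds)
        "cpu_utilization": [55.0, 80.2, 77.9],                 # percent
        "gpu_utilization": [10.0, 95.5, 96.1],                 # percent
        "ram_used": [2048.0, 2300.5, 2310.0],                  # MB
        "gpu_memory": [800.0, 5200.0, 5210.0],                 # MB
    }
    with open("util_log.json", "w") as f:
        json.dump(logs, f)
    # then: python generate_util_graph.py util_log.json util_log.png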
--------------------------------------------------------------------------------
/get_frames_resize.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | Given a list of videos, extract all the frames and optionally resize them.
4 | Note that the frames are 0-indexed.
5 | """
6 |
7 | import argparse
8 | import cv2
9 | import os
10 | import pickle
11 | import sys
12 |
13 | from tqdm import tqdm
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | parser.add_argument("videolist")
18 | parser.add_argument("despath")
19 |
20 | parser.add_argument("--resize", default=False, action="store_true")
21 | parser.add_argument("--size", default=800, type=int)
22 | parser.add_argument("--maxsize", default=1333, type=int)
23 |
24 |
25 |
26 | parser.add_argument("--job", type=int, default=1, help="total job")
27 | parser.add_argument("--curJob", type=int, default=1,
28 | help="this script run job Num")
29 | parser.add_argument("--statspath", default=None,
30 | help="path to write videoname.p to save some stats for "
31 | "that video")
32 | parser.add_argument("--use_2level", action="store_true",
33 | help="make videoname/frames dir")
34 | parser.add_argument("--name_level", type=int, default=None,
35 | help="add the top level folder name to the videoname")
36 | parser.add_argument("--cv2path", default=None)
37 |
38 | parser.add_argument("--use_moviepy", action="store_true")
39 | parser.add_argument("--use_lijun", action="store_true")
40 |
41 |
42 | def get_new_hw(h, w, size, max_size):
43 | """Get new hw."""
44 | scale = size * 1.0 / min(h, w)
45 | if h < w:
46 | newh, neww = size, scale * w
47 | else:
48 | newh, neww = scale * h, size
49 | if max(newh, neww) > max_size:
50 | scale = max_size * 1.0 / max(newh, neww)
51 | newh = newh * scale
52 | neww = neww * scale
53 | neww = int(neww + 0.5)
54 | newh = int(newh + 0.5)
55 | return neww, newh
56 |
57 |
58 | if __name__ == "__main__":
59 | args = parser.parse_args()
60 | if args.cv2path is not None:
61 | sys.path = [args.cv2path] + sys.path
62 |
63 | if args.use_moviepy:
64 | from moviepy.editor import VideoFileClip
65 | elif args.use_lijun:
66 | from diva_io.video import VideoReader
67 |
68 | # still need this to write image
69 | print("using opencv version:%s"%(cv2.__version__))
70 |
71 | if not os.path.exists(args.despath):
72 | os.makedirs(args.despath)
73 |
74 | if args.statspath is not None and not os.path.exists(args.statspath):
75 | os.makedirs(args.statspath)
76 |
77 | count = 0
78 | for line in tqdm(open(args.videolist, "r").readlines()):
79 | count += 1
80 | if (count % args.job) != (args.curJob-1):
81 | continue
82 |
83 | video = line.strip()
84 |
85 | stats = {"h":None, "w":None, "fps":None, "frame_count":None,
86 | "actual_frame_count":None}
87 |
88 | videoname = os.path.splitext(os.path.basename(video))[0]
89 |
90 | targetpath = args.despath
91 |
92 | if args.use_2level:
93 | targetpath = os.path.join(args.despath, videoname)
94 | if not os.path.exists(targetpath):
95 | os.makedirs(targetpath)
96 |
97 | if args.name_level is not None:
98 | foldernames = video.split("/")
99 | prefixes = foldernames[-1-args.name_level:-1]
100 | videoname = "__".join(prefixes + [videoname])
101 |
102 | if args.use_moviepy:
103 | vcap = VideoFileClip(video, audio=False)
104 |       frame_count = int(vcap.fps * vcap.duration)  # approximate frame count from fps * duration
105 | vcap_iter = vcap.iter_frames()
106 | elif args.use_lijun:
107 | vcap = VideoReader(video)
108 | frame_count = int(vcap.length)
109 | else:
110 | try:
111 | vcap = cv2.VideoCapture(video)
112 | if not vcap.isOpened():
113 | raise Exception("cannot open %s"%video)
114 | except Exception as e:
115 | raise e
116 |
117 |     if cv2.__version__.split(".")[0] != "2":  # OpenCV 3.x/4.x API
118 | frame_width = vcap.get(cv2.CAP_PROP_FRAME_WIDTH)
119 | frame_height = vcap.get(cv2.CAP_PROP_FRAME_HEIGHT)
120 |
121 | fps = vcap.get(cv2.CAP_PROP_FPS)
122 | frame_count = vcap.get(cv2.CAP_PROP_FRAME_COUNT)
123 | else:
124 | frame_width = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)
125 | frame_height = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)
126 |
127 | fps = vcap.get(cv2.cv.CV_CAP_PROP_FPS)
128 | frame_count = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)
129 | stats['h'] = frame_height
130 | stats['w'] = frame_width
131 |
132 | stats['fps'] = fps
133 |
134 |
135 |
136 | stats['frame_count'] = frame_count
137 |
138 | cur_frame = 0
139 | count_actual = 0
140 | while cur_frame < frame_count:
141 | if args.use_moviepy:
142 | suc = True
143 | frame = next(vcap_iter)
144 |
145 | else:
146 | suc, frame = vcap.read()
147 |
148 | if not suc:
149 | cur_frame += 1
150 |         tqdm.write("warning: frame %s of %s failed" % (cur_frame, videoname))
151 | continue
152 | count_actual += 1
153 | if args.use_moviepy:
154 | # moviepy ask ffmpeg to get rgb24
155 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
156 |
157 | frame = frame.astype("float32")
158 |
159 | if args.resize:
160 | neww, newh = get_new_hw(frame.shape[0],
161 | frame.shape[1], args.size, args.maxsize)
162 |
163 | frame = cv2.resize(frame, (neww, newh), interpolation=cv2.INTER_LINEAR)
164 |
165 | cv2.imwrite(os.path.join(targetpath,
166 | "%s_F_%08d.jpg" % (videoname, cur_frame)), frame)
167 |
168 | cur_frame += 1
169 |
170 | stats['actual_frame_count'] = count_actual
171 |
172 | if args.statspath is not None:
173 | with open(os.path.join(args.statspath, "%s.p" % videoname), "wb") as fs:
174 | pickle.dump(stats, fs)
175 | if not args.use_moviepy and not args.use_lijun:
176 | vcap.release()
177 |
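A worked example of get_new_hw() above; pure arithmetic, no video needed. With --size 800 and --maxsize 1333, a 1080x1920 frame is first scaled so its short side is 800, then capped so the long side fits within 1333.

    from get_frames_resize import get_new_hw

    neww, newh = get_new_hw(1080, 1920, 800, 1333)
    print(neww, newh)   # 1333 750 -- short side 800 would give a long side of ~1422 > 1333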
--------------------------------------------------------------------------------
/images/Person_vis_video.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/Person_vis_video.gif
--------------------------------------------------------------------------------
/images/Vehicle_vis_video.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/Vehicle_vis_video.gif
--------------------------------------------------------------------------------
/images/actev-prizechallenge-06-2019.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/actev-prizechallenge-06-2019.png
--------------------------------------------------------------------------------
/images/inf_actev_0.49audc_02-2020.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/inf_actev_0.49audc_02-2020.png
--------------------------------------------------------------------------------
/images/multi-camera-reid.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/multi-camera-reid.gif
--------------------------------------------------------------------------------
/images/person_multi_reid.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/person_multi_reid.gif
--------------------------------------------------------------------------------
/images/person_multi_reid2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/person_multi_reid2.gif
--------------------------------------------------------------------------------
/images/util_log_b1partial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/util_log_b1partial.png
--------------------------------------------------------------------------------
/images/util_log_b8multithread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/util_log_b8multithread.png
--------------------------------------------------------------------------------
/images/vehicle_multi_reid.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/images/vehicle_multi_reid.gif
--------------------------------------------------------------------------------
/tensorrt_optimize.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """Given the tensorflow frozen graph, use TensorRT to optimize,
3 | get a new frozen graph."""
4 |
5 | from __future__ import print_function
6 |
7 | import argparse
8 | import time
9 | import tensorflow as tf
10 | import tensorflow.contrib.tensorrt as trt
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("pbfile")
14 | parser.add_argument("newpbfile")
15 | parser.add_argument("--precision_mode", default="FP32",
16 | help="FP32, FP16, or INT8")
17 | parser.add_argument("--maximum_cached_engines", default=100,
18 |                     help="Maximum number of cached TRT engines per TRTEngineOp.")
19 |
20 |
21 | # parameter
22 | # https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html
23 | if __name__ == "__main__":
24 | args = parser.parse_args()
25 |
26 | max_batch_size = 1
27 | precision_mode = args.precision_mode
28 |   minimum_segment_size = 2  # minimum number of nodes required to form a TRT engine
29 | max_workspace_size_bytes = 1 << 32
30 | maximum_cached_engines = args.maximum_cached_engines
31 |
32 | output_names = [
33 | "final_boxes",
34 | "final_labels",
35 | "final_probs",
36 | "fpn_box_feat",
37 | ]
38 |
39 | tf_config = tf.ConfigProto()
40 | tf_config.gpu_options.allow_growth = True
41 |
42 | with tf.Graph().as_default() as tf_graph:
43 | with tf.Session(config=tf_config) as tf_sess:
44 | with tf.gfile.GFile(args.pbfile, "rb") as f:
45 | frozen_graph = tf.GraphDef()
46 | frozen_graph.ParseFromString(f.read())
47 |
48 | graph_size = len(frozen_graph.SerializeToString())
49 | num_nodes = len(frozen_graph.node)
50 | start_time = time.time()
51 | frozen_graph = trt.create_inference_graph(
52 | input_graph_def=frozen_graph,
53 | outputs=output_names,
54 | max_batch_size=max_batch_size,
55 | max_workspace_size_bytes=max_workspace_size_bytes,
56 | precision_mode=precision_mode,
57 | minimum_segment_size=minimum_segment_size,
58 | is_dynamic_op=True, # this is needed for FPN
59 | maximum_cached_engines=maximum_cached_engines)
60 | end_time = time.time()
61 | print("graph_size(MB)(native_tf): %.1f" % (float(graph_size)/(1<<20)))
62 | print("graph_size(MB)(trt): %.1f" % (
63 | float(len(frozen_graph.SerializeToString()))/(1<<20)))
64 | print("num_nodes(native_tf): %d" % num_nodes)
65 | print("num_nodes(tftrt_total): %d" % len(frozen_graph.node))
66 | print("num_nodes(trt_only): %d" % len(
67 | [1 for n in frozen_graph.node if str(n.op) == "TRTEngineOp"]))
68 | print("time(s) (trt_conversion): %.4f" % (end_time - start_time))
69 | with open(args.newpbfile, "wb") as f:
70 | f.write(frozen_graph.SerializeToString())
71 |
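A hedged usage sketch; the .pb file names are placeholders, and the script needs a TF 1.x build where tensorflow.contrib.tensorrt is available:

    python tensorrt_optimize.py model.pb model_trt_fp16.pb --precision_mode FP16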
--------------------------------------------------------------------------------
/tensorrt_optimize_tf1.15.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """Given the tensorflow frozen graph, use TensorRT to optimize,
3 | get a new frozen graph."""
4 |
5 | from __future__ import print_function
6 |
7 | import argparse
8 | import time
9 | import tensorflow as tf
10 | from tensorflow.python.compiler.tensorrt import trt_convert as trt
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("pbfile")
14 | parser.add_argument("newpbfile")
15 | parser.add_argument("--precision_mode", default="FP32",
16 | help="FP32, FP16, or INT8")
17 |
18 |
19 | # parameter
20 | # https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html
21 | if __name__ == "__main__":
22 | args = parser.parse_args()
23 |
24 | # not sure what these do, so leave them default
25 | #max_batch_size = 1
26 | #minimum_segment_size = 2 # smaller the faster? 5 -60?
27 | #max_workspace_size_bytes = 1 << 32
28 | #maximum_cached_engines = 1
29 |
30 | output_names = [
31 | "final_boxes",
32 | "final_labels",
33 | "final_probs",
34 | "fpn_box_feat",
35 | ]
36 |
37 | tf_config = tf.ConfigProto()
38 | tf_config.gpu_options.allow_growth = True
39 |
40 | with tf.Graph().as_default() as tf_graph:
41 | with tf.Session(config=tf_config) as tf_sess:
42 | with tf.gfile.GFile(args.pbfile, "rb") as f:
43 | frozen_graph = tf.GraphDef()
44 | frozen_graph.ParseFromString(f.read())
45 |
46 | converter = trt.TrtGraphConverter(
47 | input_graph_def=frozen_graph,
48 |           nodes_blacklist=output_names,  # output node names to keep
49 |           is_dynamic_op=False,
50 |           precision_mode=args.precision_mode)
51 | trt_graph = converter.convert()
52 | #converter.save(args.newpbfile)
53 |
54 |
55 | with open(args.newpbfile, "wb") as f:
56 | f.write(trt_graph.SerializeToString())
57 |
--------------------------------------------------------------------------------
/test_reid.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """Test person_reid and vehicle reid model"""
3 |
4 | import argparse
5 | import os
6 | from glob import glob
7 | import numpy as np
8 |
9 | from torchreid.feature_extractor import FeatureExtractor
10 | from torchreid.distance import compute_distance_matrix
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("query_img")
14 | parser.add_argument("test_img_prefix")
15 | parser.add_argument("--gpuid", default=0, type=int,
16 | help="gpu id")
17 | parser.add_argument("--vehicle_reid_model", default=None)
18 | parser.add_argument("--person_reid_model", default=None)
19 | parser.add_argument("--p_model_name", default="osnet_x1_0")
20 |
21 |
22 | if __name__ == "__main__":
23 | args = parser.parse_args()
24 |
25 | if args.person_reid_model is not None:
26 | extractor = FeatureExtractor(
27 | model_name=args.p_model_name,
28 | model_path=args.person_reid_model,
29 | device="cuda:%d" % args.gpuid
30 | )
31 |
32 | elif args.vehicle_reid_model is not None:
33 | extractor = FeatureExtractor(
34 | model_name="resnet101",
35 | model_path=args.vehicle_reid_model,
36 | device="cuda:%d" % args.gpuid
37 | )
38 | else:
39 | raise Exception("Please provide a model!")
40 |
41 | test_imgs = glob(args.test_img_prefix + "*")
42 | test_imgs.sort()
43 | assert test_imgs
44 | img_list = [args.query_img] + test_imgs
45 | print(img_list)
46 | features = extractor(img_list)
47 |
48 | print(features.shape) # [n, 512]
49 | # compute nxn distance
50 | distmat = compute_distance_matrix(features, features, metric='euclidean')
51 | np.set_printoptions(suppress=True, precision=3)
52 | print(distmat.cpu().numpy())
53 |
54 |
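A hedged usage sketch; the image paths and checkpoint file name below are placeholders. The script embeds the query image plus every image matching the prefix and prints their pairwise Euclidean distance matrix (smaller means more similar):

    python test_reid.py query_person.jpg gallery_person_ \
        --person_reid_model osnet_x1_0_checkpoint.pth --p_model_name osnet_x1_0 --gpuid 0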
--------------------------------------------------------------------------------
/tester.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # tester, given the config with model path
3 |
4 |
5 | import tensorflow as tf
6 |
7 |
8 | class Tester():
9 | def __init__(self,models,config,add_mask=True):
10 | self.config = config
11 | self.models = models
12 |
13 |     # inference outputs:
14 | self.final_boxes = [model.final_boxes for model in models]
15 | # [R]
16 | self.final_labels = [model.final_labels for model in models]
17 | self.final_probs = [model.final_probs for model in models]
18 |
19 | if config.add_act:
20 | if config.act_v2:
21 | self.act_single_boxes = [model.act_single_boxes for model in models]
22 | self.act_single_label_logits = [model.act_single_label_logits for model in models]
23 | else:
24 | self.act_final_boxes = [model.act_final_boxes for model in models]
25 | # [R]
26 | self.act_final_labels = [model.act_final_labels for model in models]
27 | self.act_final_probs = [model.act_final_probs for model in models]
28 |
29 | self.small_object = False
30 | if config.use_small_object_head:
31 | self.small_object = True
32 | if self.small_object:
33 |       # inference outputs:
34 | self.so_final_boxes = [model.so_final_boxes for model in models]
35 | # [R]
36 | self.so_final_labels = [model.so_final_labels for model in models]
37 | self.so_final_probs = [model.so_final_probs for model in models]
38 |
39 | self.add_mask = add_mask
40 |
41 | if add_mask:
42 | # [R,14,14]
43 | self.final_masks = [model.final_masks for model in models]
44 |
45 |
46 | def step(self,sess,batch):
47 | config = self.config
48 | # give one batch of Dataset, use model to get the result,
49 | assert isinstance(sess,tf.Session)
50 | batchIdxs,batch_datas = batch
51 |     #assert len(batch_datas) == len(self.models) # there may be fewer samples in the last batch
52 | num_input = len(batch_datas) # use this to cap the model input
53 |
54 | feed_dict = {}
55 |
56 | for _,batch_data,model in zip(range(num_input),batch_datas,self.models):
57 | feed_dict.update(model.get_feed_dict(batch_data,is_train=False))
58 |
59 | sess_input = []
60 | if self.add_mask:
61 | for _,boxes,labels,probs,masks in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.final_masks):
62 | sess_input+=[boxes,labels,probs,masks]
63 | else:
64 | if self.small_object:
65 | for _,boxes,labels,probs,so_boxes, so_labels, so_probs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.so_final_boxes,self.so_final_labels,self.so_final_probs):
66 | sess_input+=[boxes,labels,probs,so_boxes,so_labels,so_probs]
67 | else:
68 | for _,boxes,labels,probs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs):
69 | sess_input+=[boxes,labels,probs]
70 |
71 | if config.add_act:
72 | sess_input = []
73 | if config.act_v2:
74 | for _,boxes,labels,probs,actsingleboxes,actsinglelabels in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.act_single_boxes,self.act_single_label_logits):
75 | sess_input+=[boxes,labels,probs,actsingleboxes,actsinglelabels]
76 | else:
77 | for _,boxes,labels,probs,actboxes,actlabels,actprobs in zip(range(num_input),self.final_boxes,self.final_labels,self.final_probs,self.act_final_boxes,self.act_final_labels,self.act_final_probs):
78 | sess_input+=[boxes,labels,probs,actboxes,actlabels,actprobs]
79 |
80 |
81 | #final_boxes, final_probs, final_labels, final_masks = sess.run([self.final_boxes, self.final_probs, self.final_labels, self.final_masks],feed_dict=feed_dict)
82 | #return final_boxes, final_probs, final_labels, final_masks
83 | outputs = sess.run(sess_input,feed_dict=feed_dict)
84 | if self.add_mask:
85 | pn = 4
86 | else:
87 | if self.small_object:
88 | pn = 6
89 | else:
90 | pn = 3
91 | if config.add_act:
92 | if config.act_v2:
93 | pn = 5
94 | else:
95 | pn = 6
96 | outputs = [outputs[i*pn:(i*pn+pn)] for i in range(num_input)]
97 | else:
98 | outputs = [outputs[i*pn:(i*pn+pn)] for i in range(num_input)]
99 | return outputs
100 |
--------------------------------------------------------------------------------
/tmot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JunweiLiang/Object_Detection_Tracking/fe3063cd8d9d946dbf9fe6f776c04e75f4ecba90/tmot/__init__.py
--------------------------------------------------------------------------------
/tmot/basetrack.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import OrderedDict
3 |
4 |
5 | class TrackState(object):
6 | New = 0
7 | Tracked = 1
8 | Lost = 2
9 | Removed = 3
10 |
11 |
12 | class BaseTrack(object):
13 | _count = 0
14 |
15 | track_id = 0
16 | is_activated = False
17 | state = TrackState.New
18 |
19 | history = OrderedDict()
20 | features = []
21 | curr_feature = None
22 | score = 0
23 | start_frame = 0
24 | frame_id = 0
25 | time_since_update = 0
26 |
27 | # multi-camera
28 | location = (np.inf, np.inf)
29 |
30 | @property
31 | def end_frame(self):
32 | return self.frame_id
33 |
34 | @staticmethod
35 | def next_id():
36 | BaseTrack._count += 1
37 | return BaseTrack._count
38 |
39 | def activate(self, *args):
40 | raise NotImplementedError
41 |
42 | def predict(self):
43 | raise NotImplementedError
44 |
45 | def update(self, *args, **kwargs):
46 | raise NotImplementedError
47 |
48 | def mark_lost(self):
49 | self.state = TrackState.Lost
50 |
51 | def mark_removed(self):
52 | self.state = TrackState.Removed
53 |
54 |
--------------------------------------------------------------------------------
/tmot/matching.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy
3 | from scipy.spatial.distance import cdist
4 | import lap # 0.4.0
5 |
6 | from cython_bbox import bbox_overlaps as bbox_ious
7 | from . import kalman_filter
8 |
9 | def merge_matches(m1, m2, shape):
10 | O,P,Q = shape
11 | m1 = np.asarray(m1)
12 | m2 = np.asarray(m2)
13 |
14 | M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))
15 | M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))
16 |
17 | mask = M1*M2
18 | match = mask.nonzero()
19 | match = list(zip(match[0], match[1]))
20 | unmatched_O = tuple(set(range(O)) - set([i for i, j in match]))
21 | unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match]))
22 |
23 | return match, unmatched_O, unmatched_Q
24 |
25 |
26 | def linear_assignment(cost_matrix, thresh):
27 | if cost_matrix.size == 0:
28 | return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1]))
29 | matches, unmatched_a, unmatched_b = [], [], []
30 | cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh)
31 | for ix, mx in enumerate(x):
32 | if mx >= 0:
33 | matches.append([ix, mx])
34 | unmatched_a = np.where(x < 0)[0]
35 | unmatched_b = np.where(y < 0)[0]
36 | matches = np.asarray(matches)
37 | return matches, unmatched_a, unmatched_b
38 |
39 |
40 | def ious(atlbrs, btlbrs):
41 | """
42 | Compute cost based on IoU
43 | :type atlbrs: list[tlbr] | np.ndarray
44 |     :type btlbrs: list[tlbr] | np.ndarray
45 |
46 | :rtype ious np.ndarray
47 | """
48 | ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float)
49 | if ious.size == 0:
50 | return ious
51 |
52 | ious = bbox_ious(
53 | np.ascontiguousarray(atlbrs, dtype=np.float),
54 | np.ascontiguousarray(btlbrs, dtype=np.float)
55 | )
56 |
57 | return ious
58 |
59 |
60 | def iou_distance(atracks, btracks):
61 | """
62 | Compute cost based on IoU
63 | :type atracks: list[STrack]
64 | :type btracks: list[STrack]
65 |
66 | :rtype cost_matrix np.ndarray
67 | """
68 |
69 | if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)):
70 | atlbrs = atracks
71 | btlbrs = btracks
72 | else:
73 | atlbrs = [track.tlbr for track in atracks]
74 | btlbrs = [track.tlbr for track in btracks]
75 | _ious = ious(atlbrs, btlbrs)
76 | cost_matrix = 1 - _ious
77 |
78 | return cost_matrix
79 |
80 | def embedding_distance(tracks, detections, metric='cosine'):
81 | """
82 | :param tracks: list[STrack]
83 | :param detections: list[BaseTrack]
84 | :param metric:
85 | :return: cost_matrix np.ndarray
86 | """
87 |
88 | cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
89 | if cost_matrix.size == 0:
90 | return cost_matrix
91 | det_features = np.asarray([track.curr_feat for track in detections], dtype=float)
92 | track_features = np.asarray([track.smooth_feat for track in tracks], dtype=float)
93 | cost_matrix = np.maximum(0.0, cdist(track_features, det_features)) # Normalized features
94 |
95 | return cost_matrix
96 |
97 |
98 | def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98):
99 | if cost_matrix.size == 0:
100 | return cost_matrix
101 | gating_dim = 2 if only_position else 4
102 | gating_threshold = kalman_filter.chi2inv95[gating_dim]
103 | measurements = np.asarray([det.to_xyah() for det in detections])
104 | for row, track in enumerate(tracks):
105 | gating_distance = kf.gating_distance(
106 | track.mean, track.covariance, measurements, only_position, metric='maha')
107 | cost_matrix[row, gating_distance > gating_threshold] = np.inf
108 | cost_matrix[row] = lambda_ * cost_matrix[row] + (1-lambda_)* gating_distance
109 | return cost_matrix
110 |
--------------------------------------------------------------------------------
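
A toy association run, assuming the module's optional dependencies (lap and cython_bbox) are installed and the script is launched from the repo root: iou_distance accepts raw tlbr arrays directly, and linear_assignment drops any pair whose cost exceeds the threshold.

import numpy as np
from tmot import matching

tracks = [np.array([0, 0, 10, 10], dtype=float),
          np.array([20, 20, 30, 30], dtype=float)]
dets = [np.array([1, 1, 11, 11], dtype=float),
        np.array([100, 100, 110, 110], dtype=float)]

cost = matching.iou_distance(tracks, dets)                     # (2, 2) matrix of 1 - IoU
matches, u_track, u_det = matching.linear_assignment(cost, thresh=0.7)
print(matches)          # expected [[0 0]]: only the overlapping pair is matched
print(u_track, u_det)   # expected [1] [1]: second track and second detection stay unmatched
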
/torchreid/distance.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, absolute_import
2 | import torch
3 | from torch.nn import functional as F
4 |
5 |
6 | def compute_distance_matrix(input1, input2, metric='euclidean'):
7 | """A wrapper function for computing distance matrix.
8 |
9 | Args:
10 | input1 (torch.Tensor): 2-D feature matrix.
11 | input2 (torch.Tensor): 2-D feature matrix.
12 | metric (str, optional): "euclidean" or "cosine".
13 | Default is "euclidean".
14 |
15 | Returns:
16 | torch.Tensor: distance matrix.
17 |
18 | Examples::
19 | >>> from torchreid import metrics
20 | >>> input1 = torch.rand(10, 2048)
21 | >>> input2 = torch.rand(100, 2048)
22 | >>> distmat = metrics.compute_distance_matrix(input1, input2)
23 | >>> distmat.size() # (10, 100)
24 | """
25 | # check input
26 | assert isinstance(input1, torch.Tensor)
27 | assert isinstance(input2, torch.Tensor)
28 | assert input1.dim() == 2, 'Expected 2-D tensor, but got {}-D'.format(
29 | input1.dim()
30 | )
31 | assert input2.dim() == 2, 'Expected 2-D tensor, but got {}-D'.format(
32 | input2.dim()
33 | )
34 | assert input1.size(1) == input2.size(1)
35 |
36 | if metric == 'euclidean':
37 | distmat = euclidean_squared_distance(input1, input2)
38 | elif metric == 'cosine':
39 | distmat = cosine_distance(input1, input2)
40 | else:
41 | raise ValueError(
42 | 'Unknown distance metric: {}. '
43 | 'Please choose either "euclidean" or "cosine"'.format(metric)
44 | )
45 |
46 | return distmat
47 |
48 |
49 | def euclidean_squared_distance(input1, input2):
50 | """Computes euclidean squared distance.
51 |
52 | Args:
53 | input1 (torch.Tensor): 2-D feature matrix.
54 | input2 (torch.Tensor): 2-D feature matrix.
55 |
56 | Returns:
57 | torch.Tensor: distance matrix.
58 | """
59 | m, n = input1.size(0), input2.size(0)
60 | mat1 = torch.pow(input1, 2).sum(dim=1, keepdim=True).expand(m, n)
61 | mat2 = torch.pow(input2, 2).sum(dim=1, keepdim=True).expand(n, m).t()
62 | distmat = mat1 + mat2
63 | distmat.addmm_(input1, input2.t(), beta=1, alpha=-2)
64 | return distmat
65 |
66 |
67 | def cosine_distance(input1, input2):
68 | """Computes cosine distance.
69 |
70 | Args:
71 | input1 (torch.Tensor): 2-D feature matrix.
72 | input2 (torch.Tensor): 2-D feature matrix.
73 |
74 | Returns:
75 | torch.Tensor: distance matrix.
76 | """
77 | input1_normed = F.normalize(input1, p=2, dim=1)
78 | input2_normed = F.normalize(input2, p=2, dim=1)
79 | distmat = 1 - torch.mm(input1_normed, input2_normed.t())
80 | return distmat
81 |
--------------------------------------------------------------------------------
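
The docstring example above follows the upstream torchreid layout (torchreid.metrics); in this repo the function is imported from torchreid/distance.py. A quick sanity check on random features, assuming the repo root is on the Python path:

import torch
from torchreid.distance import compute_distance_matrix

query = torch.rand(4, 512)
gallery = torch.rand(6, 512)

d_euc = compute_distance_matrix(query, gallery, metric='euclidean')
d_cos = compute_distance_matrix(query, gallery, metric='cosine')
print(d_euc.shape, d_cos.shape)                 # torch.Size([4, 6]) torch.Size([4, 6])
print(d_cos.min().item(), d_cos.max().item())   # cosine distance lies in [0, 2]
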
/torchreid/feature_extractor.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import numpy as np
3 | import torch
4 | import torchvision.transforms as T
5 | from PIL import Image
6 |
7 | from .models import build_model
8 |
9 | import warnings
10 | import os.path as osp
11 | import pickle
12 | from functools import partial
13 | from collections import OrderedDict
14 |
15 | def check_isfile(fpath):
16 | """Checks if the given path is a file.
17 |
18 | Args:
19 | fpath (str): file path.
20 |
21 | Returns:
22 | bool
23 | """
24 | isfile = osp.isfile(fpath)
25 | if not isfile:
26 | warnings.warn('No file found at "{}"'.format(fpath))
27 | return isfile
28 |
29 | def load_pretrained_weights(model, weight_path):
30 | r"""Loads pretrained weights into model.
31 |
32 | Features::
33 | - Incompatible layers (unmatched in name or size) will be ignored.
34 | - Can automatically deal with keys containing "module.".
35 |
36 | Args:
37 | model (nn.Module): network model.
38 | weight_path (str): path to pretrained weights.
39 |
40 | Examples::
41 | >>> from torchreid.utils import load_pretrained_weights
42 | >>> weight_path = 'log/my_model/model-best.pth.tar'
43 | >>> load_pretrained_weights(model, weight_path)
44 | """
45 | checkpoint = load_checkpoint(weight_path)
46 | if 'state_dict' in checkpoint:
47 | state_dict = checkpoint['state_dict']
48 | else:
49 | state_dict = checkpoint
50 |
51 | model_dict = model.state_dict()
52 | new_state_dict = OrderedDict()
53 | matched_layers, discarded_layers = [], []
54 |
55 | for k, v in state_dict.items():
56 | if k.startswith('module.'):
57 | k = k[7:] # discard module.
58 |
59 | if k in model_dict and model_dict[k].size() == v.size():
60 | new_state_dict[k] = v
61 | matched_layers.append(k)
62 | else:
63 | discarded_layers.append(k)
64 |
65 | model_dict.update(new_state_dict)
66 | model.load_state_dict(model_dict)
67 |
68 | if len(matched_layers) == 0:
69 | warnings.warn(
70 | 'The pretrained weights "{}" cannot be loaded, '
71 | 'please check the key names manually '
72 | '(** ignored and continue **)'.format(weight_path)
73 | )
74 | #else:
75 | #print(
76 | # 'Successfully loaded pretrained weights from "{}"'.
77 | # format(weight_path)
78 | #)
79 | #if len(discarded_layers) > 0:
80 | # print(
81 | # '** The following layers are discarded '
82 | # 'due to unmatched keys or layer size: {}'.
83 | # format(discarded_layers)
84 | # )
85 |
86 | def load_checkpoint(fpath):
87 | r"""Loads checkpoint.
88 |
89 | ``UnicodeDecodeError`` is handled internally, so checkpoints
90 | saved with python2 can be read from python3.
91 |
92 | Args:
93 | fpath (str): path to checkpoint.
94 |
95 | Returns:
96 | dict
97 |
98 | Examples::
99 | >>> from torchreid.utils import load_checkpoint
100 | >>> fpath = 'log/my_model/model.pth.tar-10'
101 | >>> checkpoint = load_checkpoint(fpath)
102 | """
103 | if fpath is None:
104 | raise ValueError('File path is None')
105 | if not osp.exists(fpath):
106 | raise FileNotFoundError('File is not found at "{}"'.format(fpath))
107 | map_location = None if torch.cuda.is_available() else 'cpu'
108 | try:
109 | checkpoint = torch.load(fpath, map_location=map_location)
110 | except UnicodeDecodeError:
111 | pickle.load = partial(pickle.load, encoding="latin1")
112 | pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1")
113 | checkpoint = torch.load(
114 | fpath, pickle_module=pickle, map_location=map_location
115 | )
116 | except Exception:
117 | print('Unable to load checkpoint from "{}"'.format(fpath))
118 | raise
119 | return checkpoint
120 |
121 | class FeatureExtractor(object):
122 | """A simple API for feature extraction.
123 |
124 | FeatureExtractor can be used like a python function, which
125 | accepts input of the following types:
126 | - a list of strings (image paths)
127 | - a list of numpy.ndarray each with shape (H, W, C)
128 | - a single string (image path)
129 | - a single numpy.ndarray with shape (H, W, C)
130 | - a torch.Tensor with shape (B, C, H, W) or (C, H, W)
131 |
132 | Returned is a torch tensor with shape (B, D) where D is the
133 | feature dimension.
134 |
135 | Args:
136 | model_name (str): model name.
137 | model_path (str): path to model weights.
138 | image_size (sequence or int): image height and width.
139 | pixel_mean (list): pixel mean for normalization.
140 | pixel_std (list): pixel std for normalization.
141 | pixel_norm (bool): whether to normalize pixels.
142 | device (str): 'cpu' or 'cuda' (could be specific gpu devices).
143 | verbose (bool): show model details.
144 |
145 | Examples::
146 |
147 | from torchreid.utils import FeatureExtractor
148 |
149 | extractor = FeatureExtractor(
150 | model_name='osnet_x1_0',
151 | model_path='a/b/c/model.pth.tar',
152 | device='cuda'
153 | )
154 |
155 | image_list = [
156 | 'a/b/c/image001.jpg',
157 | 'a/b/c/image002.jpg',
158 | 'a/b/c/image003.jpg',
159 | 'a/b/c/image004.jpg',
160 | 'a/b/c/image005.jpg'
161 | ]
162 |
163 | features = extractor(image_list)
164 | print(features.shape) # output (5, 512)
165 | """
166 |
167 | def __init__(
168 | self,
169 | model_name='',
170 | model_path='',
171 | image_size=(256, 128), # (h, w)
172 | pixel_mean=[0.485, 0.456, 0.406],
173 | pixel_std=[0.229, 0.224, 0.225],
174 | pixel_norm=True,
175 | device='cuda',
176 | verbose=True
177 | ):
178 | # Build model
179 | model = build_model(
180 | model_name,
181 | num_classes=1,
182 | pretrained=False,
183 | use_gpu=device.startswith('cuda')
184 | )
185 | model.eval()
186 |
187 | if model_path and check_isfile(model_path):
188 | load_pretrained_weights(model, model_path)
189 |
190 | # Build transform functions
191 | transforms = []
192 | transforms += [T.Resize(image_size)]
193 | transforms += [T.ToTensor()]
194 | if pixel_norm:
195 | transforms += [T.Normalize(mean=pixel_mean, std=pixel_std)]
196 | preprocess = T.Compose(transforms)
197 |
198 | to_pil = T.ToPILImage()
199 |
200 | device = torch.device(device)
201 | model.to(device)
202 |
203 | # Class attributes
204 | self.model = model
205 | self.preprocess = preprocess
206 | self.to_pil = to_pil
207 | self.device = device
208 |
209 | def __call__(self, input):
210 | if isinstance(input, list):
211 | images = []
212 |
213 | for element in input:
214 | if isinstance(element, str):
215 | image = Image.open(element).convert('RGB')
216 |
217 | elif isinstance(element, np.ndarray):
218 | image = self.to_pil(element)
219 |
220 | else:
221 | raise TypeError(
222 | 'Type of each element must belong to [str | numpy.ndarray]'
223 | )
224 |
225 | image = self.preprocess(image)
226 | images.append(image)
227 |
228 | images = torch.stack(images, dim=0)
229 | images = images.to(self.device)
230 |
231 | elif isinstance(input, str):
232 | image = Image.open(input).convert('RGB')
233 | image = self.preprocess(image)
234 | images = image.unsqueeze(0).to(self.device)
235 |
236 | elif isinstance(input, np.ndarray):
237 | image = self.to_pil(input)
238 | image = self.preprocess(image)
239 | images = image.unsqueeze(0).to(self.device)
240 |
241 | elif isinstance(input, torch.Tensor):
242 | if input.dim() == 3:
243 | input = input.unsqueeze(0)
244 | images = input.to(self.device)
245 |
246 | else:
247 | raise NotImplementedError
248 |
249 | with torch.no_grad():
250 | features = self.model(images)
251 |
252 | return features
253 |
--------------------------------------------------------------------------------
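
One natural way to use the extracted features is to pair them with compute_distance_matrix from torchreid/distance.py; the sketch below is illustrative only, and the checkpoint and crop paths are placeholders.

from torchreid.feature_extractor import FeatureExtractor
from torchreid.distance import compute_distance_matrix

extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='models/osnet_x1_0.pth.tar',   # placeholder checkpoint path
    device='cuda')                            # or 'cpu'

query_feats = extractor(['query_crop.jpg'])                      # (1, 512)
gallery_feats = extractor(['gallery_crop1.jpg',
                           'gallery_crop2.jpg'])                 # (2, 512)

dist = compute_distance_matrix(query_feats, gallery_feats, metric='cosine')
print(dist.argmin(dim=1))   # closest gallery crop per query
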
/torchreid/models/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import torch
3 |
4 | from .pcb import *
5 | from .mlfn import *
6 | from .hacnn import *
7 | from .osnet import *
8 | from .senet import *
9 | from .mudeep import *
10 | from .nasnet import *
11 | from .resnet import *
12 | from .densenet import *
13 | from .xception import *
14 | from .osnet_ain import *
15 | from .resnetmid import *
16 | from .shufflenet import *
17 | from .squeezenet import *
18 | from .inceptionv4 import *
19 | from .mobilenetv2 import *
20 | from .resnet_ibn_a import *
21 | from .resnet_ibn_b import *
22 | from .shufflenetv2 import *
23 | from .inceptionresnetv2 import *
24 |
25 | __model_factory = {
26 | # image classification models
27 | 'resnet18': resnet18,
28 | 'resnet34': resnet34,
29 | 'resnet50': resnet50,
30 | 'resnet101': resnet101,
31 | 'resnet152': resnet152,
32 | 'resnext50_32x4d': resnext50_32x4d,
33 | 'resnext101_32x8d': resnext101_32x8d,
34 | 'resnet50_fc512': resnet50_fc512,
35 | 'se_resnet50': se_resnet50,
36 | 'se_resnet50_fc512': se_resnet50_fc512,
37 | 'se_resnet101': se_resnet101,
38 | 'se_resnext50_32x4d': se_resnext50_32x4d,
39 | 'se_resnext101_32x4d': se_resnext101_32x4d,
40 | 'densenet121': densenet121,
41 | 'densenet169': densenet169,
42 | 'densenet201': densenet201,
43 | 'densenet161': densenet161,
44 | 'densenet121_fc512': densenet121_fc512,
45 | 'inceptionresnetv2': inceptionresnetv2,
46 | 'inceptionv4': inceptionv4,
47 | 'xception': xception,
48 | 'resnet50_ibn_a': resnet50_ibn_a,
49 | 'resnet50_ibn_b': resnet50_ibn_b,
50 | # lightweight models
51 | 'nasnsetmobile': nasnetamobile,
52 | 'mobilenetv2_x1_0': mobilenetv2_x1_0,
53 | 'mobilenetv2_x1_4': mobilenetv2_x1_4,
54 | 'shufflenet': shufflenet,
55 | 'squeezenet1_0': squeezenet1_0,
56 | 'squeezenet1_0_fc512': squeezenet1_0_fc512,
57 | 'squeezenet1_1': squeezenet1_1,
58 | 'shufflenet_v2_x0_5': shufflenet_v2_x0_5,
59 | 'shufflenet_v2_x1_0': shufflenet_v2_x1_0,
60 | 'shufflenet_v2_x1_5': shufflenet_v2_x1_5,
61 | 'shufflenet_v2_x2_0': shufflenet_v2_x2_0,
62 | # reid-specific models
63 | 'mudeep': MuDeep,
64 | 'resnet50mid': resnet50mid,
65 | 'hacnn': HACNN,
66 | 'pcb_p6': pcb_p6,
67 | 'pcb_p4': pcb_p4,
68 | 'mlfn': mlfn,
69 | 'osnet_x1_0': osnet_x1_0,
70 | 'osnet_x0_75': osnet_x0_75,
71 | 'osnet_x0_5': osnet_x0_5,
72 | 'osnet_x0_25': osnet_x0_25,
73 | 'osnet_ibn_x1_0': osnet_ibn_x1_0,
74 | 'osnet_ain_x1_0': osnet_ain_x1_0
75 | }
76 |
77 |
78 | def show_avai_models():
79 | """Displays available models.
80 |
81 | Examples::
82 | >>> from torchreid import models
83 | >>> models.show_avai_models()
84 | """
85 | print(list(__model_factory.keys()))
86 |
87 |
88 | def build_model(
89 | name, num_classes, loss='softmax', pretrained=True, use_gpu=True
90 | ):
91 | """A function wrapper for building a model.
92 |
93 | Args:
94 | name (str): model name.
95 | num_classes (int): number of training identities.
96 | loss (str, optional): loss function to optimize the model. Currently
97 | supports "softmax" and "triplet". Default is "softmax".
98 | pretrained (bool, optional): whether to load ImageNet-pretrained weights.
99 | Default is True.
100 | use_gpu (bool, optional): whether to use gpu. Default is True.
101 |
102 | Returns:
103 | nn.Module
104 |
105 | Examples::
106 | >>> from torchreid import models
107 | >>> model = models.build_model('resnet50', 751, loss='softmax')
108 | """
109 | avai_models = list(__model_factory.keys())
110 | if name not in avai_models:
111 | raise KeyError(
112 | 'Unknown model: {}. Must be one of {}'.format(name, avai_models)
113 | )
114 | return __model_factory[name](
115 | num_classes=num_classes,
116 | loss=loss,
117 | pretrained=pretrained,
118 | use_gpu=use_gpu
119 | )
120 |
--------------------------------------------------------------------------------
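
When a model is only used for feature extraction (as FeatureExtractor does above with num_classes=1), the classifier head is never exercised, so any positive num_classes works. A minimal check, assuming the repo root is on the Python path:

from torchreid.models import show_avai_models, build_model

show_avai_models()    # prints the keys of __model_factory
model = build_model('osnet_x1_0', num_classes=1, pretrained=False)
model.eval()          # in eval mode the reid models return feature vectors, not logits
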
/torchreid/models/mudeep.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, absolute_import
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | __all__ = ['MuDeep']
7 |
8 |
9 | class ConvBlock(nn.Module):
10 | """Basic convolutional block.
11 |
12 | convolution + batch normalization + relu.
13 |
14 | Args:
15 | in_c (int): number of input channels.
16 | out_c (int): number of output channels.
17 | k (int or tuple): kernel size.
18 | s (int or tuple): stride.
19 | p (int or tuple): padding.
20 | """
21 |
22 | def __init__(self, in_c, out_c, k, s, p):
23 | super(ConvBlock, self).__init__()
24 | self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p)
25 | self.bn = nn.BatchNorm2d(out_c)
26 |
27 | def forward(self, x):
28 | return F.relu(self.bn(self.conv(x)))
29 |
30 |
31 | class ConvLayers(nn.Module):
32 | """Preprocessing layers."""
33 |
34 | def __init__(self):
35 | super(ConvLayers, self).__init__()
36 | self.conv1 = ConvBlock(3, 48, k=3, s=1, p=1)
37 | self.conv2 = ConvBlock(48, 96, k=3, s=1, p=1)
38 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
39 |
40 | def forward(self, x):
41 | x = self.conv1(x)
42 | x = self.conv2(x)
43 | x = self.maxpool(x)
44 | return x
45 |
46 |
47 | class MultiScaleA(nn.Module):
48 | """Multi-scale stream layer A (Sec.3.1)"""
49 |
50 | def __init__(self):
51 | super(MultiScaleA, self).__init__()
52 | self.stream1 = nn.Sequential(
53 | ConvBlock(96, 96, k=1, s=1, p=0),
54 | ConvBlock(96, 24, k=3, s=1, p=1),
55 | )
56 | self.stream2 = nn.Sequential(
57 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
58 | ConvBlock(96, 24, k=1, s=1, p=0),
59 | )
60 | self.stream3 = ConvBlock(96, 24, k=1, s=1, p=0)
61 | self.stream4 = nn.Sequential(
62 | ConvBlock(96, 16, k=1, s=1, p=0),
63 | ConvBlock(16, 24, k=3, s=1, p=1),
64 | ConvBlock(24, 24, k=3, s=1, p=1),
65 | )
66 |
67 | def forward(self, x):
68 | s1 = self.stream1(x)
69 | s2 = self.stream2(x)
70 | s3 = self.stream3(x)
71 | s4 = self.stream4(x)
72 | y = torch.cat([s1, s2, s3, s4], dim=1)
73 | return y
74 |
75 |
76 | class Reduction(nn.Module):
77 | """Reduction layer (Sec.3.1)"""
78 |
79 | def __init__(self):
80 | super(Reduction, self).__init__()
81 | self.stream1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
82 | self.stream2 = ConvBlock(96, 96, k=3, s=2, p=1)
83 | self.stream3 = nn.Sequential(
84 | ConvBlock(96, 48, k=1, s=1, p=0),
85 | ConvBlock(48, 56, k=3, s=1, p=1),
86 | ConvBlock(56, 64, k=3, s=2, p=1),
87 | )
88 |
89 | def forward(self, x):
90 | s1 = self.stream1(x)
91 | s2 = self.stream2(x)
92 | s3 = self.stream3(x)
93 | y = torch.cat([s1, s2, s3], dim=1)
94 | return y
95 |
96 |
97 | class MultiScaleB(nn.Module):
98 | """Multi-scale stream layer B (Sec.3.1)"""
99 |
100 | def __init__(self):
101 | super(MultiScaleB, self).__init__()
102 | self.stream1 = nn.Sequential(
103 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
104 | ConvBlock(256, 256, k=1, s=1, p=0),
105 | )
106 | self.stream2 = nn.Sequential(
107 | ConvBlock(256, 64, k=1, s=1, p=0),
108 | ConvBlock(64, 128, k=(1, 3), s=1, p=(0, 1)),
109 | ConvBlock(128, 256, k=(3, 1), s=1, p=(1, 0)),
110 | )
111 | self.stream3 = ConvBlock(256, 256, k=1, s=1, p=0)
112 | self.stream4 = nn.Sequential(
113 | ConvBlock(256, 64, k=1, s=1, p=0),
114 | ConvBlock(64, 64, k=(1, 3), s=1, p=(0, 1)),
115 | ConvBlock(64, 128, k=(3, 1), s=1, p=(1, 0)),
116 | ConvBlock(128, 128, k=(1, 3), s=1, p=(0, 1)),
117 | ConvBlock(128, 256, k=(3, 1), s=1, p=(1, 0)),
118 | )
119 |
120 | def forward(self, x):
121 | s1 = self.stream1(x)
122 | s2 = self.stream2(x)
123 | s3 = self.stream3(x)
124 | s4 = self.stream4(x)
125 | return s1, s2, s3, s4
126 |
127 |
128 | class Fusion(nn.Module):
129 | """Saliency-based learning fusion layer (Sec.3.2)"""
130 |
131 | def __init__(self):
132 | super(Fusion, self).__init__()
133 | self.a1 = nn.Parameter(torch.rand(1, 256, 1, 1))
134 | self.a2 = nn.Parameter(torch.rand(1, 256, 1, 1))
135 | self.a3 = nn.Parameter(torch.rand(1, 256, 1, 1))
136 | self.a4 = nn.Parameter(torch.rand(1, 256, 1, 1))
137 |
138 | # We add an average pooling layer to reduce the spatial dimension
139 | # of feature maps, which differs from the original paper.
140 | self.avgpool = nn.AvgPool2d(kernel_size=4, stride=4, padding=0)
141 |
142 | def forward(self, x1, x2, x3, x4):
143 | s1 = self.a1.expand_as(x1) * x1
144 | s2 = self.a2.expand_as(x2) * x2
145 | s3 = self.a3.expand_as(x3) * x3
146 | s4 = self.a4.expand_as(x4) * x4
147 | y = self.avgpool(s1 + s2 + s3 + s4)
148 | return y
149 |
150 |
151 | class MuDeep(nn.Module):
152 | """Multiscale deep neural network.
153 |
154 | Reference:
155 | Qian et al. Multi-scale Deep Learning Architectures
156 | for Person Re-identification. ICCV 2017.
157 |
158 | Public keys:
159 | - ``mudeep``: Multiscale deep neural network.
160 | """
161 |
162 | def __init__(self, num_classes, loss='softmax', **kwargs):
163 | super(MuDeep, self).__init__()
164 | self.loss = loss
165 |
166 | self.block1 = ConvLayers()
167 | self.block2 = MultiScaleA()
168 | self.block3 = Reduction()
169 | self.block4 = MultiScaleB()
170 | self.block5 = Fusion()
171 |
172 | # Due to this fully connected layer, input image has to be fixed
173 | # in shape, i.e. (3, 256, 128), such that the last convolutional feature
174 | # maps are of shape (256, 16, 8). If input shape is changed,
175 | # the input dimension of this layer has to be changed accordingly.
176 | self.fc = nn.Sequential(
177 | nn.Linear(256 * 16 * 8, 4096),
178 | nn.BatchNorm1d(4096),
179 | nn.ReLU(),
180 | )
181 | self.classifier = nn.Linear(4096, num_classes)
182 | self.feat_dim = 4096
183 |
184 | def featuremaps(self, x):
185 | x = self.block1(x)
186 | x = self.block2(x)
187 | x = self.block3(x)
188 | x = self.block4(x)
189 | x = self.block5(*x)
190 | return x
191 |
192 | def forward(self, x):
193 | x = self.featuremaps(x)
194 | x = x.view(x.size(0), -1)
195 | x = self.fc(x)
196 | y = self.classifier(x)
197 |
198 | if not self.training:
199 | return x
200 |
201 | if self.loss == 'softmax':
202 | return y
203 | elif self.loss == 'triplet':
204 | return y, x
205 | else:
206 | raise KeyError('Unsupported loss: {}'.format(self.loss))
207 |
--------------------------------------------------------------------------------
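
As the comment above the fc layer notes, MuDeep only accepts (3, 256, 128) inputs; in eval mode the forward pass returns the 4096-d feature instead of class logits. A quick, illustrative shape check:

import torch
from torchreid.models.mudeep import MuDeep

model = MuDeep(num_classes=10).eval()
x = torch.rand(2, 3, 256, 128)       # any other resolution breaks the 256*16*8 fc input
with torch.no_grad():
    feat = model(x)
print(feat.shape)                    # torch.Size([2, 4096])
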
/torchreid/models/shufflenet.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, absolute_import
2 | import torch
3 | import torch.utils.model_zoo as model_zoo
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 | __all__ = ['shufflenet']
8 |
9 | model_urls = {
10 | # training epoch = 90, top1 = 61.8
11 | 'imagenet':
12 | 'https://mega.nz/#!RDpUlQCY!tr_5xBEkelzDjveIYBBcGcovNCOrgfiJO9kiidz9fZM',
13 | }
14 |
15 |
16 | class ChannelShuffle(nn.Module):
17 |
18 | def __init__(self, num_groups):
19 | super(ChannelShuffle, self).__init__()
20 | self.g = num_groups
21 |
22 | def forward(self, x):
23 | b, c, h, w = x.size()
24 | n = c // self.g
25 | # reshape
26 | x = x.view(b, self.g, n, h, w)
27 | # transpose
28 | x = x.permute(0, 2, 1, 3, 4).contiguous()
29 | # flatten
30 | x = x.view(b, c, h, w)
31 | return x
32 |
33 |
34 | class Bottleneck(nn.Module):
35 |
36 | def __init__(
37 | self,
38 | in_channels,
39 | out_channels,
40 | stride,
41 | num_groups,
42 | group_conv1x1=True
43 | ):
44 | super(Bottleneck, self).__init__()
45 | assert stride in [1, 2], 'Warning: stride must be either 1 or 2'
46 | self.stride = stride
47 | mid_channels = out_channels // 4
48 | if stride == 2:
49 | out_channels -= in_channels
50 | # group conv is not applied to first conv1x1 at stage 2
51 | num_groups_conv1x1 = num_groups if group_conv1x1 else 1
52 | self.conv1 = nn.Conv2d(
53 | in_channels,
54 | mid_channels,
55 | 1,
56 | groups=num_groups_conv1x1,
57 | bias=False
58 | )
59 | self.bn1 = nn.BatchNorm2d(mid_channels)
60 | self.shuffle1 = ChannelShuffle(num_groups)
61 | self.conv2 = nn.Conv2d(
62 | mid_channels,
63 | mid_channels,
64 | 3,
65 | stride=stride,
66 | padding=1,
67 | groups=mid_channels,
68 | bias=False
69 | )
70 | self.bn2 = nn.BatchNorm2d(mid_channels)
71 | self.conv3 = nn.Conv2d(
72 | mid_channels, out_channels, 1, groups=num_groups, bias=False
73 | )
74 | self.bn3 = nn.BatchNorm2d(out_channels)
75 | if stride == 2:
76 | self.shortcut = nn.AvgPool2d(3, stride=2, padding=1)
77 |
78 | def forward(self, x):
79 | out = F.relu(self.bn1(self.conv1(x)))
80 | out = self.shuffle1(out)
81 | out = self.bn2(self.conv2(out))
82 | out = self.bn3(self.conv3(out))
83 | if self.stride == 2:
84 | res = self.shortcut(x)
85 | out = F.relu(torch.cat([res, out], 1))
86 | else:
87 | out = F.relu(x + out)
88 | return out
89 |
90 |
91 | # configuration of (num_groups: #out_channels) based on Table 1 in the paper
92 | cfg = {
93 | 1: [144, 288, 576],
94 | 2: [200, 400, 800],
95 | 3: [240, 480, 960],
96 | 4: [272, 544, 1088],
97 | 8: [384, 768, 1536],
98 | }
99 |
100 |
101 | class ShuffleNet(nn.Module):
102 | """ShuffleNet.
103 |
104 | Reference:
105 | Zhang et al. ShuffleNet: An Extremely Efficient Convolutional Neural
106 | Network for Mobile Devices. CVPR 2018.
107 |
108 | Public keys:
109 | - ``shufflenet``: ShuffleNet (groups=3).
110 | """
111 |
112 | def __init__(self, num_classes, loss='softmax', num_groups=3, **kwargs):
113 | super(ShuffleNet, self).__init__()
114 | self.loss = loss
115 |
116 | self.conv1 = nn.Sequential(
117 | nn.Conv2d(3, 24, 3, stride=2, padding=1, bias=False),
118 | nn.BatchNorm2d(24),
119 | nn.ReLU(),
120 | nn.MaxPool2d(3, stride=2, padding=1),
121 | )
122 |
123 | self.stage2 = nn.Sequential(
124 | Bottleneck(
125 | 24, cfg[num_groups][0], 2, num_groups, group_conv1x1=False
126 | ),
127 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups),
128 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups),
129 | Bottleneck(cfg[num_groups][0], cfg[num_groups][0], 1, num_groups),
130 | )
131 |
132 | self.stage3 = nn.Sequential(
133 | Bottleneck(cfg[num_groups][0], cfg[num_groups][1], 2, num_groups),
134 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
135 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
136 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
137 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
138 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
139 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
140 | Bottleneck(cfg[num_groups][1], cfg[num_groups][1], 1, num_groups),
141 | )
142 |
143 | self.stage4 = nn.Sequential(
144 | Bottleneck(cfg[num_groups][1], cfg[num_groups][2], 2, num_groups),
145 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups),
146 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups),
147 | Bottleneck(cfg[num_groups][2], cfg[num_groups][2], 1, num_groups),
148 | )
149 |
150 | self.classifier = nn.Linear(cfg[num_groups][2], num_classes)
151 | self.feat_dim = cfg[num_groups][2]
152 |
153 | def forward(self, x):
154 | x = self.conv1(x)
155 | x = self.stage2(x)
156 | x = self.stage3(x)
157 | x = self.stage4(x)
158 | x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), -1)
159 |
160 | if not self.training:
161 | return x
162 |
163 | y = self.classifier(x)
164 |
165 | if self.loss == 'softmax':
166 | return y
167 | elif self.loss == 'triplet':
168 | return y, x
169 | else:
170 | raise KeyError('Unsupported loss: {}'.format(self.loss))
171 |
172 |
173 | def init_pretrained_weights(model, model_url):
174 | """Initializes model with pretrained weights.
175 |
176 | Layers that don't match with pretrained layers in name or size are kept unchanged.
177 | """
178 | pretrain_dict = model_zoo.load_url(model_url)
179 | model_dict = model.state_dict()
180 | pretrain_dict = {
181 | k: v
182 | for k, v in pretrain_dict.items()
183 | if k in model_dict and model_dict[k].size() == v.size()
184 | }
185 | model_dict.update(pretrain_dict)
186 | model.load_state_dict(model_dict)
187 |
188 |
189 | def shufflenet(num_classes, loss='softmax', pretrained=True, **kwargs):
190 | model = ShuffleNet(num_classes, loss, **kwargs)
191 | if pretrained:
192 | # init_pretrained_weights(model, model_urls['imagenet'])
193 | import warnings
194 | warnings.warn(
195 | 'The imagenet pretrained weights need to be manually downloaded from {}'
196 | .format(model_urls['imagenet'])
197 | )
198 | return model
199 |
--------------------------------------------------------------------------------
/torchreid/models/squeezenet.py:
--------------------------------------------------------------------------------
1 | """
2 | Code source: https://github.com/pytorch/vision
3 | """
4 | from __future__ import division, absolute_import
5 | import torch
6 | import torch.nn as nn
7 | import torch.utils.model_zoo as model_zoo
8 |
9 | __all__ = ['squeezenet1_0', 'squeezenet1_1', 'squeezenet1_0_fc512']
10 |
11 | model_urls = {
12 | 'squeezenet1_0':
13 | 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth',
14 | 'squeezenet1_1':
15 | 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth',
16 | }
17 |
18 |
19 | class Fire(nn.Module):
20 |
21 | def __init__(
22 | self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes
23 | ):
24 | super(Fire, self).__init__()
25 | self.inplanes = inplanes
26 | self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1)
27 | self.squeeze_activation = nn.ReLU(inplace=True)
28 | self.expand1x1 = nn.Conv2d(
29 | squeeze_planes, expand1x1_planes, kernel_size=1
30 | )
31 | self.expand1x1_activation = nn.ReLU(inplace=True)
32 | self.expand3x3 = nn.Conv2d(
33 | squeeze_planes, expand3x3_planes, kernel_size=3, padding=1
34 | )
35 | self.expand3x3_activation = nn.ReLU(inplace=True)
36 |
37 | def forward(self, x):
38 | x = self.squeeze_activation(self.squeeze(x))
39 | return torch.cat(
40 | [
41 | self.expand1x1_activation(self.expand1x1(x)),
42 | self.expand3x3_activation(self.expand3x3(x))
43 | ], 1
44 | )
45 |
46 |
47 | class SqueezeNet(nn.Module):
48 | """SqueezeNet.
49 |
50 | Reference:
51 | Iandola et al. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters
52 | and <0.5MB model size. arXiv:1602.07360.
53 |
54 | Public keys:
55 | - ``squeezenet1_0``: SqueezeNet (version=1.0).
56 | - ``squeezenet1_1``: SqueezeNet (version=1.1).
57 | - ``squeezenet1_0_fc512``: SqueezeNet (version=1.0) + FC.
58 | """
59 |
60 | def __init__(
61 | self,
62 | num_classes,
63 | loss,
64 | version=1.0,
65 | fc_dims=None,
66 | dropout_p=None,
67 | **kwargs
68 | ):
69 | super(SqueezeNet, self).__init__()
70 | self.loss = loss
71 | self.feature_dim = 512
72 |
73 | if version not in [1.0, 1.1]:
74 | raise ValueError(
75 | 'Unsupported SqueezeNet version {version}:'
76 | '1.0 or 1.1 expected'.format(version=version)
77 | )
78 |
79 | if version == 1.0:
80 | self.features = nn.Sequential(
81 | nn.Conv2d(3, 96, kernel_size=7, stride=2),
82 | nn.ReLU(inplace=True),
83 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
84 | Fire(96, 16, 64, 64),
85 | Fire(128, 16, 64, 64),
86 | Fire(128, 32, 128, 128),
87 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
88 | Fire(256, 32, 128, 128),
89 | Fire(256, 48, 192, 192),
90 | Fire(384, 48, 192, 192),
91 | Fire(384, 64, 256, 256),
92 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
93 | Fire(512, 64, 256, 256),
94 | )
95 | else:
96 | self.features = nn.Sequential(
97 | nn.Conv2d(3, 64, kernel_size=3, stride=2),
98 | nn.ReLU(inplace=True),
99 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
100 | Fire(64, 16, 64, 64),
101 | Fire(128, 16, 64, 64),
102 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
103 | Fire(128, 32, 128, 128),
104 | Fire(256, 32, 128, 128),
105 | nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
106 | Fire(256, 48, 192, 192),
107 | Fire(384, 48, 192, 192),
108 | Fire(384, 64, 256, 256),
109 | Fire(512, 64, 256, 256),
110 | )
111 |
112 | self.global_avgpool = nn.AdaptiveAvgPool2d(1)
113 | self.fc = self._construct_fc_layer(fc_dims, 512, dropout_p)
114 | self.classifier = nn.Linear(self.feature_dim, num_classes)
115 |
116 | self._init_params()
117 |
118 | def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None):
119 | """Constructs fully connected layer
120 |
121 | Args:
122 | fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed
123 | input_dim (int): input dimension
124 | dropout_p (float): dropout probability, if None, dropout is unused
125 | """
126 | if fc_dims is None:
127 | self.feature_dim = input_dim
128 | return None
129 |
130 | assert isinstance(
131 | fc_dims, (list, tuple)
132 | ), 'fc_dims must be either list or tuple, but got {}'.format(
133 | type(fc_dims)
134 | )
135 |
136 | layers = []
137 | for dim in fc_dims:
138 | layers.append(nn.Linear(input_dim, dim))
139 | layers.append(nn.BatchNorm1d(dim))
140 | layers.append(nn.ReLU(inplace=True))
141 | if dropout_p is not None:
142 | layers.append(nn.Dropout(p=dropout_p))
143 | input_dim = dim
144 |
145 | self.feature_dim = fc_dims[-1]
146 |
147 | return nn.Sequential(*layers)
148 |
149 | def _init_params(self):
150 | for m in self.modules():
151 | if isinstance(m, nn.Conv2d):
152 | nn.init.kaiming_normal_(
153 | m.weight, mode='fan_out', nonlinearity='relu'
154 | )
155 | if m.bias is not None:
156 | nn.init.constant_(m.bias, 0)
157 | elif isinstance(m, nn.BatchNorm2d):
158 | nn.init.constant_(m.weight, 1)
159 | nn.init.constant_(m.bias, 0)
160 | elif isinstance(m, nn.BatchNorm1d):
161 | nn.init.constant_(m.weight, 1)
162 | nn.init.constant_(m.bias, 0)
163 | elif isinstance(m, nn.Linear):
164 | nn.init.normal_(m.weight, 0, 0.01)
165 | if m.bias is not None:
166 | nn.init.constant_(m.bias, 0)
167 |
168 | def forward(self, x):
169 | f = self.features(x)
170 | v = self.global_avgpool(f)
171 | v = v.view(v.size(0), -1)
172 |
173 | if self.fc is not None:
174 | v = self.fc(v)
175 |
176 | if not self.training:
177 | return v
178 |
179 | y = self.classifier(v)
180 |
181 | if self.loss == 'softmax':
182 | return y
183 | elif self.loss == 'triplet':
184 | return y, v
185 | else:
186 | raise KeyError('Unsupported loss: {}'.format(self.loss))
187 |
188 |
189 | def init_pretrained_weights(model, model_url):
190 | """Initializes model with pretrained weights.
191 |
192 | Layers that don't match with pretrained layers in name or size are kept unchanged.
193 | """
194 | pretrain_dict = model_zoo.load_url(model_url, map_location=None)
195 | model_dict = model.state_dict()
196 | pretrain_dict = {
197 | k: v
198 | for k, v in pretrain_dict.items()
199 | if k in model_dict and model_dict[k].size() == v.size()
200 | }
201 | model_dict.update(pretrain_dict)
202 | model.load_state_dict(model_dict)
203 |
204 |
205 | def squeezenet1_0(num_classes, loss='softmax', pretrained=True, **kwargs):
206 | model = SqueezeNet(
207 | num_classes, loss, version=1.0, fc_dims=None, dropout_p=None, **kwargs
208 | )
209 | if pretrained:
210 | init_pretrained_weights(model, model_urls['squeezenet1_0'])
211 | return model
212 |
213 |
214 | def squeezenet1_0_fc512(
215 | num_classes, loss='softmax', pretrained=True, **kwargs
216 | ):
217 | model = SqueezeNet(
218 | num_classes,
219 | loss,
220 | version=1.0,
221 | fc_dims=[512],
222 | dropout_p=None,
223 | **kwargs
224 | )
225 | if pretrained:
226 | init_pretrained_weights(model, model_urls['squeezenet1_0'])
227 | return model
228 |
229 |
230 | def squeezenet1_1(num_classes, loss='softmax', pretrained=True, **kwargs):
231 | model = SqueezeNet(
232 | num_classes, loss, version=1.1, fc_dims=None, dropout_p=None, **kwargs
233 | )
234 | if pretrained:
235 | init_pretrained_weights(model, model_urls['squeezenet1_1'])
236 | return model
237 |
--------------------------------------------------------------------------------
/track_to_json.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # convert the detections or mtsc txt file into json for each frame
3 | import sys, os, json, argparse
4 |
5 | from tqdm import tqdm
6 |
7 | from class_ids import targetClass2id_new_nopo, targetAct2id_bupt
8 |
9 | targetClass2id = targetClass2id_new_nopo
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument("filepath", help="all txt files for each video")
13 | parser.add_argument("videonamelst")
14 | parser.add_argument("cat_name")
15 | parser.add_argument("despath", help="despath/videoname_F_08d.json, index from 0")
16 | parser.add_argument("--bupt_exp", action="store_true")
17 |
18 |
19 | if __name__ == "__main__":
20 | args = parser.parse_args()
21 |
22 | videonames = [os.path.splitext(os.path.basename(line.strip()))[0] for line in open(args.videonamelst,"r").readlines()]
23 |
24 | if not os.path.exists(args.despath):
25 | os.makedirs(args.despath)
26 |
27 | if args.bupt_exp:
28 | targetClass2id = targetAct2id_bupt
29 |
30 | for videoname in tqdm(videonames, ascii=True):
31 | detfile = os.path.join(args.filepath, "%s.txt"%videoname)
32 |
33 | data = {} # frame -> boxes
34 |
35 | for line in open(detfile, "r").readlines():
36 | # note the frameIdx starts from 1
37 | frameIdx, track_id, left, top, width, height, conf, _, _, _ = line.strip().split(",")
38 | frameIdx = int(frameIdx) - 1 # note here I made a mistake, gt is 1-indexed, but our obj_tracking output is 0-indexed
39 |
40 | track_id = int(track_id)
41 |
42 | box = [float(left), float(top), float(width), float(height)]
43 |
44 | #if not data.has_key(frameIdx):
45 | if not frameIdx in data:
46 | data[frameIdx] = []
47 | data[frameIdx].append({
48 | "category_id": targetClass2id[args.cat_name],
49 | "cat_name": args.cat_name,
50 | "score":float(round(float(conf), 7)),
51 | "bbox": box,
52 | "segmentation": None,
53 | "trackId": track_id
54 | })
55 |
56 | for frameIndex in data:
57 |
58 | annofile = os.path.join(args.despath, "%s_F_%08d.json"%(videoname, frameIndex))
59 |
60 | with open(annofile, "w") as f:
61 | json.dump(data[frameIndex], f)
62 |
63 |
--------------------------------------------------------------------------------
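
An example invocation; all paths here are placeholders. The script expects one MOT-format txt per video under the first argument and writes one json per frame into despath:

python track_to_json.py person_mtsc_out/ video_list.txt Person person_track_json/
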
/tracks_to_json.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # convert the detections or mtsc txt file into json for each frame
3 | import sys, os, json, argparse
4 |
5 | from tqdm import tqdm
6 | from glob import glob
7 |
8 | from class_ids import targetClass2id_new_nopo, targetAct2id_bupt
9 |
10 | targetClass2id = targetClass2id_new_nopo
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("filepath", help="all txt files for each video")
14 | parser.add_argument("videonamelst")
15 | parser.add_argument("despath", help="despath/videoname_F_08d.json, index from 0")
16 | parser.add_argument("--bupt_exp", action="store_true")
17 |
18 |
19 | if __name__ == "__main__":
20 | args = parser.parse_args()
21 |
22 | # leave the .mp4
23 | videonames = [os.path.basename(line.strip()) for line in open(args.videonamelst,"r").readlines()]
24 |
25 | if not os.path.exists(args.despath):
26 | os.makedirs(args.despath)
27 |
28 | if args.bupt_exp:
29 | targetClass2id = targetAct2id_bupt
30 |
31 | for videoname in tqdm(videonames, ascii=True):
32 | #detfile = os.path.join(args.filepath, "%s.txt"%videoname)
33 | detfiles = glob(os.path.join(args.filepath, videoname, "*", "%s.txt" % (os.path.splitext(videoname)[0])))
34 |
35 | data = {} # frame -> boxes
36 | for detfile in detfiles:
37 | #cat_name = detfile.split("/")[-2] # this does not work under windows
38 | # 1. norm the path, in windows "/" will be converted to "\"
39 | detfile = os.path.normpath(detfile)
40 | # 2. split the path using os specific separator
41 | cat_name = detfile.split(os.sep)[-2]
42 | for line in open(detfile, "r").readlines():
43 | # note the frameIdx starts from 1?
44 | frameIdx, track_id, left, top, width, height, conf, _, _, _ = line.strip().split(",")
45 | # (?) note here I made a mistake, gt is 1-indexed, but our obj_tracking output is 0-indexed
46 | #frameIdx = int(frameIdx) - 1
47 | frameIdx = int(frameIdx)
48 |
49 | track_id = int(track_id)
50 |
51 | box = [float(left), float(top), float(width), float(height)]
52 |
53 | #if not data.has_key(frameIdx):
54 | if not frameIdx in data:
55 | data[frameIdx] = []
56 | data[frameIdx].append({
57 | "category_id": targetClass2id[cat_name],
58 | "cat_name": cat_name,
59 | "score": float(round(float(conf), 7)),
60 | "bbox": box,
61 | "segmentation": None,
62 | "trackId": track_id
63 | })
64 |
65 | for frameIndex in data:
66 |
67 | annofile = os.path.join(args.despath, "%s_F_%08d.json"%(os.path.splitext(videoname)[0], frameIndex))
68 |
69 | with open(annofile, "w") as f:
70 | json.dump(data[frameIndex], f)
71 |
72 |
--------------------------------------------------------------------------------
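
Unlike track_to_json.py, this script expects per-category subdirectories, i.e. filepath/${videoname_with_ext}/${cat_name}/${videoname_without_ext}.txt, and the video name list keeps the file extension. An illustrative invocation with placeholder paths:

# expected layout: mtsc_out/some_video.mp4/Person/some_video.txt, mtsc_out/some_video.mp4/Vehicle/some_video.txt, ...
python tracks_to_json.py mtsc_out/ video_list_with_ext.txt track_json/
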
/vis_tracks.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # given MOT track file path, visualize into videos
3 | import argparse
4 | import cv2
5 | import random
6 | import os
7 | import sys
8 |
9 | from tqdm import tqdm
10 | from glob import glob
11 | import numpy as np
12 |
13 | import matplotlib.colors as mcolors # to get a list of colors
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("track_path")
17 | parser.add_argument("frame_path")
18 | parser.add_argument("video_name_lst")
19 | parser.add_argument("out_path")
20 | parser.add_argument("--show_only_global", action="store_true")
21 |
22 | def hex_color_to_rgb(s):
23 | r = int(s[1:3], 16)
24 | g = int(s[3:5], 16)
25 | b = int(s[5:7], 16)
26 | return (r, g, b) # (0-255)
27 |
28 | def load_track_file(file_path, cat_names):
29 |
30 | track_data = {} # frame_id -> {cat_name: }
31 | video_name = os.path.splitext(os.path.basename(file_path))[0]
32 | for cat_name in cat_names:
33 | track_file_path = os.path.join(file_path, cat_name, video_name + ".txt")
34 | data = []
35 | with open(track_file_path, "r") as f:
36 | for line in f:
37 | frame_idx, track_id, left, top, width, height, conf, gid, _, _ = line.strip().split(",")
38 | data.append([frame_idx, track_id, left, top, width, height, conf, gid])
39 |
40 | data = np.array(data, dtype="float32") # [N, 8]
41 | frame_ids = np.unique(data[:, 0]).tolist()
42 |
43 | for frame_id in frame_ids:
44 | if frame_id not in track_data:
45 | track_data[frame_id] = {}
46 | track_data[frame_id][cat_name] = data[data[:, 0] == frame_id, :]
47 | return track_data
48 |
49 |
50 | def get_or_create_color_from_dict(key, color_dict, color_list):
51 | if key not in color_dict:
52 | this_color = color_list.pop()
53 |
54 | color_dict[key] = hex_color_to_rgb(color_name_to_hex[this_color])
55 | # recycle it
56 | color_list.insert(0, this_color)
57 | color = color_dict[key]  # use the dict that was passed in, not the global color_assign
58 | return color
59 |
60 | def draw_boxes(im, boxes, labels=None, colors=None, font_scale=0.6,
61 | font_thick=1, box_thick=1, bottom_text=False, offsets=None):
62 | if not boxes:
63 | return im
64 |
65 | boxes = np.asarray(boxes, dtype="int")
66 |
67 | FONT = cv2.FONT_HERSHEY_SIMPLEX
68 | FONT_SCALE = font_scale
69 |
70 | if labels is not None:
71 | assert len(labels) == len(boxes), "{} != {}".format(len(labels), len(boxes))
72 | if colors is not None:
73 | assert len(labels) == len(colors)
74 |
75 | areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
76 | sorted_inds = np.argsort(-areas) # draw large ones first
77 | assert areas.min() > 0, areas.min()
78 |
79 | im = im.copy()
80 |
81 | for i in sorted_inds:
82 | box = boxes[i, :]
83 | if box[0] < 0 or box[1] < 0 or box[2] < 0 or box[3] < 0:
84 | continue
85 |
86 | color = (218, 218, 218)
87 | if colors is not None:
88 | color = colors[i]
89 |
90 | best_color = color
91 |
92 | lineh = 2 # for box enlarging, replace with text height if there is label
93 | if labels is not None:
94 | label = labels[i]
95 |
96 | # find the best placement for the text
97 | ((linew, lineh), _) = cv2.getTextSize(label, FONT, FONT_SCALE, font_thick)
98 | bottom_left = [box[0] + 1, box[1] - 0.3 * lineh]
99 | top_left = [box[0] + 1, box[1] - 1.3 * lineh]
100 | if top_left[1] < 0: # out of image
101 | top_left[1] = box[3] - 1.3 * lineh
102 | bottom_left[1] = box[3] - 0.3 * lineh
103 |
104 | textbox = [int(top_left[0]), int(top_left[1]),
105 | int(top_left[0] + linew), int(top_left[1] + lineh)]
106 | #textbox.clip_by_shape(im.shape[:2])
107 |
108 | offset = 0
109 | if offsets is not None:
110 | offset = lineh * offsets[i]
111 |
112 | if bottom_text:
113 | cv2.putText(im, label, (box[0] + 2, box[3] - 4 + offset),
114 | FONT, FONT_SCALE, color=best_color, thickness=font_thick)
115 | else:
116 | cv2.putText(im, label, (textbox[0], textbox[3] - offset),
117 | FONT, FONT_SCALE, color=best_color, thickness=font_thick)
118 |
119 | # expand the box on y axis for overlapping results
120 | offset = 0
121 | if offsets is not None:
122 | offset = lineh * offsets[i]
123 | box[0] -= box_thick * offsets[i] + 1
124 | box[2] += box_thick * offsets[i] + 1
125 | if bottom_text:
126 | box[1] -= box_thick * offsets[i] + 1
127 | box[3] += offset
128 | else:
129 | box[3] += box_thick * offsets[i] + 1
130 | box[1] -= offset
131 |
132 | cv2.rectangle(im, (box[0], box[1]), (box[2], box[3]),
133 | color=best_color, thickness=box_thick)
134 | return im
135 |
136 | color_name_to_hex = mcolors.CSS4_COLORS.copy() # {'whitesmoke': '#F5F5F5', ...}
137 | if __name__ == "__main__":
138 | args = parser.parse_args()
139 |
140 | color_name_list = sorted(list(color_name_to_hex.keys()))[:]
141 | random.seed(69)
142 | random.shuffle(color_name_list)
143 |
144 | color_assign = {} # (global/local track id, cat_name) -> rgb color
145 |
146 | if not os.path.exists(args.out_path):
147 | os.makedirs(args.out_path)
148 |
149 | video_names = [os.path.basename(line.strip()) # with .avi
150 | for line in open(args.video_name_lst, "r").readlines()]
151 | for video_name in tqdm(video_names):
152 | video_name_no_appendix = os.path.splitext(video_name)[0]
153 | frames = glob(os.path.join(args.frame_path, video_name_no_appendix, "*.jpg"))
154 | frames.sort()
155 |
156 | # frame_id -> {cat_name: ..}
157 | track_data = load_track_file(
158 | os.path.join(args.track_path, video_name),
159 | ["Person", "Vehicle"])
160 |
161 | target_file = os.path.join(args.out_path, "%s.mp4" % video_name_no_appendix)
162 |
163 | fourcc = cv2.VideoWriter_fourcc(*"mp4v")
164 | fps = 30.0
165 | video_writer = cv2.VideoWriter(target_file, fourcc, fps, (1920, 1080), True)
166 |
167 | count_global_ids = {}
168 | for frame in frames:
169 | filename = os.path.splitext(os.path.basename(frame))[0]
170 | frame_id = int(filename.split("_F_")[-1])
171 |
172 | boxes = []
173 | labels = []
174 | box_colors = []
175 | if frame_id in track_data:
176 | this_track_data = track_data[frame_id]
177 | for cat_name in this_track_data:
178 | for box_data in this_track_data[cat_name]: # [N, 8]
179 | # get color and label
180 | local_track_id = box_data[1]
181 | global_track_id = box_data[7]
182 | if global_track_id != -1:
183 | color_key = (global_track_id, cat_name)
184 | count_global_ids[color_key] = 1
185 | track_id = "g%s" % global_track_id
186 | else:
187 | if args.show_only_global:
188 | continue
189 | color_key = (video_name, local_track_id, cat_name)
190 | track_id = local_track_id
191 | color = get_or_create_color_from_dict(
192 | color_key, color_assign, color_name_list)
193 | box_colors.append(color)
194 |
195 | conf = box_data[6]
196 | conf_str = ""
197 | if conf != 1.:
198 | conf_str = "%.2f" % conf
199 | labels.append("%s #%s %s"%(cat_name, track_id, conf_str))
200 |
201 | tlwh = box_data[2:6]
202 | tlbr = [tlwh[0], tlwh[1], tlwh[0] + tlwh[2], tlwh[1] + tlwh[3]]
203 | boxes.append(tlbr)
204 |
205 | new_im = cv2.imread(frame, cv2.IMREAD_COLOR)
206 | new_im = draw_boxes(new_im, boxes, labels, box_colors, font_scale=0.8,
207 | font_thick=2, box_thick=2, bottom_text=False)
208 | # write the frame idx
209 | new_im = cv2.putText(new_im, "# %d" % frame_id,
210 | (0, 20), cv2.FONT_HERSHEY_SIMPLEX,
211 | 1, (0, 255, 0), 2)
212 | # the frames might not be 1920x1080
213 | new_im = cv2.resize(new_im, (1920, 1080))
214 | video_writer.write(new_im)
215 |
216 | video_writer.release()
217 | tqdm.write("%s has %s global tracks:%s" % (
218 | video_name, len(count_global_ids), count_global_ids.keys()))
219 | cv2.destroyAllWindows()
220 |
--------------------------------------------------------------------------------
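
An example invocation with placeholder paths. track_path must contain, per video, the "Person" and "Vehicle" subfolders read by load_track_file; frame_path holds the extracted frames named ${videoname}_F_%08d.jpg; --show_only_global skips boxes whose global id is -1:

python vis_tracks.py global_tracks/ extracted_frames/ video_list.txt vis_videos/ --show_only_global
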