├── README.md ├── configs ├── Base-RCNN-C4.yaml ├── Base-RCNN-DilatedC5.yaml ├── Base-RCNN-FPN.yaml ├── Base-RetinaNet.yaml ├── COCO-Detection │ ├── fast_rcnn_R_50_FPN_1x.yaml │ ├── faster_rcnn_R_101_C4_3x.yaml │ ├── faster_rcnn_R_101_DC5_3x.yaml │ ├── faster_rcnn_R_101_FPN_3x.yaml │ ├── faster_rcnn_R_50_C4_1x.yaml │ ├── faster_rcnn_R_50_C4_3x.yaml │ ├── faster_rcnn_R_50_DC5_1x.yaml │ ├── faster_rcnn_R_50_DC5_3x.yaml │ ├── faster_rcnn_R_50_FPN_1x.yaml │ ├── faster_rcnn_R_50_FPN_3x.yaml │ ├── faster_rcnn_X_101_32x8d_FPN_3x.yaml │ ├── retinanet_R_101_FPN_3x.yaml │ ├── retinanet_R_50_FPN_1x.yaml │ ├── retinanet_R_50_FPN_3x.yaml │ ├── rpn_R_50_C4_1x.yaml │ └── rpn_R_50_FPN_1x.yaml ├── COCO-InstanceSegmentation │ ├── mask_rcnn_R_101_C4_3x.yaml │ ├── mask_rcnn_R_101_DC5_3x.yaml │ ├── mask_rcnn_R_101_FPN_3x.yaml │ ├── mask_rcnn_R_50_C4_1x.yaml │ ├── mask_rcnn_R_50_C4_3x.yaml │ ├── mask_rcnn_R_50_DC5_1x.yaml │ ├── mask_rcnn_R_50_DC5_3x.yaml │ ├── mask_rcnn_R_50_FPN_1x.yaml │ ├── mask_rcnn_R_50_FPN_3x.yaml │ └── mask_rcnn_X_101_32x8d_FPN_3x.yaml ├── COCO-Keypoints │ ├── Base-Keypoint-RCNN-FPN.yaml │ ├── keypoint_rcnn_R_101_FPN_3x.yaml │ ├── keypoint_rcnn_R_50_FPN_1x.yaml │ ├── keypoint_rcnn_R_50_FPN_3x.yaml │ └── keypoint_rcnn_X_101_32x8d_FPN_3x.yaml ├── COCO-PanopticSegmentation │ ├── Base-Panoptic-FPN.yaml │ ├── panoptic_fpn_R_101_3x.yaml │ ├── panoptic_fpn_R_50_1x.yaml │ └── panoptic_fpn_R_50_3x.yaml ├── Cityscapes │ └── mask_rcnn_R_50_FPN.yaml ├── Detectron1-Comparisons │ ├── README.md │ ├── faster_rcnn_R_50_FPN_noaug_1x.yaml │ ├── keypoint_rcnn_R_50_FPN_1x.yaml │ └── mask_rcnn_R_50_FPN_noaug_1x.yaml ├── LVIS-InstanceSegmentation │ ├── mask_rcnn_R_101_FPN_1x.yaml │ ├── mask_rcnn_R_50_FPN_1x.yaml │ └── mask_rcnn_X_101_32x8d_FPN_1x.yaml ├── Misc │ ├── cascade_mask_rcnn_R_50_FPN_1x.yaml │ ├── cascade_mask_rcnn_R_50_FPN_3x.yaml │ ├── cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml │ ├── mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml │ ├── mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml │ ├── mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml │ ├── mask_rcnn_R_50_FPN_3x_gn.yaml │ ├── mask_rcnn_R_50_FPN_3x_syncbn.yaml │ ├── panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml │ ├── scratch_mask_rcnn_R_50_FPN_3x_gn.yaml │ └── semantic_R_50_FPN_1x.yaml ├── PascalVOC-Detection │ ├── faster_rcnn_R_50_C4.yaml │ └── faster_rcnn_R_50_FPN.yaml └── quick_schedules │ ├── README.md │ ├── fast_rcnn_R_50_FPN_inference_acc_test.yaml │ ├── fast_rcnn_R_50_FPN_instant_test.yaml │ ├── keypoint_rcnn_R_50_FPN_inference_acc_test.yaml │ ├── keypoint_rcnn_R_50_FPN_instant_test.yaml │ ├── keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml │ ├── keypoint_rcnn_R_50_FPN_training_acc_test.yaml │ ├── mask_rcnn_R_50_C4_inference_acc_test.yaml │ ├── mask_rcnn_R_50_C4_instant_test.yaml │ ├── mask_rcnn_R_50_C4_training_acc_test.yaml │ ├── mask_rcnn_R_50_DC5_inference_acc_test.yaml │ ├── mask_rcnn_R_50_FPN_inference_acc_test.yaml │ ├── mask_rcnn_R_50_FPN_instant_test.yaml │ ├── mask_rcnn_R_50_FPN_training_acc_test.yaml │ ├── panoptic_fpn_R_50_inference_acc_test.yaml │ ├── panoptic_fpn_R_50_instant_test.yaml │ ├── panoptic_fpn_R_50_training_acc_test.yaml │ ├── retinanet_R_50_FPN_inference_acc_test.yaml │ ├── retinanet_R_50_FPN_instant_test.yaml │ ├── rpn_R_50_FPN_inference_acc_test.yaml │ ├── rpn_R_50_FPN_instant_test.yaml │ ├── semantic_R_50_FPN_inference_acc_test.yaml │ ├── semantic_R_50_FPN_instant_test.yaml │ └── semantic_R_50_FPN_training_acc_test.yaml ├── img ├── d435_error_graph.png ├── d435_error_table.png ├── d435_rms_error.png ├── demo.gif ├── 
depth_vs_range.png ├── detectron2_model_zoo.png └── github_title.png ├── main_detectron2_FULL_mac.py ├── main_detectron2_simple_mac.py ├── main_detectron2_simple_win10.py └── sort.py /README.md: -------------------------------------------------------------------------------- 1 | ![title](img/github_title.png) 2 | 3 | 4 | This is a work based on [ErikGDev-InstanceSeg-Mac/Linux](https://github.com/ErikGDev/instance-segmentation), which is a fork of [Facebook AI Research's](https://github.com/facebookresearch) implementation of [Mask R-CNN](https://arxiv.org/abs/1703.06870), Detectron2. Detectron2 is a complete rewrite of its previous version 5 | [Detectron](https://github.com/facebookresearch/Detectron/), which originates from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/). This Mask R-CNN implementation is powered by [PyTorch](https://pytorch.org) and is based on a **Feature Pyramid Network (FPN) with a ResNet-101** backbone. (PS: You may also want to check out MMSegmentation from [OpenMMLab](https://github.com/open-mmlab).) 6 | 7 | In this project, like its [reference](https://github.com/ErikGDev/instance-segmentation), real-time RGB video and depth maps (i.e. RGB-D) from an [Intel® RealSense™ D435 camera](https://www.intelrealsense.com/depth-camera-d435/) are fed into Detectron2's Mask R-CNN model. The output is "almost" real-time video (2-4 fps with CUDA enabled, in my case) with instance segmentation masks and labels superimposed. The distinguishing feature of this project is that the median depth value of each object is also output. It runs on Windows 10. 8 | 9 | ![gif](img/demo.gif) 10 | 11 | ## 1. Usage 12 | 13 | **Requirements/Dependencies** 14 | 15 | - Windows 10 (you can find the solution for Linux or macOS [here](https://github.com/ErikGDev/instance-segmentation)) 16 | - Python ≥ 3.6 17 | - PyTorch ≥ 1.3 18 | - [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 19 | You can install them together at [pytorch.org](https://pytorch.org) to make sure of this. Please ensure that your version of CUDA is also compatible when installing. You can run this code without CUDA, but it will be much slower (roughly 10x slower, though this was not tested rigorously). 20 | - OpenCV: `pip3 install opencv-python` 21 | - Intel RealSense SDK 2.0 installation: [here](https://www.intelrealsense.com/sdk-2/) 22 | - PyRealSense: `pip3 install pyrealsense2` 23 | - Build Detectron2: 24 | + **Build Detectron2 from Source** 25 | 26 | [Windows] Install the Visual C++ Build Tools from [this link](https://answers.microsoft.com/en-us/windows/forum/windows_10-windows_install/microsoft-visual-c-140-is-required-in-windows-10/f0445e6b-d461-4e40-b44f-962622628de7). Then restart your PC and upgrade the Python setuptools by running: `pip3 install --upgrade setuptools`. 27 | 28 | Then you can install Detectron2 from source by running: 29 | ```bash 30 | # Note: this should be the easiest way to build Detectron2 on Windows 10! 31 | pip install git+https://github.com/facebookresearch/detectron2.git 32 | # (add --user if you don't have permission) 33 | 34 | # Or, to install it from a local clone: 35 | git clone https://github.com/facebookresearch/detectron2.git 36 | cd detectron2 && pip3 install -e . 37 | 38 | # Or if you are on macOS 39 | # CC=clang CXX=clang++ pip install -e . 40 | ``` 41 | 42 | If the installation did not complete properly, you may see the error "cannot import name '_C'" (#157) when running `main_xxx.py`.
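A quick way to verify the build before attaching the camera is to import the package and instantiate a default config (a minimal sketch, not part of this repository; it only assumes the `torch` and `detectron2` packages installed above):

```python
# Sanity check: run after installation. An incomplete build of the compiled
# extensions typically surfaces as an ImportError such as "cannot import name '_C'".
import torch
import detectron2
from detectron2.config import get_cfg

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("detectron2:", detectron2.__version__)

cfg = get_cfg()  # builds the default config; succeeds only if the package is intact
print("default score threshold:", cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST)
```

If this prints the versions and the default score threshold (`0.05`) without raising, the installation should be usable.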
43 | 44 | For more details on the installation of Detectron2 and its dependencies, please refer to the [official Detectron2 GitHub](https://github.com/facebookresearch/detectron2). 45 | 46 | **After Installation** 47 | 48 | 1. Clone or download this repository. 49 | 2. To perform instance segmentation straight from a D435 camera attached to a USB port: 50 | * Run one of the two Python files, i.e. `main_xxx_win10.py` 51 | * If using .bag files: 52 | * Run `python3 main_xxx_win10.py --file={filename}`, where `{filename}` is the name of the input .bag file. To create .bag files, use `d435_to_file.py` in [this repository](https://github.com/ErikGDev/instance-segmentation/tree/master/tools). 53 | 54 | --- 55 | 56 | _(For convenient reference to the background, most of the content below is copied from [this awesome repo](https://github.com/ErikGDev/instance-segmentation).)_ 57 | ## 2. Accuracy and Specifications of Model 58 | 59 | ### 2.1 Instance Segmentation Validation Results 60 | 61 | | | Backbone | AP | AP50 | AP75 | APS | APM | APL | 62 | | :--- | :--- | :---: | :---: | :---: | :---: | :---: | :---: | 63 | | Original Mask R-CNN | ResNet-101-FPN | 35.7 | 58.0 | 37.8 | 15.5 | 38.1 | 52.4 | 64 | | Matterport Mask R-CNN | ResNet-101-FPN | 38.0 | 55.8 | 41.3 | 17.9 | 45.5 | 55.9 | 65 | | Detectron2 Mask R-CNN | ResNet-101-FPN | 38.6 | 60.4 | 41.3 | 19.5 | 41.3 | 55.3 | 66 | 67 | Validation tests were performed on the segmentation masks created on the **2017 COCO** validation dataset. The standard COCO validation metrics include AP averaged over IoU thresholds, AP50, AP75, and APS, APM and APL (AP at different scales). These results were then compared to COCO validation results from the [original paper](https://arxiv.org/abs/1703.06870) and a popular [Mask R-CNN implementation by Matterport](https://github.com/matterport/Mask_RCNN). Clearly, Detectron2's Mask R-CNN outperforms the original Mask R-CNN and Matterport's Mask R-CNN with respect to average precision. It also outperformed the SOTA COCO segmentation competition winners from the [2015 and 2016 challenges](http://cocodataset.org/#detection-leaderboard). 68 | 69 | ### 2.2 Why this model? 70 | 71 | Detectron2's Mask R-CNN with a **ResNet-101-FPN** backbone was determined to be the optimal model. Comparing Detectron2 to [MMDetection's models](https://github.com/open-mmlab/mmdetection/blob/master/docs/MODEL_ZOO.md), which won first place in the [2018 COCO segmentation challenge](http://cocodataset.org/#detection-leaderboard), shows that this choice of model is appropriate for high-speed, real-time video. 72 | 73 | When comparing [Detectron2's Mask R-CNN](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md#coco-instance-segmentation-baselines-with-mask-r-cnn) to [MMDetection's Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/docs/MODEL_ZOO.md#mask-r-cnn), Detectron2 outperforms in both mask AP (38.6 vs 35.9) and inference time (0.070 s/im vs 0.105 s/im). MMDetection does have models that are slightly more accurate than Detectron2's Mask R-CNN implementation, such as the [Hybrid Task Cascade (HTC) model](https://github.com/open-mmlab/mmdetection/blob/master/docs/MODEL_ZOO.md#hybrid-task-cascade-htc); however, these often output masks at less than 4 fps. Adding the time needed to output the superimposed images, this would be insufficient for real-time use. 74 | 75 | Detectron2's Model Zoo displays the inference time and mask AP for each model provided.
For the Mask R-CNN models, the FPN model with a ResNet-101 backbone has the best mask AP for the short time it takes for inference. 76 | 77 | 78 | 79 | 80 | ### 2.3 Config Settings 81 | 82 | The following config settings can be altered: 83 | 84 | + The `SCORE_THRESHOLD` or `cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST` line specifies the confidence threshold above which an instance segmentation mask is shown to the user. For example, set `SCORE_THRESHOLD=0.65` or `cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.65`: if Detectron2 is at least 65% confident that the detected object belongs to a class, the mask is superimposed onto the image. 85 | 86 | + The `cfg.MODEL.WEIGHTS` line specifies the pretrained weights used to perform instance segmentation. This program uses the `ResNet101 FPN` weights. 87 | 88 | + The `cfg.INPUT.MIN_SIZE_TEST` line specifies the size of the smallest side of the image during testing/inference. If this is set to `0`, resizing is disabled. 89 | 90 | + The `RESOLUTION_X` (e.g. `640`, `1280`) and `RESOLUTION_Y` (e.g. `360`, `480`, `720`) constants specify the resolution of the camera streams from the D435. (Note: `360` did not work on this PC; the minimum there is `480`.) 91 | 92 | 93 | ## 3. Intel® RealSense™ D435 RGB-D Camera 94 | 95 | According to Intel's paper, [Best-Known-Methods for Tuning Intel® RealSense™ D400 Depth Cameras for Best Performance](https://www.intelrealsense.com/wp-content/uploads/2019/11/BKMs_Tuning_RealSense_D4xx_Cam.pdf), the depth RMS (root mean square) error increases rapidly as objects are placed further away, especially when the distance is greater than 3m. The orange line on the graph below represents the depth RMS error of a D435 with HFOV=90deg, Xres=848, baseline=50mm and subpixel=0.08. 96 | 97 | 98 | 99 | ### 3.1 Depth Error Testing 100 | 101 | Testing was performed with this program by comparing the real distances of objects from the D435 to the distances measured by its stereo sensors. The true distance was found by measuring the distance between a box (with a flat front) and the parallel plane of the imagers. 102 | 103 | 104 | 105 | from [Intel D400 Datasheet](https://www.mouser.ca/pdfdocs/Intel_D400_Series_Datasheet.pdf) 106 | 107 | The D435 recordings were measured with the [realsense-viewer](https://github.com/IntelRealSense/librealsense/tree/master/tools/realsense-viewer) program. The stereo resolution was set to 1280 x 720. Rather than the depth RMS error, the absolute depth error was compared to the real distance of the object from the D435. 108 | ![d435_error_table](img/d435_error_table.png) 109 | ![d435_error_graph](img/d435_error_graph.png) 110 | 111 | This graph shows that the absolute error appears to increase exponentially with distance. This means the depth recordings will be most accurate when the object is closer to the camera. 112 | 113 | ### 3.2 When does the Object Record 0m? 114 | 115 | When the object is too close to the camera, the depth values will return 0m. This threshold is known as `MinZ`. The formula for calculating `MinZ` is 116 | 117 | > MinZ (mm) = focal length (pixels) × baseline (mm) / 126 118 | 119 | Therefore, with a depth resolution of 848x480, the MinZ is ~16.8cm. If the object is within this distance, no value is returned. 120 | 121 | Similar to MinZ, a MaxZ threshold exists too. For the D435, MaxZ is [approximately 10m](https://ark.intel.com/content/www/us/en/ark/products/128255/intel-realsense-depth-camera-d435.html). Any object outside this range will also be recorded as 0m.
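The MinZ formula can be checked directly (a minimal sketch; the focal length in pixels is derived from the 848-pixel width and the ~90° HFOV quoted above, so the numbers are approximate):

```python
import math

# D435 parameters quoted in the text above
BASELINE_MM = 50.0   # stereo baseline
HFOV_DEG = 90.0      # horizontal field of view
X_RES = 848          # horizontal depth resolution

# Focal length in pixels from the pinhole model: f = (w / 2) / tan(HFOV / 2)
focal_px = (X_RES / 2) / math.tan(math.radians(HFOV_DEG / 2))

# MinZ (mm) = focal length (pixels) x baseline (mm) / 126
min_z_mm = focal_px * BASELINE_MM / 126

print(f"focal length ~ {focal_px:.0f} px, MinZ ~ {min_z_mm / 10:.1f} cm")
# focal length ~ 424 px, MinZ ~ 16.8 cm
```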
122 | 123 | Sometimes objects can be recorded as 0m even though they are inside the MinZ and MaxZ thresholds. This usually occurs when there is too much noise in the depth image, which can happen when the target is not well textured. For more information on how to configure the D435 for specific environments and objects, refer to [this paper](https://www.intelrealsense.com/wp-content/uploads/2019/11/BKMs_Tuning_RealSense_D4xx_Cam.pdf). 124 | 125 | ### 3.3 How is each Depth Value Calculated? 126 | 127 | To find the distance of each object, the median depth pixel is used. All pixels associated with the object are collected into a histogram with a maximum distance of 10m (the max range of the D435) and 500 bins. The bins are looped through until the bin that contains the median is found. This means the depth values change in intervals of 0.02m. 128 | 129 | For smaller intervals of 0.01m, change the `NUM_BINS` constant to 1000, and change 130 | 131 | `centre_depth = "{:.2f}m".format(x / 50)` 132 | to 133 | `centre_depth = "{:.2f}m".format(x / 100)` 134 | 135 | 136 | ## 4. Object Tracking (note: exclude this part for a speed-up if it is not needed) 137 | 138 | The purpose of this project is to propose where objects exist in the environment around a robot. In addition to this, it would be ideal to understand the movement of each object. 139 | 140 | ### 4.1 Simple Online and Real-time Tracking (SORT) 141 | 142 | The velocity, linear speed (between camera and object), and time to impact were all calculated using an altered version of [Chris Fotache's PyTorch implementation](https://github.com/cfotache/pytorch_objectdetecttrack) of SORT, an algorithm created by [Alex Bewley](https://github.com/abewley/), Zongyuan Ge, Lionel Ott, Fabio Ramos and Ben Upcroft. 143 | 144 | Simple Online and Real-time Tracking (SORT) paper: https://arxiv.org/abs/1602.00763
145 | Original Python implementation of SORT by Alex Bewley: https://github.com/abewley/sort 146 | 147 | SORT proposes using a Kalman filter to predict the trajectories of previously identified objects, and then matches them with newly identified objects. In this program, when an object is matched with a detection, the real-world position and distance from the camera are added as attributes to the KalmanBoxTracker object. When the same object is tracked to the next frame, linear speed, velocity, real-world distance, and time until impact are all added under the same object. Each KalmanBoxTracker is added to the appropriate DetectedObject as the attribute DetectedObject.track. This means all the data can be passed to an API using a single DetectedObject. 148 | 149 | ### 4.2 Velocity Vector Arrows 150 | 151 | Optionally, vector arrows can be superimposed on the image. These vector arrows show the direction the object is moving in 3D space. Each arrow is represented by the Arrow3D class, which is essentially the same as matplotlib's FancyArrowPatch class, with additional 3D support. 152 | -------------------------------------------------------------------------------- /configs/Base-RCNN-C4.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | ROI_HEADS: 7 | NAME: "Res5ROIHeads" 8 | DATASETS: 9 | TRAIN: ("coco_2017_train",) 10 | TEST: ("coco_2017_val",) 11 | SOLVER: 12 | IMS_PER_BATCH: 16 13 | BASE_LR: 0.02 14 | STEPS: (60000, 80000) 15 | MAX_ITER: 90000 16 | INPUT: 17 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 18 | 19 | -------------------------------------------------------------------------------- /configs/Base-RCNN-DilatedC5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RESNETS: 4 | OUT_FEATURES: ["res5"] 5 | RES5_DILATION: 2 6 | RPN: 7 | IN_FEATURES: ["res5"] 8 | PRE_NMS_TOPK_TEST: 6000 9 | POST_NMS_TOPK_TEST: 1000 10 | ROI_HEADS: 11 | NAME: "StandardROIHeads" 12 | IN_FEATURES: ["res5"] 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_FC: 2 16 | POOLER_RESOLUTION: 7 17 | ROI_MASK_HEAD: 18 | NAME: "MaskRCNNConvUpsampleHead" 19 | NUM_CONV: 4 20 | POOLER_RESOLUTION: 14 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (60000, 80000) 28 | MAX_ITER: 90000 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 31 | -------------------------------------------------------------------------------- /configs/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000
proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | -------------------------------------------------------------------------------- /configs/Base-RetinaNet.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "RetinaNet" 3 | BACKBONE: 4 | NAME: "build_retinanet_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res3", "res4", "res5"] 7 | ANCHOR_GENERATOR: 8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 9 | FPN: 10 | IN_FEATURES: ["res3", "res4", "res5"] 11 | RETINANET: 12 | IOU_THRESHOLDS: [0.4, 0.5] 13 | IOU_LABELS: [0, -1, 1] 14 | DATASETS: 15 | TRAIN: ("coco_2017_train",) 16 | TEST: ("coco_2017_val",) 17 | SOLVER: 18 | IMS_PER_BATCH: 16 19 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 20 | STEPS: (60000, 80000) 21 | MAX_ITER: 90000 22 | INPUT: 23 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 24 | -------------------------------------------------------------------------------- /configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | LOAD_PROPOSALS: True 6 | RESNETS: 7 | DEPTH: 50 8 | PROPOSAL_GENERATOR: 9 | NAME: "PrecomputedProposals" 10 | DATASETS: 11 | TRAIN: ("coco_2017_train",) 12 | PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", ) 13 | TEST: ("coco_2017_val",) 14 | PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) 15 | DATALOADER: 16 | # proposals are part of the dataset_dicts, and take a lot of RAM 17 | NUM_WORKERS: 2 18 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | MAX_ITER: 270000 14 | -------------------------------------------------------------------------------- /configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
"../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/COCO-Detection/rpn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | PRE_NMS_TOPK_TEST: 12000 10 | POST_NMS_TOPK_TEST: 2000 11 | -------------------------------------------------------------------------------- /configs/COCO-Detection/rpn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | POST_NMS_TOPK_TEST: 2000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: 
"detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | MAX_ITER: 270000 14 | -------------------------------------------------------------------------------- /configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | KEYPOINT_ON: True 4 | ROI_HEADS: 5 | NUM_CLASSES: 1 6 | ROI_BOX_HEAD: 7 | SMOOTH_L1_BETA: 0.5 # Keypoint AP degrades (though box AP improves) when using plain L1 loss 8 | RPN: 9 | # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2. 10 | # 1000 proposals per-image is found to hurt box AP. 11 | # Therefore we increase it to 1500 per-image. 
12 | POST_NMS_TOPK_TRAIN: 1500 13 | DATASETS: 14 | TRAIN: ("keypoints_coco_2017_train",) 15 | TEST: ("keypoints_coco_2017_val",) 16 | -------------------------------------------------------------------------------- /configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Keypoint-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Keypoint-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Keypoint-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Keypoint-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | RESNETS: 6 | STRIDE_IN_1X1: False # this is a C2 model 7 | NUM_GROUPS: 32 8 | WIDTH_PER_GROUP: 8 9 | DEPTH: 101 10 | SOLVER: 11 | STEPS: (210000, 250000) 12 | MAX_ITER: 270000 13 | -------------------------------------------------------------------------------- /configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PanopticFPN" 4 | MASK_ON: True 5 | SEM_SEG_HEAD: 6 | LOSS_WEIGHT: 0.5 7 | DATASETS: 8 | TRAIN: ("coco_2017_train_panoptic_separated",) 9 | TEST: ("coco_2017_val_panoptic_separated",) 10 | -------------------------------------------------------------------------------- /configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Panoptic-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Panoptic-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Panoptic-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | 
SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /configs/Cityscapes/mask_rcnn_R_50_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | # WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | # For better, more stable performance initialize from COCO 5 | WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl" 6 | MASK_ON: True 7 | ROI_HEADS: 8 | NUM_CLASSES: 8 9 | # This is the setting used in Mask R-CNN paper, Appendix A 10 | INPUT: 11 | MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024) 12 | MIN_SIZE_TRAIN_SAMPLING: "choice" 13 | MIN_SIZE_TEST: 1024 14 | MAX_SIZE_TRAIN: 2048 15 | MAX_SIZE_TEST: 2048 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | BASE_LR: 0.01 21 | STEPS: (18000,) 22 | MAX_ITER: 24000 23 | IMS_PER_BATCH: 8 24 | TEST: 25 | EVAL_PERIOD: 8000 26 | -------------------------------------------------------------------------------- /configs/Detectron1-Comparisons/README.md: -------------------------------------------------------------------------------- 1 | 2 | Detectron2's default settings and a few implementation details are different from Detectron. 3 | 4 | The differences in implementation details are shared in 5 | [Compatibility with Other Libraries](../../docs/notes/compatibility.md). 6 | 7 | The differences in default config include: 8 | * Use scale augmentation during training. 9 | * Use L1 loss instead of smooth L1 loss. 10 | * Use `POOLER_SAMPLING_RATIO=0` instead of 2. 11 | * Use `ROIAlignV2`. 12 | 13 | In this directory, we provide a few configs that mimic Detectron's behavior as closely as possible. 14 | This provides a fair comparison of accuracy and speed against Detectron. 15 | 16 |
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | kp. AP | model id | download |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Faster R-CNN | 1x | 0.219 | 0.048 | 3.1 | 36.9 | | | 137781054 | model \| metrics |
| Keypoint R-CNN | 1x | 0.313 | 0.082 | 5.0 | 53.1 | | 64.2 | 137781195 | model \| metrics |
| Mask R-CNN | 1x | 0.273 | 0.052 | 3.4 | 37.8 | 34.9 | | 137781281 | model \| metrics |
72 | 73 | ## Comparisons: 74 | 75 | * Faster R-CNN: Detectron's AP is 36.7, similar to ours. 76 | * Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a Detectron 77 | [bug](https://github.com/facebookresearch/Detectron/issues/459) led to a drop in box AP, which can be 78 | compensated for by some parameter tuning. 79 | * Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to a more correct implementation. 80 | 81 | For a speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html). 82 | -------------------------------------------------------------------------------- /configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | # Detectron1 uses smooth L1 loss with some magic beta values. 8 | # The defaults are changed to L1 loss in Detectron2. 9 | RPN: 10 | SMOOTH_L1_BETA: 0.1111 11 | ROI_BOX_HEAD: 12 | SMOOTH_L1_BETA: 1.0 13 | POOLER_SAMPLING_RATIO: 2 14 | POOLER_TYPE: "ROIAlign" 15 | INPUT: 16 | # no scale augmentation 17 | MIN_SIZE_TRAIN: (800, ) 18 | -------------------------------------------------------------------------------- /configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | KEYPOINT_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1 9 | ROI_KEYPOINT_HEAD: 10 | POOLER_RESOLUTION: 14 11 | POOLER_SAMPLING_RATIO: 2 12 | POOLER_TYPE: "ROIAlign" 13 | # Detectron1 uses smooth L1 loss with some magic beta values. 14 | # The defaults are changed to L1 loss in Detectron2. 15 | ROI_BOX_HEAD: 16 | SMOOTH_L1_BETA: 1.0 17 | POOLER_SAMPLING_RATIO: 2 18 | POOLER_TYPE: "ROIAlign" 19 | RPN: 20 | SMOOTH_L1_BETA: 0.1111 21 | # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2 22 | # 1000 proposals per-image is found to hurt box AP. 23 | # Therefore we increase it to 1500 per-image. 24 | POST_NMS_TOPK_TRAIN: 1500 25 | DATASETS: 26 | TRAIN: ("keypoints_coco_2017_train",) 27 | TEST: ("keypoints_coco_2017_val",) 28 | -------------------------------------------------------------------------------- /configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | # Detectron1 uses smooth L1 loss with some magic beta values. 8 | # The defaults are changed to L1 loss in Detectron2.
9 | RPN: 10 | SMOOTH_L1_BETA: 0.1111 11 | ROI_BOX_HEAD: 12 | SMOOTH_L1_BETA: 1.0 13 | POOLER_SAMPLING_RATIO: 2 14 | POOLER_TYPE: "ROIAlign" 15 | ROI_MASK_HEAD: 16 | POOLER_SAMPLING_RATIO: 2 17 | POOLER_TYPE: "ROIAlign" 18 | INPUT: 19 | # no scale augmentation 20 | MIN_SIZE_TRAIN: (800, ) 21 | -------------------------------------------------------------------------------- /configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | ROI_HEADS: 12 | NUM_CLASSES: 1230 13 | SCORE_THRESH_TEST: 0.0001 14 | INPUT: 15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 16 | DATASETS: 17 | TRAIN: ("lvis_v0.5_train",) 18 | TEST: ("lvis_v0.5_val",) 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | -------------------------------------------------------------------------------- /configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | 
RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | SOLVER: 14 | STEPS: (210000, 250000) 15 | MAX_ITER: 270000 16 | -------------------------------------------------------------------------------- /configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k" 5 | RESNETS: 6 | STRIDE_IN_1X1: False # this is a C2 model 7 | NUM_GROUPS: 32 8 | WIDTH_PER_GROUP: 8 9 | DEPTH: 152 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | ROI_HEADS: 12 | NAME: "CascadeROIHeads" 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_CONV: 4 16 | NUM_FC: 1 17 | NORM: "GN" 18 | CLS_AGNOSTIC_BBOX_REG: True 19 | ROI_MASK_HEAD: 20 | NUM_CONV: 8 21 | NORM: "GN" 22 | RPN: 23 | POST_NMS_TOPK_TRAIN: 2000 24 | SOLVER: 25 | IMS_PER_BATCH: 128 26 | STEPS: (35000, 45000) 27 | MAX_ITER: 50000 28 | BASE_LR: 0.16 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 864) 31 | MIN_SIZE_TRAIN_SAMPLING: "range" 32 | MAX_SIZE_TRAIN: 1440 33 | CROP: 34 | ENABLED: True 35 | TEST: 36 | EVAL_PERIOD: 2500 37 | -------------------------------------------------------------------------------- /configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_BOX_HEAD: 8 | CLS_AGNOSTIC_BBOX_REG: True 9 | ROI_MASK_HEAD: 10 | CLS_AGNOSTIC_MASK: True 11 | -------------------------------------------------------------------------------- /configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | -------------------------------------------------------------------------------- /configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | SOLVER: 10 | STEPS: (210000, 250000) 11 | MAX_ITER: 270000 12 | -------------------------------------------------------------------------------- /configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "GN" 8 | STRIDE_IN_1X1: False 9 | FPN: 10 | NORM: "GN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "GN" 16 | ROI_MASK_HEAD: 17 | NORM: "GN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | -------------------------------------------------------------------------------- 
/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "SyncBN" 8 | STRIDE_IN_1X1: False 9 | FPN: 10 | NORM: "SyncBN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "SyncBN" 16 | ROI_MASK_HEAD: 17 | NORM: "SyncBN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | TEST: 23 | PRECISE_BN: 24 | ENABLED: True 25 | -------------------------------------------------------------------------------- /configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml: -------------------------------------------------------------------------------- 1 | # A large PanopticFPN for demo purposes. 2 | # Use GN on backbone to support semantic seg. 3 | # Use Cascade + Deform Conv to improve localization. 4 | _BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml" 5 | MODEL: 6 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN" 7 | RESNETS: 8 | DEPTH: 101 9 | NORM: "GN" 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | STRIDE_IN_1X1: False 12 | FPN: 13 | NORM: "GN" 14 | ROI_HEADS: 15 | NAME: CascadeROIHeads 16 | ROI_BOX_HEAD: 17 | CLS_AGNOSTIC_BBOX_REG: True 18 | ROI_MASK_HEAD: 19 | NORM: "GN" 20 | RPN: 21 | POST_NMS_TOPK_TRAIN: 2000 22 | SOLVER: 23 | STEPS: (105000, 125000) 24 | MAX_ITER: 135000 25 | IMS_PER_BATCH: 32 26 | BASE_LR: 0.04 27 | -------------------------------------------------------------------------------- /configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" 2 | # INPUT: 3 | # It makes sense to divide by STD when training from scratch 4 | # But it seems to make no difference on the results and C2's models didn't do this. 5 | # So we keep things consistent with C2. 
6 | # PIXEL_STD: [57.375, 57.12, 58.395] 7 | MODEL: 8 | WEIGHTS: "" 9 | MASK_ON: True 10 | BACKBONE: 11 | FREEZE_AT: 0 12 | -------------------------------------------------------------------------------- /configs/Misc/semantic_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TRAIN: ("coco_2017_train_panoptic_stuffonly",) 9 | TEST: ("coco_2017_val_panoptic_stuffonly",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | -------------------------------------------------------------------------------- /configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 20 9 | INPUT: 10 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 11 | MIN_SIZE_TEST: 800 12 | DATASETS: 13 | TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') 14 | TEST: ('voc_2007_test',) 15 | SOLVER: 16 | STEPS: (12000, 16000) 17 | MAX_ITER: 18000 # 17.4 epochs 18 | WARMUP_ITERS: 100 19 | -------------------------------------------------------------------------------- /configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 20 9 | INPUT: 10 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 11 | MIN_SIZE_TEST: 800 12 | DATASETS: 13 | TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') 14 | TEST: ('voc_2007_test',) 15 | SOLVER: 16 | STEPS: (12000, 16000) 17 | MAX_ITER: 18000 # 17.4 epochs 18 | WARMUP_ITERS: 100 19 | -------------------------------------------------------------------------------- /configs/quick_schedules/README.md: -------------------------------------------------------------------------------- 1 | These are quick configs for performance or accuracy regression tracking purposes. 
2 | -------------------------------------------------------------------------------- /configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 45.70, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | DATASETS: 5 | TRAIN: ("coco_2017_val_100",) 6 | PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) 7 | TEST: ("coco_2017_val_100",) 8 | PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) 9 | SOLVER: 10 | BASE_LR: 0.005 11 | STEPS: (30,) 12 | MAX_ITER: 40 13 | IMS_PER_BATCH: 4 14 | DATALOADER: 15 | NUM_WORKERS: 2 16 | -------------------------------------------------------------------------------- /configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl" 4 | DATASETS: 5 | TEST: ("keypoints_coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 52.47, 0.02], ["keypoints", "AP", 67.36, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | KEYPOINT_ON: True 5 | DATASETS: 6 | TRAIN: ("keypoints_coco_2017_val_100",) 7 | TEST: ("keypoints_coco_2017_val_100",) 8 | SOLVER: 9 | BASE_LR: 0.005 10 | STEPS: (30,) 11 | MAX_ITER: 40 12 | IMS_PER_BATCH: 4 13 | DATALOADER: 14 | NUM_WORKERS: 2 15 | -------------------------------------------------------------------------------- /configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | KEYPOINT_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | BATCH_SIZE_PER_IMAGE: 256 9 | NUM_CLASSES: 1 10 | ROI_KEYPOINT_HEAD: 11 | POOLER_RESOLUTION: 14 12 | POOLER_SAMPLING_RATIO: 2 13 | NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: False 14 | LOSS_WEIGHT: 4.0 15 | ROI_BOX_HEAD: 16 | SMOOTH_L1_BETA: 1.0 # Keypoint AP degrades when using plain L1 loss 17 | RPN: 18 | SMOOTH_L1_BETA: 0.2 # Keypoint AP degrades when using plain L1 loss 19 | DATASETS: 20 | TRAIN: ("keypoints_coco_2017_val",) 21 | TEST: ("keypoints_coco_2017_val",) 22 | INPUT: 23 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 24 | SOLVER: 25 | WARMUP_FACTOR: 0.33333333 26 | WARMUP_ITERS: 100 27 | STEPS: (5500, 5800) 28 | MAX_ITER: 
6000 29 | TEST: 30 | EXPECTED_RESULTS: [["bbox", "AP", 55.35, 1.0], ["keypoints", "AP", 76.91, 1.0]] 31 | -------------------------------------------------------------------------------- /configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | KEYPOINT_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | BATCH_SIZE_PER_IMAGE: 256 9 | NUM_CLASSES: 1 10 | ROI_KEYPOINT_HEAD: 11 | POOLER_RESOLUTION: 14 12 | POOLER_SAMPLING_RATIO: 2 13 | ROI_BOX_HEAD: 14 | SMOOTH_L1_BETA: 1.0 # Keypoint AP degrades when using plain L1 loss 15 | RPN: 16 | SMOOTH_L1_BETA: 0.2 # Keypoint AP degrades when using plain L1 loss 17 | DATASETS: 18 | TRAIN: ("keypoints_coco_2017_val",) 19 | TEST: ("keypoints_coco_2017_val",) 20 | INPUT: 21 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 22 | SOLVER: 23 | WARMUP_FACTOR: 0.33333333 24 | WARMUP_ITERS: 100 25 | STEPS: (5500, 5800) 26 | MAX_ITER: 6000 27 | TEST: 28 | EXPECTED_RESULTS: [["bbox", "AP", 53.5, 1.0], ["keypoints", "AP", 72.4, 1.0]] 29 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 47.37, 0.02], ["segm", "AP", 40.99, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | DATASETS: 6 | TRAIN: ("coco_2017_val_100",) 7 | TEST: ("coco_2017_val_100",) 8 | SOLVER: 9 | BASE_LR: 0.001 10 | STEPS: (30,) 11 | MAX_ITER: 40 12 | IMS_PER_BATCH: 4 13 | DATALOADER: 14 | NUM_WORKERS: 2 15 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | ROI_HEADS: 5 | BATCH_SIZE_PER_IMAGE: 256 6 | MASK_ON: True 7 | DATASETS: 8 | TRAIN: ("coco_2017_val",) 9 | TEST: ("coco_2017_val",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (600,) 12 | MAX_SIZE_TRAIN: 1000 13 | MIN_SIZE_TEST: 800 14 | MAX_SIZE_TEST: 1000 15 | SOLVER: 16 | IMS_PER_BATCH: 8 # base uses 16 17 | WARMUP_FACTOR: 0.33333 18 | WARMUP_ITERS: 100 19 | STEPS: (11000, 11600) 20 | MAX_ITER: 12000 21 | TEST: 22 | EXPECTED_RESULTS: [["bbox", "AP", 41.88, 0.7], ["segm", "AP", 33.79, 0.5]] 23 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml" 2 | MODEL: 3 | WEIGHTS: 
"detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 47.44, 0.02], ["segm", "AP", 42.94, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 47.34, 0.02], ["segm", "AP", 42.67, 0.02]] 8 | # expected results do not use test-time augmentation. TTA results are not verified. 9 | AUG: 10 | ENABLED: True 11 | MIN_SIZES: (400, 500) # to save some time 12 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | DATASETS: 6 | TRAIN: ("coco_2017_val_100",) 7 | TEST: ("coco_2017_val_100",) 8 | SOLVER: 9 | BASE_LR: 0.005 10 | STEPS: (30,) 11 | MAX_ITER: 40 12 | IMS_PER_BATCH: 4 13 | DATALOADER: 14 | NUM_WORKERS: 2 15 | -------------------------------------------------------------------------------- /configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | ROI_HEADS: 5 | BATCH_SIZE_PER_IMAGE: 256 6 | MASK_ON: True 7 | DATASETS: 8 | TRAIN: ("coco_2017_val",) 9 | TEST: ("coco_2017_val",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (600,) 12 | MAX_SIZE_TRAIN: 1000 13 | MIN_SIZE_TEST: 800 14 | MAX_SIZE_TEST: 1000 15 | SOLVER: 16 | WARMUP_FACTOR: 0.3333333 17 | WARMUP_ITERS: 100 18 | STEPS: (5500, 5800) 19 | MAX_ITER: 6000 20 | TEST: 21 | EXPECTED_RESULTS: [["bbox", "AP", 42.8, 0.8], ["segm", "AP", 35.7, 0.8]] 22 | -------------------------------------------------------------------------------- /configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100_panoptic_separated",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 46.47, 0.02], ["segm", "AP", 43.39, 0.02], ["sem_seg", "mIoU", 42.55, 0.02], ["panoptic_seg", "PQ", 38.99, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PanopticFPN" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: True 6 | RESNETS: 7 | DEPTH: 50 8 | SEM_SEG_HEAD: 9 | LOSS_WEIGHT: 0.5 10 | DATASETS: 11 | TRAIN: ("coco_2017_val_100_panoptic_separated",) 12 | TEST: ("coco_2017_val_100_panoptic_separated",) 13 | 
SOLVER: 14 | BASE_LR: 0.005 15 | STEPS: (30,) 16 | MAX_ITER: 40 17 | IMS_PER_BATCH: 4 18 | DATALOADER: 19 | NUM_WORKERS: 2 20 | -------------------------------------------------------------------------------- /configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PanopticFPN" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: True 6 | RESNETS: 7 | DEPTH: 50 8 | SEM_SEG_HEAD: 9 | LOSS_WEIGHT: 0.5 10 | DATASETS: 11 | TRAIN: ("coco_2017_val_panoptic_separated",) 12 | TEST: ("coco_2017_val_panoptic_separated",) 13 | SOLVER: 14 | BASE_LR: 0.01 15 | WARMUP_FACTOR: 0.001 16 | WARMUP_ITERS: 500 17 | STEPS: (5500,) 18 | MAX_ITER: 7000 19 | TEST: 20 | EXPECTED_RESULTS: [["bbox", "AP", 46.80, 1.1], ["segm", "AP", 38.93, 0.7], ["sem_seg", "mIoU", 63.99, 0.9], ["panoptic_seg", "PQ", 48.23, 0.8]] 21 | -------------------------------------------------------------------------------- /configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/retinanet_R_50_FPN_3x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["bbox", "AP", 44.36, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/retinanet_R_50_FPN_1x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | DATASETS: 5 | TRAIN: ("coco_2017_val_100",) 6 | TEST: ("coco_2017_val_100",) 7 | SOLVER: 8 | BASE_LR: 0.005 9 | STEPS: (30,) 10 | MAX_ITER: 40 11 | IMS_PER_BATCH: 4 12 | DATALOADER: 13 | NUM_WORKERS: 2 14 | -------------------------------------------------------------------------------- /configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl" 4 | DATASETS: 5 | TEST: ("coco_2017_val_100",) 6 | TEST: 7 | EXPECTED_RESULTS: [["box_proposals", "AR@1000", 58.16, 0.02]] 8 | -------------------------------------------------------------------------------- /configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | DATASETS: 5 | TRAIN: ("coco_2017_val_100",) 6 | TEST: ("coco_2017_val_100",) 7 | SOLVER: 8 | STEPS: (30,) 9 | MAX_ITER: 40 10 | BASE_LR: 0.005 11 | IMS_PER_BATCH: 4 12 | DATALOADER: 13 | NUM_WORKERS: 2 14 | -------------------------------------------------------------------------------- /configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: 
"detectron2://semantic_R_50_FPN_1x/111802073/model_final_c18079783c55a94968edc28b7101c5f0.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TEST: ("coco_2017_val_100_panoptic_stuffonly",) 9 | TEST: 10 | EXPECTED_RESULTS: [["sem_seg", "mIoU", 39.53, 0.02], ["sem_seg", "mACC", 51.50, 0.02]] 11 | -------------------------------------------------------------------------------- /configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TRAIN: ("coco_2017_val_100_panoptic_stuffonly",) 9 | TEST: ("coco_2017_val_100_panoptic_stuffonly",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | SOLVER: 13 | BASE_LR: 0.005 14 | STEPS: (30,) 15 | MAX_ITER: 40 16 | IMS_PER_BATCH: 4 17 | DATALOADER: 18 | NUM_WORKERS: 2 19 | -------------------------------------------------------------------------------- /configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TRAIN: ("coco_2017_val_panoptic_stuffonly",) 9 | TEST: ("coco_2017_val_panoptic_stuffonly",) 10 | SOLVER: 11 | BASE_LR: 0.01 12 | WARMUP_FACTOR: 0.001 13 | WARMUP_ITERS: 300 14 | STEPS: (5500,) 15 | MAX_ITER: 7000 16 | TEST: 17 | EXPECTED_RESULTS: [["sem_seg", "mIoU", 76.51, 1.0], ["sem_seg", "mACC", 83.25, 1.0]] 18 | INPUT: 19 | # no scale augmentation 20 | MIN_SIZE_TRAIN: (800, ) 21 | -------------------------------------------------------------------------------- /img/d435_error_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/d435_error_graph.png -------------------------------------------------------------------------------- /img/d435_error_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/d435_error_table.png -------------------------------------------------------------------------------- /img/d435_rms_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/d435_rms_error.png -------------------------------------------------------------------------------- /img/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/demo.gif -------------------------------------------------------------------------------- /img/depth_vs_range.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/depth_vs_range.png -------------------------------------------------------------------------------- 
/img/detectron2_model_zoo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/detectron2_model_zoo.png -------------------------------------------------------------------------------- /img/github_title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowu1004/instance_segmentation_RealSense/a211b6899900dc9a6a2ffafdcedcfe8a4b50a157/img/github_title.png -------------------------------------------------------------------------------- /main_detectron2_FULL_mac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import cv2 4 | import pyrealsense2 as rs 5 | import random 6 | import math 7 | import argparse 8 | 9 | from threading import Thread 10 | from matplotlib import pyplot as plt 11 | from mpl_toolkits.axes_grid1.inset_locator import inset_axes 12 | from mpl_toolkits.mplot3d import proj3d 13 | from mpl_toolkits.mplot3d import Axes3D 14 | from matplotlib.patches import FancyArrowPatch 15 | from sort import * 16 | 17 | from detectron2.engine import DefaultPredictor 18 | from detectron2.config import get_cfg 19 | from detectron2.utils.visualizer import Visualizer 20 | from detectron2.utils.visualizer import GenericMask 21 | from detectron2.utils.visualizer import ColorMode 22 | from detectron2.structures import Boxes, RotatedBoxes 23 | 24 | from detectron2.data import MetadataCatalog 25 | 26 | import torch, torchvision 27 | 28 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 29 | import os 30 | import pkg_resources 31 | 32 | from detectron2.checkpoint import DetectionCheckpointer 33 | from detectron2.modeling import build_model 34 | 35 | # >>---------------------- load predefined model ------------------- 36 | class _ModelZooUrls(object): 37 | """ 38 | Mapping from names to officially released Detectron2 pre-trained models. 
39 | """ 40 | 41 | S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" 42 | 43 | # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl 44 | CONFIG_PATH_TO_URL_SUFFIX = { 45 | # COCO Detection with Faster R-CNN 46 | "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl", 47 | "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl", 48 | "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl", 49 | "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl", 50 | "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl", 51 | "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl", 52 | "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl", 53 | "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl", 54 | "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl", 55 | "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl", 56 | # COCO Detection with RetinaNet 57 | "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "190397773/model_final_bfca0b.pkl", 58 | "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "190397829/model_final_5bd44e.pkl", 59 | "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "190397697/model_final_971ab9.pkl", 60 | # COCO Detection with RPN and Fast R-CNN 61 | "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl", 62 | "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl", 63 | "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl", 64 | # COCO Instance Segmentation Baselines with Mask R-CNN 65 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl", 66 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl", 67 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl", 68 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl", 69 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl", 70 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl", 71 | "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl", 72 | "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl", 73 | "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl", 74 | "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl", # noqa 75 | # COCO Person Keypoint Detection Baselines with Keypoint R-CNN 76 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl", 77 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl", 78 | "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl", 79 | "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl", 80 | # COCO Panoptic Segmentation Baselines with Panoptic FPN 81 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl", 82 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl", 83 | "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl", 84 | # LVIS Instance 
Segmentation Baselines with Mask R-CNN 85 | "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl", 86 | "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl", 87 | "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl", # noqa 88 | # Cityscapes & Pascal VOC Baselines 89 | "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl", 90 | "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl", 91 | # Other Settings 92 | "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl", 93 | "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl", 94 | "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl", 95 | "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl", 96 | "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl", 97 | "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl", 98 | "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl", 99 | "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl", 100 | "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl", # noqa 101 | # D1 Comparisons 102 | "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl", # noqa 103 | "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl", # noqa 104 | "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl", 105 | } 106 | 107 | 108 | def get_checkpoint_url(config_path): 109 | """ 110 | Returns the URL to the model trained using the given config 111 | 112 | Args: 113 | config_path (str): config file name relative to detectron2's "configs/" 114 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 115 | 116 | Returns: 117 | str: a URL to the model 118 | """ 119 | name = config_path.replace(".yaml", "") 120 | if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX: 121 | suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path] 122 | return _ModelZooUrls.S3_PREFIX + name + "/" + suffix 123 | raise RuntimeError("{} not available in Model Zoo!".format(name)) 124 | 125 | 126 | 127 | def get_config_file(config_path): 128 | """ 129 | Returns path to a builtin config file. 130 | 131 | Args: 132 | config_path (str): config file name relative to detectron2's "configs/" 133 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 134 | 135 | Returns: 136 | str: the real path to the config file. 137 | """ 138 | cfg_file = pkg_resources.resource_filename( 139 | "detectron2.model_zoo", os.path.join("configs", config_path) 140 | ) 141 | if not os.path.exists(cfg_file): 142 | raise RuntimeError("{} not available in Model Zoo!".format(config_path)) 143 | return cfg_file 144 | 145 | 146 | 147 | def get(config_path, trained: bool = False): 148 | """ 149 | Get a model specified by relative path under Detectron2's official ``configs/`` directory. 150 | 151 | Args: 152 | config_path (str): config file name relative to detectron2's "configs/" 153 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 154 | trained (bool): If True, will initialize the model with the trained model zoo weights. 
155 | If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used 156 | instead; this will typically (though not always) initialize a subset of weights using 157 | an ImageNet pre-trained model, while randomly initializing the other weights. 158 | 159 | Returns: 160 | nn.Module: a detectron2 model 161 | 162 | Example: 163 | :: 164 | from detectron2 import model_zoo 165 | model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) 166 | """ 167 | cfg_file = get_config_file(config_path) 168 | 169 | cfg = get_cfg() 170 | cfg.merge_from_file(cfg_file) 171 | if trained: 172 | cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) 173 | if not torch.cuda.is_available(): 174 | cfg.MODEL.DEVICE = "cpu" 175 | 176 | model = build_model(cfg) 177 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 178 | return model 179 | # E.g. # model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) 180 | # <<---------------------- load predefined model ------------------- 181 | 182 | 183 | 184 | 185 | # Resolution of camera streams 186 | RESOLUTION_X = 640 #640, 1280 187 | RESOLUTION_Y = 360 #480, 720 188 | 189 | # Configuration for histogram for depth image 190 | NUM_BINS = 500 #500 x depth_scale = e.g. 500x0.001m=50cm 191 | MAX_RANGE = 10000 #10000xdepth_scale = e.g. 10000x0.001m=10m 192 | 193 | AXES_SIZE = 10 194 | 195 | 196 | 197 | class VideoStreamer: 198 | """ 199 | Video streamer that takes advantage of multi-threading and continuously reads frames, 200 | so a frame is ready whenever the program requires one. 201 | """ 202 | def __init__(self, video_file=None): 203 | """ 204 | When initialised, the VideoStreamer configures its streams and is ready to read frames 205 | """ 206 | self.setup_image_config(video_file) 207 | self.configure_streams() 208 | self.stopped = False 209 | 210 | def start(self): 211 | """ 212 | Initialise the thread; the update method will run inside it 213 | """ 214 | Thread(target=self.update, args=()).start() 215 | return self 216 | 217 | def update(self): 218 | """ 219 | Constantly read frames until the stop() method is called 220 | """ 221 | while True: 222 | 223 | if self.stopped: 224 | return 225 | 226 | frames = self.pipeline.wait_for_frames() 227 | frames = self.align.process(frames) 228 | 229 | color_frame = frames.get_color_frame() 230 | depth_frame = frames.get_depth_frame() 231 | 232 | self.depth_intrin = depth_frame.profile.as_video_stream_profile().intrinsics 233 | 234 | # Convert frames to numpy arrays and cache them for read() 235 | self.color_image = np.asanyarray(color_frame.get_data()) 236 | self.depth_image = np.asanyarray(depth_frame.get_data()) 237 | 238 | 239 | def stop(self): 240 | self.pipeline.stop() 241 | self.stopped = True 242 | 243 | def read(self): 244 | return (self.color_image, self.depth_image) 245 | 246 | def setup_image_config(self, video_file=None): 247 | """ 248 | Set up config and video streams. If --file is specified as an argument, set up 249 | the stream from a file. The input of --file is a .bag file in the bag_files folder. 250 | .bag files can be created using d435_to_file in the tools folder. 251 | video_file defaults to None, in which case the stream comes from the 252 | device connected over USB.
253 | """ 254 | config = rs.config() 255 | 256 | if video_file is None: 257 | 258 | config.enable_stream(rs.stream.depth, RESOLUTION_X, RESOLUTION_Y, rs.format.z16, 30) 259 | config.enable_stream(rs.stream.color, RESOLUTION_X, RESOLUTION_Y, rs.format.bgr8, 30) 260 | else: 261 | try: 262 | config.enable_device_from_file("bag_files/{}".format(video_file)) 263 | except: 264 | print("Cannot enable device from: '{}'".format(video_file)) 265 | 266 | self.config = config 267 | 268 | def configure_streams(self): 269 | # Configure video streams 270 | self.pipeline = rs.pipeline() 271 | 272 | # Start streaming 273 | self.profile = self.pipeline.start(self.config) 274 | self.align = rs.align(rs.stream.color) 275 | 276 | def get_depth_scale(self): 277 | return self.profile.get_device().first_depth_sensor().get_depth_scale() 278 | 279 | 280 | 281 | class Predictor(DefaultPredictor): 282 | def __init__(self): 283 | self.config = self.setup_predictor_config() 284 | super().__init__(self.config) 285 | 286 | def create_outputs(self, color_image): 287 | self.outputs = self(color_image) 288 | 289 | def setup_predictor_config(self): 290 | """ 291 | Setup config and return predictor. See config/defaults.py for more options 292 | """ 293 | 294 | config_path = 'COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml' 295 | print( 296 | f'vip-Using {config_path} at /Users/bowu/opt/anaconda3/lib/python3.7/site-packages/detectron2/engine/defaults.py') 297 | # cfg_file = get_config_file(config_path) 298 | 299 | cfg = get_cfg() 300 | cfg.merge_from_file("configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml") 301 | pretrained = True 302 | if pretrained: 303 | cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) 304 | 305 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.4 306 | # Mask R-CNN ResNet101 FPN weights 307 | #cfg.MODEL.WEIGHTS = "model_final_a3ec72.pkl" 308 | # This determines the resizing of the image. At 0, resizing is disabled. 
309 | cfg.INPUT.MIN_SIZE_TEST = 0 310 | 311 | return cfg 312 | 313 | def format_results(self, class_names): 314 | """ 315 | Format results so they can be used by the overlay_instances function 316 | """ 317 | predictions = self.outputs['instances'] 318 | boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None 319 | scores = predictions.scores if predictions.has("scores") else None 320 | classes = predictions.pred_classes if predictions.has("pred_classes") else None 321 | 322 | labels = None 323 | if classes is not None and class_names is not None and len(class_names) > 1: 324 | labels = [class_names[i] for i in classes] 325 | if scores is not None: 326 | if labels is None: 327 | labels = ["{:.0f}%".format(s * 100) for s in scores] 328 | else: 329 | labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] 330 | 331 | masks = predictions.pred_masks.cpu().numpy() 332 | masks = [GenericMask(x, v.output.height, v.output.width) for x in masks] # NOTE: relies on the Visualizer `v` created in the main loop 333 | 334 | boxes_list = boxes.tensor.tolist() 335 | scores_list = scores.tolist() 336 | class_list = classes.tolist() 337 | 338 | for i in range(len(scores_list)): 339 | boxes_list[i].append(scores_list[i]) 340 | boxes_list[i].append(class_list[i]) 341 | 342 | 343 | boxes_list = np.array(boxes_list) 344 | 345 | return (masks, boxes, boxes_list, labels, scores_list, class_list) 346 | 347 | 348 | 349 | class OptimizedVisualizer(Visualizer): 350 | """ 351 | Altered version of Detectron2's Visualizer class which moves the boxes tensor to the CPU 352 | """ 353 | def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE): 354 | super().__init__(img_rgb, metadata, scale, instance_mode) 355 | 356 | def _convert_boxes(self, boxes): 357 | """ 358 | Convert different formats of boxes to an NxB array, where B = 4 or 5 is the box dimension. 359 | """ 360 | if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): 361 | return boxes.tensor.cpu().numpy() 362 | else: 363 | return np.asarray(boxes) 364 | 365 | 366 | 367 | class DetectedObject: 368 | """ 369 | Each instance corresponds to one object detected during the instance segmentation 370 | phase. Associated trackers, distance, position and velocity are stored as attributes 371 | of the object. 372 | masks[i], boxes[i], labels[i], scores_list[i], class_list[i] 373 | """ 374 | def __init__(self, mask, box, label, score, class_name): 375 | self.mask = mask 376 | self.box = box 377 | self.label = label 378 | self.score = score 379 | self.class_name = class_name 380 | 381 | def __str__(self): 382 | ret_str = "The pixel mask of {} represents a {} and is {}m away from the camera.\n".format(self.mask, self.class_name, self.distance) 383 | if hasattr(self, 'track'): 384 | if hasattr(self.track, 'speed'): 385 | if self.track.speed >= 0: 386 | ret_str += "The {} is travelling {}m/s towards the camera\n".format(self.class_name, self.track.speed) 387 | else: 388 | ret_str += "The {} is travelling {}m/s away from the camera\n".format(self.class_name, abs(self.track.speed)) 389 | if hasattr(self.track, 'impact_time'): 390 | ret_str += "The {} will collide in {} seconds\n".format(self.class_name, self.track.impact_time) 391 | if hasattr(self.track, 'velocity'): 392 | ret_str += "The {} is located at {} and travelling at {}m/s\n".format(self.class_name, self.track.position, self.track.velocity) 393 | return ret_str 394 | 395 | def create_vector_arrow(self): 396 | """ 397 | Creates a direction arrow drawn with the Arrow3D object. Converts the vector to a suitable size so that the direction is clear.
398 | NOTE: The magnitude of the velocity is not represented by this arrow; the arrow lengths are almost all identical 399 | """ 400 | arrow_ratio = AXES_SIZE / max(abs(self.track.velocity_vector[0]), abs(self.track.velocity_vector[1]), abs(self.track.velocity_vector[2])) 401 | self.track.v_points = [x * arrow_ratio for x in self.track.velocity_vector] 402 | 403 | 404 | 405 | class Arrow3D(FancyArrowPatch): 406 | """ 407 | Arrow used to demonstrate the direction of travel for each object 408 | """ 409 | def __init__(self, xs, ys, zs, *args, **kwargs): 410 | FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs) 411 | self._verts3d = xs, ys, zs 412 | 413 | def draw(self, renderer): 414 | xs3d, ys3d, zs3d = self._verts3d 415 | xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M) 416 | self.set_positions((xs[0],ys[0]),(xs[1],ys[1])) 417 | FancyArrowPatch.draw(self, renderer) 418 | 419 | 420 | 421 | def find_mask_centre(mask, color_image): 422 | """ 423 | Find the centre of a mask using image moments 424 | """ 425 | moments = cv2.moments(np.float32(mask)) 426 | 427 | cX = int(moments["m10"] / moments["m00"]) # assumes a non-empty mask (m00 > 0) 428 | cY = int(moments["m01"] / moments["m00"]) 429 | 430 | return cX, cY 431 | 432 | 433 | def find_median_depth(mask_area, num_median, histg): 434 | """ 435 | Iterate through the histogram bins and stop at the median value. This is the 436 | median depth of the mask. 437 | """ 438 | 439 | median_counter = 0 440 | centre_depth = "0.00" 441 | for x in range(0, len(histg)): 442 | median_counter += histg[x][0] 443 | if median_counter >= num_median: 444 | # Half of the histogram has been iterated through, so this bin contains the median. 445 | # Each bin spans MAX_RANGE/NUM_BINS = 20 depth units, i.e. 0.02 m at the usual 0.001 m/unit depth scale, so x/50 converts the bin index to metres. 446 | centre_depth = x / 50 447 | break 448 | 449 | return float(centre_depth) 450 | 451 | def debug_plots(color_image, depth_image, mask, histg, depth_colormap): 452 | """ 453 | This function is used for debugging purposes. It plots the depth color- 454 | map, the mask, the mask and depth color-map bitwise_and, and the histogram distributions 455 | of the full image and the masked image.
456 | """ 457 | full_hist = cv2.calcHist([depth_image], [0], None, [NUM_BINS], [0, MAX_RANGE]) 458 | masked_depth_image = cv2.bitwise_and(depth_colormap, depth_colormap, mask= mask) 459 | 460 | plt.figure() 461 | 462 | plt.subplot(2, 2, 1) 463 | plt.imshow(depth_colormap) 464 | 465 | plt.subplot(2, 2, 2) 466 | plt.imshow(masks[i].mask) 467 | 468 | plt.subplot(2, 2, 3).set_title(labels[i]) 469 | plt.imshow(masked_depth_image) 470 | 471 | plt.subplot(2, 2, 4) 472 | plt.plot(full_hist) 473 | plt.plot(histg) 474 | plt.xlim([0, 600]) 475 | plt.show() 476 | 477 | if __name__ == "__main__": 478 | 479 | parser = argparse.ArgumentParser() 480 | parser.add_argument('--file', help='type --file=file-name.bag to stream using file instead of webcam') 481 | args = parser.parse_args() 482 | 483 | # Initialise Detectron2 predictor 484 | predictor = Predictor() 485 | 486 | # Initialise video streams from D435 487 | video_streamer = VideoStreamer() 488 | 489 | # Initialise Kalman filter tracker from modified Sort module 490 | mot_tracker = Sort() 491 | 492 | depth_scale = video_streamer.get_depth_scale() 493 | print("Depth Scale is: {:.4f}m".format(depth_scale)) 494 | 495 | speed_time_start = time.time() 496 | 497 | video_streamer.start() 498 | time.sleep(1) 499 | 500 | while True: 501 | 502 | time_start = time.time() 503 | color_image, depth_image = video_streamer.read() 504 | detected_objects = [] 505 | 506 | t1 = time.time() 507 | 508 | camera_time = t1 - time_start 509 | 510 | predictor.create_outputs(color_image) 511 | outputs = predictor.outputs 512 | 513 | t2 = time.time() 514 | model_time = t2 - t1 515 | print("Model took {:.2f} time".format(model_time)) 516 | 517 | predictions = outputs['instances'] 518 | 519 | 520 | if outputs['instances'].has('pred_masks'): 521 | num_masks = len(predictions.pred_masks) 522 | else: 523 | # Even if no masks are found, the trackers must still be updated 524 | tracked_objects = mot_tracker.update(boxes_list) 525 | continue 526 | 527 | detectron_time = time.time() 528 | 529 | # Create a new Visualizer object from Detectron2 530 | v = OptimizedVisualizer(color_image[:, :, ::-1], MetadataCatalog.get(predictor.config.DATASETS.TRAIN[0])) 531 | 532 | masks, boxes, boxes_list, labels, scores_list, class_list = predictor.format_results(v.metadata.get("thing_classes")) 533 | 534 | for i in range(num_masks): 535 | try: 536 | detected_obj = DetectedObject(masks[i], boxes[i], labels[i], scores_list[i], class_list[i]) 537 | except: 538 | print("Object doesn't meet all parameters") 539 | 540 | detected_objects.append(detected_obj) 541 | 542 | 543 | tracked_objects = mot_tracker.update(boxes_list) 544 | 545 | 546 | v.overlay_instances( 547 | masks=masks, 548 | boxes=boxes, 549 | labels=labels, 550 | keypoints=None, 551 | assigned_colors=None, 552 | alpha=0.3 553 | ) 554 | 555 | speed_time_end = time.time() 556 | total_speed_time = speed_time_end - speed_time_start 557 | speed_time_start = time.time() 558 | for i in range(num_masks): 559 | """ 560 | Converting depth image to a histogram with num bins of NUM_BINS 561 | and depth range of (0 - MAX_RANGE millimeters) 562 | """ 563 | 564 | mask_area = detected_objects[i].mask.area() 565 | num_median = math.floor(mask_area / 2) 566 | 567 | histg = cv2.calcHist([depth_image], [0], detected_objects[i].mask.mask, [NUM_BINS], [0, MAX_RANGE]) 568 | 569 | 570 | # Uncomment this to use the debugging function 571 | #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET) 572 | #debug_plots(color_image, 
depth_image, masks[i].mask, histg, depth_colormap) 573 | 574 | centre_depth = find_median_depth(mask_area, num_median, histg) 575 | detected_objects[i].distance = centre_depth 576 | cX, cY = find_mask_centre(detected_objects[i].mask._mask, v.output) 577 | 578 | # track refers to the list which holds the index of the detected mask which matches the tracker 579 | track = mot_tracker.matched[np.where(mot_tracker.matched[:,0]==i)[0],1] 580 | 581 | if len(track) > 0: 582 | # Index of detected mask 583 | track = track[0] 584 | if i not in mot_tracker.unmatched: 585 | try: 586 | # If the tracker's distance has already been initialised - tracker has been detected previously 587 | if hasattr(mot_tracker.trackers[track], 'distance'): 588 | mot_tracker.trackers[track].set_speed(centre_depth, total_speed_time) 589 | 590 | mot_tracker.trackers[track].set_impact_time(centre_depth) 591 | 592 | if mot_tracker.trackers[track].impact_time != False and mot_tracker.trackers[track].impact_time >= 0: 593 | v.draw_text("{:.2f} seconds to impact".format(mot_tracker.trackers[track].impact_time), (cX, cY + 60)) 594 | 595 | if hasattr(mot_tracker.trackers[track], 'position'): 596 | # New 3D coordinates for current frame 597 | x1, y1, z1 = rs.rs2_deproject_pixel_to_point( 598 | video_streamer.depth_intrin, [cX, cY], centre_depth 599 | ) 600 | 601 | # Update states for tracked object 602 | mot_tracker.trackers[track].set_velocity_vector(x1, y1, z1) 603 | mot_tracker.trackers[track].set_distance_3d(x1, y1, z1) 604 | mot_tracker.trackers[track].set_velocity(total_speed_time) 605 | 606 | detected_objects[i].track = mot_tracker.trackers[track] 607 | 608 | v.draw_text("{:.2f}m/s".format(detected_objects[i].track.velocity), (cX, cY + 40)) 609 | 610 | #relative_x = (cX - 64) / RESOLUTION_X 611 | #relative_y = (abs(RESOLUTION_Y - cY) - 36) / RESOLUTION_Y 612 | 613 | 614 | # Show velocity vector arrow if velocity >= 1 m/s 615 | """ 616 | if detected_objects[i].track.velocity >= 1: 617 | ax = v.output.fig.add_axes([relative_x, relative_y, 0.1, 0.1], projection='3d') 618 | ax.set_xlim([-AXES_SIZE, AXES_SIZE]) 619 | ax.set_ylim([-AXES_SIZE, AXES_SIZE]) 620 | ax.set_zlim([-AXES_SIZE, AXES_SIZE]) 621 | 622 | #print(v_points) 623 | detected_objects[i].create_vector_arrow() 624 | a = Arrow3D([0, detected_objects[i].track.v_points[0]], [0, detected_objects[i].track.v_points[1]], [0, detected_objects[i].track.v_points[2]], mutation_scale=10, lw=1, arrowstyle="-|>", color="w") 625 | ax.add_artist(a) 626 | #ax.axis("off") 627 | ax.set_facecolor((1, 1, 1, 0)) 628 | v.output.fig.add_axes(ax) 629 | """ 630 | 631 | position = rs.rs2_deproject_pixel_to_point( 632 | video_streamer.depth_intrin, [cX, cY], centre_depth 633 | ) 634 | 635 | mot_tracker.trackers[track].set_distance(centre_depth) 636 | mot_tracker.trackers[track].set_position(position) 637 | 638 | 639 | except IndexError: 640 | continue 641 | 642 | 643 | v.draw_circle((cX, cY), (0, 0, 0)) 644 | v.draw_text("{:.2f}m".format(centre_depth), (cX, cY + 20)) 645 | 646 | 647 | #for i in detected_objects: 648 | #print(i) 649 | 650 | #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET) 651 | #cv2.imshow('Segmented Image', color_image) 652 | cv2.imshow('Segmented Image', v.output.get_image()[:, :, ::-1]) 653 | #cv2.imshow('Depth', depth_colormap) 654 | if cv2.waitKey(1) & 0xFF == ord('q'): 655 | break 656 | 657 | time_end = time.time() 658 | total_time = time_end - time_start 659 | 660 | print("Time to process frame: {:.2f}".format(total_time)) 
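# Editor's note: total_time spans the whole loop body (frame read, inference, tracking and drawing), so the FPS printed below is end-to-end throughput rather than model-only speed; model_time above isolates the inference step.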
661 | print("FPS: {:.2f}\n".format(1/total_time)) 662 | 663 | video_streamer.stop() 664 | cv2.destroyAllWindows() 665 | -------------------------------------------------------------------------------- /main_detectron2_simple_mac.py: -------------------------------------------------------------------------------- 1 | """BW: this file is all to speed up the inference step, including measures: 2 | + with smaller image size, i.e. (1280x720)-(640,480)-(640-360). 3 | + with less displayed sentences, only centers and distances are kept shown. 4 | + with larger softmax threshold, i.e. 75%. 5 | 6 | """ 7 | import numpy as np 8 | import time 9 | import cv2 10 | import pyrealsense2 as rs 11 | import random 12 | import math 13 | import argparse 14 | 15 | from threading import Thread 16 | from matplotlib import pyplot as plt 17 | from mpl_toolkits.axes_grid1.inset_locator import inset_axes 18 | from mpl_toolkits.mplot3d import proj3d 19 | from mpl_toolkits.mplot3d import Axes3D 20 | from matplotlib.patches import FancyArrowPatch 21 | from sort import * 22 | 23 | from detectron2.engine import DefaultPredictor 24 | from detectron2.config import get_cfg 25 | from detectron2.utils.visualizer import Visualizer 26 | from detectron2.utils.visualizer import GenericMask 27 | from detectron2.utils.visualizer import ColorMode 28 | from detectron2.structures import Boxes, RotatedBoxes 29 | 30 | from detectron2.data import MetadataCatalog 31 | 32 | import torch, torchvision 33 | 34 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 35 | import os 36 | import pkg_resources 37 | 38 | from detectron2.checkpoint import DetectionCheckpointer 39 | from detectron2.modeling import build_model 40 | 41 | # >>---------------------- load predefined model ------------------- 42 | class _ModelZooUrls(object): 43 | """ 44 | Mapping from names to officially released Detectron2 pre-trained models. 
45 | """ 46 | 47 | S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" 48 | 49 | # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl 50 | CONFIG_PATH_TO_URL_SUFFIX = { 51 | # COCO Detection with Faster R-CNN 52 | "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl", 53 | "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl", 54 | "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl", 55 | "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl", 56 | "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl", 57 | "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl", 58 | "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl", 59 | "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl", 60 | "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl", 61 | "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl", 62 | # COCO Detection with RetinaNet 63 | "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "190397773/model_final_bfca0b.pkl", 64 | "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "190397829/model_final_5bd44e.pkl", 65 | "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "190397697/model_final_971ab9.pkl", 66 | # COCO Detection with RPN and Fast R-CNN 67 | "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl", 68 | "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl", 69 | "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl", 70 | # COCO Instance Segmentation Baselines with Mask R-CNN 71 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl", 72 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl", 73 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl", 74 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl", 75 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl", 76 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl", 77 | "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl", 78 | "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl", 79 | "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl", 80 | "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl", # noqa 81 | # COCO Person Keypoint Detection Baselines with Keypoint R-CNN 82 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl", 83 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl", 84 | "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl", 85 | "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl", 86 | # COCO Panoptic Segmentation Baselines with Panoptic FPN 87 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl", 88 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl", 89 | "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl", 90 | # LVIS Instance 
Segmentation Baselines with Mask R-CNN 91 | "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl", 92 | "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl", 93 | "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl", # noqa 94 | # Cityscapes & Pascal VOC Baselines 95 | "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl", 96 | "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl", 97 | # Other Settings 98 | "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl", 99 | "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl", 100 | "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl", 101 | "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl", 102 | "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl", 103 | "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl", 104 | "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl", 105 | "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl", 106 | "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl", # noqa 107 | # D1 Comparisons 108 | "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl", # noqa 109 | "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl", # noqa 110 | "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl", 111 | } 112 | def get_checkpoint_url(config_path): 113 | """ 114 | Returns the URL to the model trained using the given config 115 | 116 | Args: 117 | config_path (str): config file name relative to detectron2's "configs/" 118 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 119 | 120 | Returns: 121 | str: a URL to the model 122 | """ 123 | name = config_path.replace(".yaml", "") 124 | if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX: 125 | suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path] 126 | return _ModelZooUrls.S3_PREFIX + name + "/" + suffix 127 | raise RuntimeError("{} not available in Model Zoo!".format(name)) 128 | def get_config_file(config_path): 129 | """ 130 | Returns path to a builtin config file. 131 | 132 | Args: 133 | config_path (str): config file name relative to detectron2's "configs/" 134 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 135 | 136 | Returns: 137 | str: the real path to the config file. 138 | """ 139 | cfg_file = pkg_resources.resource_filename( 140 | "detectron2.model_zoo", os.path.join("configs", config_path) 141 | ) 142 | if not os.path.exists(cfg_file): 143 | raise RuntimeError("{} not available in Model Zoo!".format(config_path)) 144 | return cfg_file 145 | def get(config_path, trained = False): 146 | """ 147 | Get a model specified by relative path under Detectron2's official ``configs/`` directory. 148 | 149 | Args: 150 | config_path (str): config file name relative to detectron2's "configs/" 151 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 152 | trained (bool): If True, will initialize the model with the trained model zoo weights. 
153 | If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used 154 | instead; this will typically (though not always) initialize a subset of weights using 155 | an ImageNet pre-trained model, while randomly initializing the other weights. 156 | 157 | Returns: 158 | nn.Module: a detectron2 model 159 | 160 | Example: 161 | :: 162 | from detectron2 import model_zoo 163 | model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) 164 | """ 165 | cfg_file = get_config_file(config_path) 166 | 167 | cfg = get_cfg() 168 | cfg.merge_from_file(cfg_file) 169 | if trained: 170 | cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) 171 | if not torch.cuda.is_available(): 172 | cfg.MODEL.DEVICE = "cpu" 173 | 174 | model = build_model(cfg) 175 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 176 | return model 177 | # E.g. # model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) 178 | # <<---------------------- load predefined model ------------------- 179 | 180 | 181 | 182 | # Resolution of camera streams 183 | RESOLUTION_X = 640 #640, 1280 184 | RESOLUTION_Y = 360 #480, 720 185 | 186 | # Configuration for histogram for depth image 187 | NUM_BINS = 500 #500 x depth_scale = e.g. 500x0.001m=50cm 188 | MAX_RANGE = 10000 #10000xdepth_scale = e.g. 10000x0.001m=10m 189 | 190 | AXES_SIZE = 10 191 | 192 | # Set test score threshold 193 | SCORE_THRESHOLD = 0.65 #vip-A higher threshold keeps fewer instances, so the loop runs faster. 194 | 195 | 196 | class VideoStreamer: 197 | """ 198 | Video streamer that takes advantage of multi-threading and continuously reads frames, 199 | so a frame is ready whenever the program requires one. 200 | """ 201 | def __init__(self, video_file=None): 202 | """ 203 | When initialised, the VideoStreamer configures its streams and is ready to read frames 204 | """ 205 | self.setup_image_config(video_file) 206 | self.configure_streams() 207 | self.stopped = False 208 | 209 | def start(self): 210 | """ 211 | Initialise the thread; the update method will run inside it 212 | """ 213 | Thread(target=self.update, args=()).start() 214 | return self 215 | 216 | def update(self): 217 | """ 218 | Constantly read frames until the stop() method is called 219 | """ 220 | while True: 221 | 222 | if self.stopped: 223 | return 224 | 225 | frames = self.pipeline.wait_for_frames() 226 | frames = self.align.process(frames) 227 | 228 | color_frame = frames.get_color_frame() 229 | depth_frame = frames.get_depth_frame() 230 | 231 | self.depth_intrin = depth_frame.profile.as_video_stream_profile().intrinsics 232 | 233 | # Convert frames to numpy arrays and cache them for read() 234 | self.color_image = np.asanyarray(color_frame.get_data()) 235 | self.depth_image = np.asanyarray(depth_frame.get_data()) 236 | 237 | 238 | def stop(self): 239 | self.pipeline.stop() 240 | self.stopped = True 241 | 242 | def read(self): 243 | return (self.color_image, self.depth_image) 244 | 245 | def setup_image_config(self, video_file=None): 246 | """ 247 | Set up config and video streams. If --file is specified as an argument, set up 248 | the stream from a file. The input of --file is a .bag file in the bag_files folder. 249 | .bag files can be created using d435_to_file in the tools folder. 250 | video_file defaults to None, in which case the stream comes from the 251 | device connected over USB.
252 | """ 253 | config = rs.config() 254 | 255 | if video_file is None: 256 | 257 | config.enable_stream(rs.stream.depth, RESOLUTION_X, RESOLUTION_Y, rs.format.z16, 30) 258 | config.enable_stream(rs.stream.color, RESOLUTION_X, RESOLUTION_Y, rs.format.bgr8, 30) 259 | else: 260 | try: 261 | config.enable_device_from_file("bag_files/{}".format(video_file)) 262 | except: 263 | print("Cannot enable device from: '{}'".format(video_file)) 264 | 265 | self.config = config 266 | 267 | def configure_streams(self): 268 | # Configure video streams 269 | self.pipeline = rs.pipeline() 270 | 271 | # Start streaming 272 | self.profile = self.pipeline.start(self.config) 273 | self.align = rs.align(rs.stream.color) 274 | 275 | def get_depth_scale(self): 276 | return self.profile.get_device().first_depth_sensor().get_depth_scale() 277 | 278 | 279 | 280 | class Predictor(DefaultPredictor): 281 | def __init__(self): 282 | self.config = self.setup_predictor_config() 283 | super().__init__(self.config) 284 | 285 | def create_outputs(self, color_image): 286 | self.outputs = self(color_image) 287 | 288 | def setup_predictor_config(self): 289 | """ 290 | Setup config and return predictor. See config/defaults.py for more options 291 | """ 292 | 293 | 294 | # cfg_file = get_config_file(config_path) 295 | 296 | cfg = get_cfg() 297 | config_path = 'COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml' 298 | cfg.merge_from_file("configs/" + config_path) 299 | pretrained = True 300 | if pretrained: 301 | cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) 302 | 303 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = SCORE_THRESHOLD 304 | # Mask R-CNN ResNet101 FPN weights 305 | ##cfg.MODEL.WEIGHTS = "model_final_a3ec72.pkl" 306 | # This determines the resizing of the image. At 0, resizing is disabled. 
307 | cfg.INPUT.MIN_SIZE_TEST = 0 308 | 309 | return cfg 310 | 311 | def format_results(self, class_names): 312 | """ 313 | Format results so they can be used by the overlay_instances function 314 | """ 315 | predictions = self.outputs['instances'] 316 | boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None 317 | scores = predictions.scores if predictions.has("scores") else None 318 | classes = predictions.pred_classes if predictions.has("pred_classes") else None 319 | 320 | labels = None 321 | if classes is not None and class_names is not None and len(class_names) > 1: 322 | labels = [class_names[i] for i in classes] 323 | if scores is not None: 324 | if labels is None: 325 | labels = ["{:.0f}%".format(s * 100) for s in scores] 326 | else: 327 | labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] 328 | 329 | masks = predictions.pred_masks.cpu().numpy() 330 | masks = [GenericMask(x, v.output.height, v.output.width) for x in masks] # NOTE: relies on the Visualizer `v` created in the main loop 331 | 332 | boxes_list = boxes.tensor.tolist() 333 | scores_list = scores.tolist() 334 | class_list = classes.tolist() 335 | 336 | for i in range(len(scores_list)): 337 | boxes_list[i].append(scores_list[i]) 338 | boxes_list[i].append(class_list[i]) 339 | 340 | 341 | boxes_list = np.array(boxes_list) 342 | 343 | return (masks, boxes, boxes_list, labels, scores_list, class_list) 344 | 345 | 346 | 347 | class OptimizedVisualizer(Visualizer): 348 | """ 349 | Altered version of Detectron2's Visualizer class which moves the boxes tensor to the CPU 350 | """ 351 | def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE): 352 | super().__init__(img_rgb, metadata, scale, instance_mode) 353 | 354 | def _convert_boxes(self, boxes): 355 | """ 356 | Convert different formats of boxes to an NxB array, where B = 4 or 5 is the box dimension. 357 | """ 358 | if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): 359 | return boxes.tensor.cpu().numpy() 360 | else: 361 | return np.asarray(boxes) 362 | 363 | 364 | 365 | class DetectedObject: 366 | """ 367 | Each instance corresponds to one object detected during the instance segmentation 368 | phase. Associated trackers, distance, position and velocity are stored as attributes 369 | of the object. 370 | masks[i], boxes[i], labels[i], scores_list[i], class_list[i] 371 | """ 372 | def __init__(self, mask, box, label, score, class_name): 373 | self.mask = mask 374 | self.box = box 375 | self.label = label 376 | self.score = score 377 | self.class_name = class_name 378 | 379 | #BW: the __str__ below is commented out for speed-up! ~5sec/frame faster.
380 |     # def __str__(self):
381 |     #     ret_str = "The pixel mask of {} represents a {} and is {}m away from the camera.\n".format(self.mask, self.class_name, self.distance)
382 |     #     if hasattr(self, 'track'):
383 |     #         if hasattr(self.track, 'speed'):
384 |     #             if self.track.speed >= 0:
385 |     #                 ret_str += "The {} is travelling {}m/s towards the camera\n".format(self.class_name, self.track.speed)
386 |     #             else:
387 |     #                 ret_str += "The {} is travelling {}m/s away from the camera\n".format(self.class_name, abs(self.track.speed))
388 |     #         if hasattr(self.track, 'impact_time'):
389 |     #             ret_str += "The {} will collide in {} seconds\n".format(self.class_name, self.track.impact_time)
390 |     #         if hasattr(self.track, 'velocity'):
391 |     #             ret_str += "The {} is located at {} and travelling at {}m/s\n".format(self.class_name, self.track.position, self.track.velocity)
392 |     #     return ret_str
393 | 
394 |     def create_vector_arrow(self):
395 |         """
396 |         Creates a direction arrow (drawn with an Arrow3D object), scaling the vector so the direction is clear.
397 |         NOTE: the arrow does not encode the velocity magnitude; arrow lengths are almost all identical.
398 |         """
399 |         arrow_ratio = AXES_SIZE / max(abs(self.track.velocity_vector[0]), abs(self.track.velocity_vector[1]), abs(self.track.velocity_vector[2]))
400 |         self.track.v_points = [x * arrow_ratio for x in self.track.velocity_vector]
401 | 
402 | 
403 | 
404 | class Arrow3D(FancyArrowPatch):
405 |     """
406 |     Arrow used to demonstrate the direction of travel for each object
407 |     """
408 |     def __init__(self, xs, ys, zs, *args, **kwargs):
409 |         FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
410 |         self._verts3d = xs, ys, zs
411 | 
412 |     def draw(self, renderer):
413 |         xs3d, ys3d, zs3d = self._verts3d
414 |         xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
415 |         self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
416 |         FancyArrowPatch.draw(self, renderer)
417 | 
418 | 
419 | 
420 | def find_mask_centre(mask, color_image):
421 |     """
422 |     Find the centre of the mask using image moments
423 |     """
424 |     moments = cv2.moments(np.float32(mask))
425 | 
426 |     cX = int(moments["m10"] / moments["m00"])
427 |     cY = int(moments["m01"] / moments["m00"])
428 | 
429 |     return cX, cY
430 | 
431 | def find_median_depth(mask_area, num_median, histg):
432 |     """
433 |     Iterate through the histogram bins until half of the mask's pixels have been
434 |     counted; that bin is the median depth of the mask.
435 |     """
436 | 
437 |     median_counter = 0
438 |     centre_depth = 0.0
439 |     for x in range(0, len(histg)):
440 |         median_counter += histg[x][0]
441 |         if median_counter >= num_median:
442 |             # Half of the histogram has been iterated through, so this bin contains the median.
443 |             # Each bin spans MAX_RANGE/NUM_BINS = 20 depth units (~2 cm at a 1 mm depth scale),
444 |             # hence bin_index / 50 converts directly to metres.
445 |             centre_depth = x / 50
446 |             break
447 | 
448 |     return float(centre_depth)
449 | 
450 | def debug_plots(color_image, depth_image, mask, histg, depth_colormap):
451 |     """
452 |     Used for debugging. Plots the depth colormap, the mask, the bitwise AND of
453 |     the mask and the depth colormap, and the histogram distributions
454 |     of the full image and the masked image.
455 |     """
456 |     full_hist = cv2.calcHist([depth_image], [0], None, [NUM_BINS], [0, MAX_RANGE])
457 |     masked_depth_image = cv2.bitwise_and(depth_colormap, depth_colormap, mask=mask)
458 | 
459 |     plt.figure()
460 | 
461 |     plt.subplot(2, 2, 1)
462 |     plt.imshow(depth_colormap)
463 | 
464 |     plt.subplot(2, 2, 2)
465 |     plt.imshow(mask)  # use the mask argument rather than the global masks[i]
466 | 
467 |     plt.subplot(2, 2, 3).set_title('masked depth colormap')
468 |     plt.imshow(masked_depth_image)
469 | 
470 |     plt.subplot(2, 2, 4)
471 |     plt.plot(full_hist)
472 |     plt.plot(histg)
473 |     plt.xlim([0, 600])
474 |     plt.show()
475 | 
476 | if __name__ == "__main__":
477 | 
478 |     parser = argparse.ArgumentParser()
479 |     parser.add_argument('--file', help='type --file=file-name.bag to stream using file instead of webcam')
480 |     args = parser.parse_args()
481 | 
482 |     # Initialise Detectron2 predictor
483 |     predictor = Predictor()
484 | 
485 |     # Initialise video streams from D435
486 |     video_streamer = VideoStreamer(args.file)  # pass --file through so .bag playback works
487 | 
488 |     # Initialise Kalman filter tracker from modified Sort module
489 |     mot_tracker = Sort()
490 | 
491 |     depth_scale = video_streamer.get_depth_scale()
492 |     print("Depth Scale is: {:.4f}m".format(depth_scale))
493 | 
494 |     speed_time_start = time.time()
495 | 
496 |     video_streamer.start()
497 |     time.sleep(1)
498 | 
499 |     while True:
500 | 
501 |         time_start = time.time()
502 |         color_image, depth_image = video_streamer.read()
503 |         detected_objects = []
504 | 
505 |         t1 = time.time()
506 | 
507 |         camera_time = t1 - time_start
508 | 
509 |         predictor.create_outputs(color_image)
510 |         outputs = predictor.outputs
511 | 
512 |         t2 = time.time()
513 |         model_time = t2 - t1
514 |         print("Model took {:.2f} seconds".format(model_time))
515 | 
516 |         predictions = outputs['instances']
517 | 
518 |         if outputs['instances'].has('pred_masks'):
519 |             num_masks = len(predictions.pred_masks)
520 |         else:
521 |             # Even if no masks are found, the trackers must still be updated,
522 |             # using an empty detection array rather than the previous frame's boxes_list
523 |             tracked_objects = mot_tracker.update(np.empty((0, 5)))
524 |             continue
525 | 
526 |         detectron_time = time.time()
527 | 
528 |         # Create a new Visualizer object from Detectron2
529 |         v = OptimizedVisualizer(color_image[:, :, ::-1], MetadataCatalog.get(predictor.config.DATASETS.TRAIN[0]))
530 | 
531 |         masks, boxes, boxes_list, labels, scores_list, class_list = predictor.format_results(v.metadata.get("thing_classes"))
532 | 
533 |         for i in range(num_masks):
534 |             try:
535 |                 detected_obj = DetectedObject(masks[i], boxes[i], labels[i], scores_list[i], class_list[i])
536 |                 detected_objects.append(detected_obj)  # append inside the try: a failed object is skipped, not duplicated
537 |             except IndexError:
538 |                 print("Object doesn't meet all parameters")
539 | 
540 | 
541 | 
542 |         tracked_objects = mot_tracker.update(boxes_list)
543 | 
544 | 
545 |         v.overlay_instances(
546 |             masks=masks,
547 |             boxes=boxes,
548 |             labels=labels,
549 |             keypoints=None,
550 |             assigned_colors=None,
551 |             alpha=0.3
552 |         )
553 | 
554 |         speed_time_end = time.time()
555 |         total_speed_time = speed_time_end - speed_time_start
556 |         speed_time_start = time.time()
557 |         for i in range(len(detected_objects)):  # len() in case an object was skipped above
558 |             """
559 |             Converting depth image to a histogram with num bins of NUM_BINS
560 |             and depth range of (0 - MAX_RANGE millimetres)
561 |             """
562 | 
563 |             mask_area = detected_objects[i].mask.area()
564 |             num_median = math.floor(mask_area / 2)
565 | 
566 |             histg = cv2.calcHist([depth_image], [0], detected_objects[i].mask.mask, [NUM_BINS], [0, MAX_RANGE])
567 | 
568 | 
569 |             # Uncomment this to use the debugging function
570 |             #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)
571 |             #debug_plots(color_image,
depth_image, masks[i].mask, histg, depth_colormap) 572 | 573 | centre_depth = find_median_depth(mask_area, num_median, histg) 574 | detected_objects[i].distance = centre_depth 575 | cX, cY = find_mask_centre(detected_objects[i].mask._mask, v.output) 576 | 577 | #BW: comment below to speed-up! 578 | # >> ------------------------------------------------------------------------------ 579 | # # Track refers to the list which holds the index of the detected mask which matches the tracker 580 | # track = mot_tracker.matched[np.where(mot_tracker.matched[:,0]==i)[0],1] 581 | # 582 | # if len(track) > 0: 583 | # # Index of detected mask 584 | # track = track[0] 585 | # if i not in mot_tracker.unmatched: 586 | # try: 587 | # # If the tracker's distance has already been initialised - tracker has been detected previously 588 | # if hasattr(mot_tracker.trackers[track], 'distance'): 589 | # mot_tracker.trackers[track].set_speed(centre_depth, total_speed_time) 590 | # 591 | # mot_tracker.trackers[track].set_impact_time(centre_depth) 592 | # 593 | # if mot_tracker.trackers[track].impact_time != False and mot_tracker.trackers[track].impact_time >= 0: 594 | # v.draw_text("{:.2f} seconds to impact".format(mot_tracker.trackers[track].impact_time), (cX, cY + 60)) 595 | # 596 | # if hasattr(mot_tracker.trackers[track], 'position'): 597 | # # New 3D coordinates for current frame 598 | # x1, y1, z1 = rs.rs2_deproject_pixel_to_point( 599 | # video_streamer.depth_intrin, [cX, cY], centre_depth 600 | # ) 601 | # 602 | # # Update states for tracked object 603 | # mot_tracker.trackers[track].set_velocity_vector(x1, y1, z1) 604 | # mot_tracker.trackers[track].set_distance_3d(x1, y1, z1) 605 | # mot_tracker.trackers[track].set_velocity(total_speed_time) 606 | # 607 | # detected_objects[i].track = mot_tracker.trackers[track] 608 | # 609 | # v.draw_text("{:.2f}m/s".format(detected_objects[i].track.velocity), (cX, cY + 40)) 610 | # # << ------------------------------------------------------------------------------ 611 | # ##relative_x = (cX - 64) / RESOLUTION_X 612 | # ##relative_y = (abs(RESOLUTION_Y - cY) - 36) / RESOLUTION_Y 613 | # 614 | # 615 | # # Show velocity vector arrow if velocity >= 1 m/s 616 | # """ 617 | # if detected_objects[i].track.velocity >= 1: 618 | # ax = v.output.fig.add_axes([relative_x, relative_y, 0.1, 0.1], projection='3d') 619 | # ax.set_xlim([-AXES_SIZE, AXES_SIZE]) 620 | # ax.set_ylim([-AXES_SIZE, AXES_SIZE]) 621 | # ax.set_zlim([-AXES_SIZE, AXES_SIZE]) 622 | # 623 | # #print(v_points) 624 | # detected_objects[i].create_vector_arrow() 625 | # a = Arrow3D([0, detected_objects[i].track.v_points[0]], [0, detected_objects[i].track.v_points[1]], [0, detected_objects[i].track.v_points[2]], mutation_scale=10, lw=1, arrowstyle="-|>", color="w") 626 | # ax.add_artist(a) 627 | # #ax.axis("off") 628 | # ax.set_facecolor((1, 1, 1, 0)) 629 | # v.output.fig.add_axes(ax) 630 | # """ 631 | # 632 | # position = rs.rs2_deproject_pixel_to_point( 633 | # video_streamer.depth_intrin, [cX, cY], centre_depth 634 | # ) 635 | # 636 | # mot_tracker.trackers[track].set_distance(centre_depth) 637 | # mot_tracker.trackers[track].set_position(position) 638 | # 639 | # 640 | # except IndexError: 641 | # continue 642 | 643 | 644 | v.draw_circle((cX, cY), (0, 0, 0)) 645 | v.draw_text("{:.2f}m".format(centre_depth), (cX, cY + 20)) 646 | 647 | 648 | #for i in detected_objects: 649 | #print(i) 650 | 651 | #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET) 652 | 
#cv2.imshow('Segmented Image', color_image) 653 | cv2.imshow('Segmented Image', v.output.get_image()[:, :, ::-1]) 654 | #cv2.imshow('Depth', depth_colormap) 655 | if cv2.waitKey(1) & 0xFF == ord('q'): 656 | break 657 | 658 | time_end = time.time() 659 | total_time = time_end - time_start 660 | 661 | print("Time to process frame: {:.2f}".format(total_time)) 662 | print("FPS: {:.2f}\n".format(1/total_time)) 663 | 664 | video_streamer.stop() 665 | cv2.destroyAllWindows() 666 | -------------------------------------------------------------------------------- /main_detectron2_simple_win10.py: -------------------------------------------------------------------------------- 1 | """BW: this file is all to speed up the inference step, including measures: 2 | + with smaller image size, i.e. (1280x720)-(640,480)-(640-360). 3 | + with less displayed sentences, only centers and distances are kept shown. 4 | + with larger softmax threshold, i.e. 75%. 5 | 6 | 7 | In Windows10 BW vip notes: 8 | 1. Regarding "cannot import name '_C' #157" error. 9 | Backgraound: **Build Detectron2 from Source** 10 | 11 | [Windows] Install Visual C++ Build tools form this link: https://answers.microsoft.com/en-us/windows/forum/windows_10-windows_install/microsoft-visual-c-140-is-required-in-windows-10/f0445e6b-d461-4e40-b44f-962622628de7. Then restart your PC, then you also need to upgrade Python setup tools, by running this command: `pip install --upgrade setuptools`. 12 | 13 | After having the above dependencies you can install detectron2 from source by running: 14 | ~~~~~bash 15 | [Note-Works in Windows10!] pip install git+https://github.com/facebookresearch/detectron2.git 16 | # (add --user if you don't have permission) 17 | 18 | # Or, to install it from a local clone: 19 | git clone https://github.com/facebookresearch/detectron2.git 20 | cd detectron2 && pip install -e . 21 | 22 | # Or if you are on macOS 23 | # CC=clang CXX=clang++ pip install -e . 24 | ~~~~~ 25 | A: So I install by pip, and run demo or example in git repo detectron in root directory, 26 | import detectron may import lib from your git root directory (not pip installation). 27 | This won't work (you want to use pip installation). 28 | (VIP-BW) You may remove detectron directory or change the name(as I do here), so python will look in pip packages. 29 | """ 30 | import numpy as np 31 | import time 32 | import cv2 33 | import pyrealsense2 as rs 34 | import random 35 | import math 36 | import argparse 37 | 38 | from threading import Thread 39 | from matplotlib import pyplot as plt 40 | from mpl_toolkits.axes_grid1.inset_locator import inset_axes 41 | from mpl_toolkits.mplot3d import proj3d 42 | from mpl_toolkits.mplot3d import Axes3D 43 | from matplotlib.patches import FancyArrowPatch 44 | from sort import * 45 | 46 | from detectron2.engine import DefaultPredictor 47 | from detectron2.config import get_cfg 48 | from detectron2.utils.visualizer import Visualizer 49 | from detectron2.utils.visualizer import GenericMask 50 | from detectron2.utils.visualizer import ColorMode 51 | from detectron2.structures import Boxes, RotatedBoxes 52 | 53 | from detectron2.data import MetadataCatalog 54 | 55 | import torch, torchvision 56 | 57 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 58 | import os 59 | import pkg_resources 60 | 61 | from detectron2.checkpoint import DetectionCheckpointer 62 | from detectron2.modeling import build_model 63 | 64 | # >>---------------------- load predefined model ------------------- 65 | class _ModelZooUrls(object): 66 | """ 67 | Mapping from names to officially released Detectron2 pre-trained models. 68 | """ 69 | 70 | S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" 71 | 72 | # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl 73 | CONFIG_PATH_TO_URL_SUFFIX = { 74 | # COCO Detection with Faster R-CNN 75 | "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl", 76 | "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl", 77 | "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl", 78 | "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl", 79 | "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl", 80 | "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl", 81 | "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl", 82 | "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl", 83 | "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl", 84 | "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl", 85 | # COCO Detection with RetinaNet 86 | "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "190397773/model_final_bfca0b.pkl", 87 | "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "190397829/model_final_5bd44e.pkl", 88 | "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "190397697/model_final_971ab9.pkl", 89 | # COCO Detection with RPN and Fast R-CNN 90 | "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl", 91 | "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl", 92 | "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl", 93 | # COCO Instance Segmentation Baselines with Mask R-CNN 94 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl", 95 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl", 96 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl", 97 | "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl", 98 | "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl", 99 | "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl", 100 | "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl", 101 | "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl", 102 | "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl", 103 | "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl", # noqa 104 | # COCO Person Keypoint Detection Baselines with Keypoint R-CNN 105 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl", 106 | "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl", 107 | "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl", 108 | "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": 
"139686956/model_final_5ad38f.pkl", 109 | # COCO Panoptic Segmentation Baselines with Panoptic FPN 110 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl", 111 | "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl", 112 | "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl", 113 | # LVIS Instance Segmentation Baselines with Mask R-CNN 114 | "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl", 115 | "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl", 116 | "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl", # noqa 117 | # Cityscapes & Pascal VOC Baselines 118 | "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl", 119 | "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl", 120 | # Other Settings 121 | "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl", 122 | "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl", 123 | "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl", 124 | "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl", 125 | "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl", 126 | "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl", 127 | "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl", 128 | "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl", 129 | "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl", # noqa 130 | # D1 Comparisons 131 | "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl", # noqa 132 | "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl", # noqa 133 | "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl", 134 | } 135 | def get_checkpoint_url(config_path): 136 | """ 137 | Returns the URL to the model trained using the given config 138 | 139 | Args: 140 | config_path (str): config file name relative to detectron2's "configs/" 141 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 142 | 143 | Returns: 144 | str: a URL to the model 145 | """ 146 | name = config_path.replace(".yaml", "") 147 | if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX: 148 | suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path] 149 | return _ModelZooUrls.S3_PREFIX + name + "/" + suffix 150 | raise RuntimeError("{} not available in Model Zoo!".format(name)) 151 | def get_config_file(config_path): 152 | """ 153 | Returns path to a builtin config file. 154 | 155 | Args: 156 | config_path (str): config file name relative to detectron2's "configs/" 157 | directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" 158 | 159 | Returns: 160 | str: the real path to the config file. 
161 |     """
162 |     cfg_file = pkg_resources.resource_filename(
163 |         "detectron2.model_zoo", os.path.join("configs", config_path)
164 |     )
165 |     if not os.path.exists(cfg_file):
166 |         raise RuntimeError("{} not available in Model Zoo!".format(config_path))
167 |     return cfg_file
168 | def get(config_path, trained = False):
169 |     """
170 |     Get a model specified by relative path under Detectron2's official ``configs/`` directory.
171 | 
172 |     Args:
173 |         config_path (str): config file name relative to detectron2's "configs/"
174 |             directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
175 |         trained (bool): If True, will initialize the model with the trained model zoo weights.
176 |             If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
177 |             instead; this will typically (though not always) initialize a subset of weights using
178 |             an ImageNet pre-trained model, while randomly initializing the other weights.
179 | 
180 |     Returns:
181 |         nn.Module: a detectron2 model
182 | 
183 |     Example:
184 |     ::
185 |         from detectron2 import model_zoo
186 |         model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
187 |     """
188 |     cfg_file = get_config_file(config_path)
189 | 
190 |     cfg = get_cfg()
191 |     cfg.merge_from_file(cfg_file)
192 |     if trained:
193 |         cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
194 |     if not torch.cuda.is_available():
195 |         cfg.MODEL.DEVICE = "cpu"
196 | 
197 |     model = build_model(cfg)
198 |     DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
199 |     return model
200 | # E.g. # model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
201 | # <<---------------------- load predefined model -------------------
202 | 
203 | 
204 | 
205 | # Resolution of camera streams
206 | RESOLUTION_X = 640 #640, 1280
207 | RESOLUTION_Y = 480 #360(BW:cannot work in this PC, min:480) #480, 720
208 | 
209 | # Configuration for histogram for depth image
210 | NUM_BINS = 500 #each bin spans MAX_RANGE/NUM_BINS = 20 depth units, i.e. 20x0.001m = 2cm at a 1mm depth scale
211 | MAX_RANGE = 10000 #10000 x depth_scale = e.g. 10000x0.001m = 10m
212 | 
213 | AXES_SIZE = 10
214 | 
215 | # Set test score threshold
216 | SCORE_THRESHOLD = 0.65 #vip-The higher the threshold, the fewer instances pass and the faster each frame is processed.
217 | 
218 | 
219 | class VideoStreamer:
220 |     """
221 |     Video streamer that takes advantage of multi-threading and continuously reads frames.
222 |     Frames are then ready to read whenever the program requires them.
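    Only the most recent color/depth frame pair is cached, so read() returns
    immediately instead of blocking on the camera.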
223 | """ 224 | def __init__(self, video_file=None): 225 | """ 226 | When initialised, VideoStreamer object should be reading frames 227 | """ 228 | self.setup_image_config(video_file) 229 | self.configure_streams() 230 | self.stopped = False 231 | 232 | def start(self): 233 | """ 234 | Initialise thread, update method will run under thread 235 | """ 236 | Thread(target=self.update, args=()).start() 237 | return self 238 | 239 | def update(self): 240 | """ 241 | Constantly read frames until stop() method is introduced 242 | """ 243 | while True: 244 | 245 | if self.stopped: 246 | return 247 | 248 | frames = self.pipeline.wait_for_frames() 249 | frames = self.align.process(frames) 250 | 251 | color_frame = frames.get_color_frame() 252 | depth_frame = frames.get_depth_frame() 253 | 254 | self.depth_intrin = depth_frame.profile.as_video_stream_profile().intrinsics 255 | 256 | # Convert image to numpy array and initialise images 257 | self.color_image = np.asanyarray(color_frame.get_data()) 258 | self.depth_image = np.asanyarray(depth_frame.get_data()) 259 | 260 | 261 | def stop(self): 262 | self.pipeline.stop() 263 | self.stopped = True 264 | 265 | def read(self): 266 | return (self.color_image, self.depth_image) 267 | 268 | def setup_image_config(self, video_file=None): 269 | """ 270 | Setup config and video steams. If --file is specified as an argument, setup 271 | stream from file. The input of --file is a .bag file in the bag_files folder. 272 | .bag files can be created using d435_to_file in the tools folder. 273 | video_file is by default None, and thus will by default stream from the 274 | device connected to the USB. 275 | """ 276 | config = rs.config() 277 | 278 | if video_file is None: 279 | 280 | config.enable_stream(rs.stream.depth, RESOLUTION_X, RESOLUTION_Y, rs.format.z16, 30) 281 | config.enable_stream(rs.stream.color, RESOLUTION_X, RESOLUTION_Y, rs.format.bgr8, 30) 282 | else: 283 | try: 284 | config.enable_device_from_file("bag_files/{}".format(video_file)) 285 | except: 286 | print("Cannot enable device from: '{}'".format(video_file)) 287 | 288 | self.config = config 289 | 290 | def configure_streams(self): 291 | # Configure video streams 292 | self.pipeline = rs.pipeline() 293 | 294 | # Start streaming 295 | self.profile = self.pipeline.start(self.config) 296 | self.align = rs.align(rs.stream.color) 297 | 298 | def get_depth_scale(self): 299 | return self.profile.get_device().first_depth_sensor().get_depth_scale() 300 | 301 | 302 | 303 | class Predictor(DefaultPredictor): 304 | def __init__(self): 305 | self.config = self.setup_predictor_config() 306 | super().__init__(self.config) 307 | 308 | def create_outputs(self, color_image): 309 | self.outputs = self(color_image) 310 | 311 | def setup_predictor_config(self): 312 | """ 313 | Setup config and return predictor. See config/defaults.py for more options 314 | """ 315 | 316 | # cfg_file = get_config_file(config_path) 317 | 318 | cfg = get_cfg() 319 | config_path = 'COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml' 320 | cfg.merge_from_file("configs/" + config_path) 321 | pretrained = True 322 | if pretrained: 323 | cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) 324 | 325 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = SCORE_THRESHOLD 326 | # Mask R-CNN ResNet101 FPN weights 327 | ##cfg.MODEL.WEIGHTS = "model_final_a3ec72.pkl" #Load local model 328 | # This determines the resizing of the image. At 0, resizing is disabled. 
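        # (Feeding the 640x480 stream at native size, rather than upscaling to
        # Detectron2's default 800px test size, is one of the speed-up measures
        # listed in the module docstring.)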
329 | cfg.INPUT.MIN_SIZE_TEST = 0 330 | 331 | return cfg 332 | 333 | def format_results(self, class_names): 334 | """ 335 | Format results so they can be used by overlay_instances function 336 | """ 337 | predictions = self.outputs['instances'] 338 | boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None 339 | scores = predictions.scores if predictions.has("scores") else None 340 | classes = predictions.pred_classes if predictions.has("pred_classes") else None 341 | 342 | labels = None 343 | if classes is not None and class_names is not None and len(class_names) > 1: 344 | labels = [class_names[i] for i in classes] 345 | if scores is not None: 346 | if labels is None: 347 | labels = ["{:.0f}%".format(s * 100) for s in scores] 348 | else: 349 | labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] 350 | 351 | masks = predictions.pred_masks.cpu().numpy() 352 | masks = [GenericMask(x, v.output.height, v.output.width) for x in masks] 353 | 354 | boxes_list = boxes.tensor.tolist() 355 | scores_list = scores.tolist() 356 | class_list = classes.tolist() 357 | 358 | for i in range(len(scores_list)): 359 | boxes_list[i].append(scores_list[i]) 360 | boxes_list[i].append(class_list[i]) 361 | 362 | 363 | boxes_list = np.array(boxes_list) 364 | 365 | return (masks, boxes, boxes_list, labels, scores_list, class_list) 366 | 367 | 368 | 369 | class OptimizedVisualizer(Visualizer): 370 | """ 371 | Detectron2's altered Visualizer class which converts boxes tensor to cpu 372 | """ 373 | def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE): 374 | super().__init__(img_rgb, metadata, scale, instance_mode) 375 | 376 | def _convert_boxes(self, boxes): 377 | """ 378 | Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. 379 | """ 380 | if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): 381 | return boxes.tensor.cpu().numpy() 382 | else: 383 | return np.asarray(boxes) 384 | 385 | 386 | 387 | class DetectedObject: 388 | """ 389 | Each object corresponds to all objects detected during the instance segmentation 390 | phase. Associated trackers, distance, position and velocity are stored as attributes 391 | of the object. 392 | masks[i], boxes[i], labels[i], scores_list[i], class_list[i] 393 | """ 394 | def __init__(self, mask, box, label, score, class_name): 395 | self.mask = mask 396 | self.box = box 397 | self.label = label 398 | self.score = score 399 | self.class_name = class_name 400 | 401 | #BW: comment below for speed-up! ~5sec/frame faster. 
402 |     # def __str__(self):
403 |     #     ret_str = "The pixel mask of {} represents a {} and is {}m away from the camera.\n".format(self.mask, self.class_name, self.distance)
404 |     #     if hasattr(self, 'track'):
405 |     #         if hasattr(self.track, 'speed'):
406 |     #             if self.track.speed >= 0:
407 |     #                 ret_str += "The {} is travelling {}m/s towards the camera\n".format(self.class_name, self.track.speed)
408 |     #             else:
409 |     #                 ret_str += "The {} is travelling {}m/s away from the camera\n".format(self.class_name, abs(self.track.speed))
410 |     #         if hasattr(self.track, 'impact_time'):
411 |     #             ret_str += "The {} will collide in {} seconds\n".format(self.class_name, self.track.impact_time)
412 |     #         if hasattr(self.track, 'velocity'):
413 |     #             ret_str += "The {} is located at {} and travelling at {}m/s\n".format(self.class_name, self.track.position, self.track.velocity)
414 |     #     return ret_str
415 | 
416 |     def create_vector_arrow(self):
417 |         """
418 |         Creates a direction arrow (drawn with an Arrow3D object), scaling the vector so the direction is clear.
419 |         NOTE: the arrow does not encode the velocity magnitude; arrow lengths are almost all identical.
420 |         """
421 |         arrow_ratio = AXES_SIZE / max(abs(self.track.velocity_vector[0]), abs(self.track.velocity_vector[1]), abs(self.track.velocity_vector[2]))
422 |         self.track.v_points = [x * arrow_ratio for x in self.track.velocity_vector]
423 | 
424 | 
425 | 
426 | class Arrow3D(FancyArrowPatch):
427 |     """
428 |     Arrow used to demonstrate the direction of travel for each object
429 |     """
430 |     def __init__(self, xs, ys, zs, *args, **kwargs):
431 |         FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
432 |         self._verts3d = xs, ys, zs
433 | 
434 |     def draw(self, renderer):
435 |         xs3d, ys3d, zs3d = self._verts3d
436 |         xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
437 |         self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
438 |         FancyArrowPatch.draw(self, renderer)
439 | 
440 | 
441 | 
442 | def find_mask_centre(mask, color_image):
443 |     """
444 |     Find the centre of the mask using image moments
445 |     """
446 |     moments = cv2.moments(np.float32(mask))
447 | 
448 |     cX = int(moments["m10"] / moments["m00"])
449 |     cY = int(moments["m01"] / moments["m00"])
450 | 
451 |     return cX, cY
452 | 
453 | def find_median_depth(mask_area, num_median, histg):
454 |     """
455 |     Iterate through the histogram bins until half of the mask's pixels have been
456 |     counted; that bin is the median depth of the mask.
457 |     """
458 | 
459 |     median_counter = 0
460 |     centre_depth = 0.0
461 |     for x in range(0, len(histg)):
462 |         median_counter += histg[x][0]
463 |         if median_counter >= num_median:
464 |             # Half of the histogram has been iterated through, so this bin contains the median.
465 |             # Each bin spans MAX_RANGE/NUM_BINS = 20 depth units (~2 cm at a 1 mm depth scale),
466 |             # hence bin_index / 50 converts directly to metres.
467 |             centre_depth = x / 50
468 |             break
469 | 
470 |     return float(centre_depth)
471 | 
472 | def debug_plots(color_image, depth_image, mask, histg, depth_colormap):
473 |     """
474 |     Used for debugging. Plots the depth colormap, the mask, the bitwise AND of
475 |     the mask and the depth colormap, and the histogram distributions
476 |     of the full image and the masked image.
477 |     """
478 |     full_hist = cv2.calcHist([depth_image], [0], None, [NUM_BINS], [0, MAX_RANGE])
479 |     masked_depth_image = cv2.bitwise_and(depth_colormap, depth_colormap, mask=mask)
480 | 
481 |     plt.figure()
482 | 
483 |     plt.subplot(2, 2, 1)
484 |     plt.imshow(depth_colormap)
485 | 
486 |     plt.subplot(2, 2, 2)
487 |     plt.imshow(mask)  # use the mask argument rather than the global masks[i]
488 | 
489 |     plt.subplot(2, 2, 3).set_title('masked depth colormap')
490 |     plt.imshow(masked_depth_image)
491 | 
492 |     plt.subplot(2, 2, 4)
493 |     plt.plot(full_hist)
494 |     plt.plot(histg)
495 |     plt.xlim([0, 600])
496 |     plt.show()
497 | 
498 | if __name__ == "__main__":
499 | 
500 |     parser = argparse.ArgumentParser()
501 |     parser.add_argument('--file', help='type --file=file-name.bag to stream using file instead of webcam')
502 |     args = parser.parse_args()
503 | 
504 |     # Initialise Detectron2 predictor
505 |     predictor = Predictor()
506 | 
507 |     # Initialise video streams from D435
508 |     video_streamer = VideoStreamer(args.file)  # pass --file through so .bag playback works
509 | 
510 |     # Initialise Kalman filter tracker from modified Sort module
511 |     mot_tracker = Sort()
512 | 
513 |     depth_scale = video_streamer.get_depth_scale()
514 |     print("Depth Scale is: {:.4f}m".format(depth_scale))
515 | 
516 |     speed_time_start = time.time()
517 | 
518 |     video_streamer.start()
519 |     time.sleep(1)
520 | 
521 |     while True:
522 | 
523 |         time_start = time.time()
524 |         color_image, depth_image = video_streamer.read()
525 |         detected_objects = []
526 | 
527 |         t1 = time.time()
528 | 
529 |         camera_time = t1 - time_start
530 | 
531 |         predictor.create_outputs(color_image)
532 |         outputs = predictor.outputs
533 | 
534 |         t2 = time.time()
535 |         model_time = t2 - t1
536 |         print("Model took {:.2f} seconds".format(model_time))
537 | 
538 |         predictions = outputs['instances']
539 | 
540 |         if outputs['instances'].has('pred_masks'):
541 |             num_masks = len(predictions.pred_masks)
542 |         else:
543 |             # Even if no masks are found, the trackers must still be updated,
544 |             # using an empty detection array rather than the previous frame's boxes_list
545 |             tracked_objects = mot_tracker.update(np.empty((0, 5)))
546 |             continue
547 | 
548 |         detectron_time = time.time()
549 | 
550 |         # Create a new Visualizer object from Detectron2
551 |         v = OptimizedVisualizer(color_image[:, :, ::-1], MetadataCatalog.get(predictor.config.DATASETS.TRAIN[0]))
552 | 
553 |         masks, boxes, boxes_list, labels, scores_list, class_list = predictor.format_results(v.metadata.get("thing_classes"))
554 | 
555 |         for i in range(num_masks):
556 |             try:
557 |                 detected_obj = DetectedObject(masks[i], boxes[i], labels[i], scores_list[i], class_list[i])
558 |                 detected_objects.append(detected_obj)  # append inside the try: a failed object is skipped, not duplicated
559 |             except IndexError:
560 |                 print("Object doesn't meet all parameters")
561 | 
562 | 
563 | 
564 |         tracked_objects = mot_tracker.update(boxes_list)
565 | 
566 | 
567 |         v.overlay_instances(
568 |             masks=masks,
569 |             boxes=boxes,
570 |             labels=labels,
571 |             keypoints=None,
572 |             assigned_colors=None,
573 |             alpha=0.3
574 |         )
575 | 
576 |         speed_time_end = time.time()
577 |         total_speed_time = speed_time_end - speed_time_start
578 |         speed_time_start = time.time()
579 |         for i in range(len(detected_objects)):  # len() in case an object was skipped above
580 |             """
581 |             Converting depth image to a histogram with num bins of NUM_BINS
582 |             and depth range of (0 - MAX_RANGE millimetres)
583 |             """
584 | 
585 |             mask_area = detected_objects[i].mask.area()
586 |             num_median = math.floor(mask_area / 2)
587 | 
588 |             histg = cv2.calcHist([depth_image], [0], detected_objects[i].mask.mask, [NUM_BINS], [0, MAX_RANGE])
589 | 
590 | 
591 |             # Uncomment this to use the debugging function
592 |             #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)
593 |             #debug_plots(color_image,
depth_image, masks[i].mask, histg, depth_colormap) 594 | 595 | centre_depth = find_median_depth(mask_area, num_median, histg) 596 | detected_objects[i].distance = centre_depth 597 | cX, cY = find_mask_centre(detected_objects[i].mask._mask, v.output) 598 | 599 | #BW: comment below to speed-up! 600 | # >> ------------------------------------------------------------------------------ 601 | # # Track refers to the list which holds the index of the detected mask which matches the tracker 602 | # track = mot_tracker.matched[np.where(mot_tracker.matched[:,0]==i)[0],1] 603 | # 604 | # if len(track) > 0: 605 | # # Index of detected mask 606 | # track = track[0] 607 | # if i not in mot_tracker.unmatched: 608 | # try: 609 | # # If the tracker's distance has already been initialised - tracker has been detected previously 610 | # if hasattr(mot_tracker.trackers[track], 'distance'): 611 | # mot_tracker.trackers[track].set_speed(centre_depth, total_speed_time) 612 | # 613 | # mot_tracker.trackers[track].set_impact_time(centre_depth) 614 | # 615 | # if mot_tracker.trackers[track].impact_time != False and mot_tracker.trackers[track].impact_time >= 0: 616 | # v.draw_text("{:.2f} seconds to impact".format(mot_tracker.trackers[track].impact_time), (cX, cY + 60)) 617 | # 618 | # if hasattr(mot_tracker.trackers[track], 'position'): 619 | # # New 3D coordinates for current frame 620 | # x1, y1, z1 = rs.rs2_deproject_pixel_to_point( 621 | # video_streamer.depth_intrin, [cX, cY], centre_depth 622 | # ) 623 | # 624 | # # Update states for tracked object 625 | # mot_tracker.trackers[track].set_velocity_vector(x1, y1, z1) 626 | # mot_tracker.trackers[track].set_distance_3d(x1, y1, z1) 627 | # mot_tracker.trackers[track].set_velocity(total_speed_time) 628 | # 629 | # detected_objects[i].track = mot_tracker.trackers[track] 630 | # 631 | # v.draw_text("{:.2f}m/s".format(detected_objects[i].track.velocity), (cX, cY + 40)) 632 | # # << ------------------------------------------------------------------------------ 633 | # ##relative_x = (cX - 64) / RESOLUTION_X 634 | # ##relative_y = (abs(RESOLUTION_Y - cY) - 36) / RESOLUTION_Y 635 | # 636 | # 637 | # # Show velocity vector arrow if velocity >= 1 m/s 638 | # """ 639 | # if detected_objects[i].track.velocity >= 1: 640 | # ax = v.output.fig.add_axes([relative_x, relative_y, 0.1, 0.1], projection='3d') 641 | # ax.set_xlim([-AXES_SIZE, AXES_SIZE]) 642 | # ax.set_ylim([-AXES_SIZE, AXES_SIZE]) 643 | # ax.set_zlim([-AXES_SIZE, AXES_SIZE]) 644 | # 645 | # #print(v_points) 646 | # detected_objects[i].create_vector_arrow() 647 | # a = Arrow3D([0, detected_objects[i].track.v_points[0]], [0, detected_objects[i].track.v_points[1]], [0, detected_objects[i].track.v_points[2]], mutation_scale=10, lw=1, arrowstyle="-|>", color="w") 648 | # ax.add_artist(a) 649 | # #ax.axis("off") 650 | # ax.set_facecolor((1, 1, 1, 0)) 651 | # v.output.fig.add_axes(ax) 652 | # """ 653 | # 654 | # position = rs.rs2_deproject_pixel_to_point( 655 | # video_streamer.depth_intrin, [cX, cY], centre_depth 656 | # ) 657 | # 658 | # mot_tracker.trackers[track].set_distance(centre_depth) 659 | # mot_tracker.trackers[track].set_position(position) 660 | # 661 | # 662 | # except IndexError: 663 | # continue 664 | 665 | 666 | v.draw_circle((cX, cY), (0, 0, 0)) 667 | v.draw_text("{:.2f}m".format(centre_depth), (cX, cY + 20)) 668 | 669 | 670 | #for i in detected_objects: 671 | #print(i) 672 | 673 | #depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET) 674 | 
#cv2.imshow('Segmented Image', color_image)
675 |         cv2.imshow('Segmented Image', v.output.get_image()[:, :, ::-1])
676 |         #cv2.imshow('Depth', depth_colormap)
677 |         if cv2.waitKey(1) & 0xFF == ord('q'):
678 |             break
679 | 
680 |         time_end = time.time()
681 |         total_time = time_end - time_start
682 | 
683 |         print("Time to process frame: {:.2f}".format(total_time))
684 |         print("FPS: {:.2f}\n".format(1/total_time))
685 | 
686 |     video_streamer.stop()
687 |     cv2.destroyAllWindows()
688 | 
--------------------------------------------------------------------------------
/sort.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | 
3 | from numba import jit
4 | import os.path
5 | import numpy as np
6 | ##import matplotlib.pyplot as plt #re-enable (with patches below) to run the __main__ demo with --display
7 | ##import matplotlib.patches as patches
8 | from skimage import io
9 | from sklearn.utils.linear_assignment_ import linear_assignment #removed in scikit-learn >= 0.23; scipy.optimize.linear_sum_assignment is the modern replacement
10 | import glob
11 | import time
12 | import math
13 | import argparse
14 | from filterpy.kalman import KalmanFilter
15 | 
16 | @jit
17 | def iou(bb_test,bb_gt):
18 |   """
19 |   Computes IoU between two bboxes in the form [x1,y1,x2,y2]
20 |   """
21 |   xx1 = np.maximum(bb_test[0], bb_gt[0])
22 |   yy1 = np.maximum(bb_test[1], bb_gt[1])
23 |   xx2 = np.minimum(bb_test[2], bb_gt[2])
24 |   yy2 = np.minimum(bb_test[3], bb_gt[3])
25 |   w = np.maximum(0., xx2 - xx1)
26 |   h = np.maximum(0., yy2 - yy1)
27 |   wh = w * h
28 |   o = wh / ((bb_test[2]-bb_test[0])*(bb_test[3]-bb_test[1])
29 |     + (bb_gt[2]-bb_gt[0])*(bb_gt[3]-bb_gt[1]) - wh)
30 |   return(o)
31 | 
32 | def convert_bbox_to_z(bbox):
33 |   """
34 |   Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form
35 |     [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is
36 |   the aspect ratio
37 |   """
38 |   w = bbox[2]-bbox[0]
39 |   h = bbox[3]-bbox[1]
40 |   x = bbox[0]+w/2.
41 |   y = bbox[1]+h/2.
42 |   s = w*h    #scale is just area
43 |   r = w/float(h)
44 |   return np.array([x,y,s,r]).reshape((4,1))
45 | 
46 | def convert_x_to_bbox(x,score=None):
47 |   """
48 |   Takes a bounding box in the centre form [x,y,s,r] and returns it in the form
49 |     [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right
50 |   """
51 |   w = np.sqrt(x[2]*x[3])
52 |   h = x[2]/w
53 |   if(score==None):
54 |     return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.]).reshape((1,4))
55 |   else:
56 |     return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.,score]).reshape((1,5))
57 | 
58 | 
59 | class KalmanBoxTracker(object):
60 |   """
61 |   This class represents the internal state of individual tracked objects observed as bbox.
62 |   """
63 |   count = 0
64 |   def __init__(self,bbox):
65 |     """
66 |     Initialises a tracker using initial bounding box.
67 |     """
68 |     #define constant velocity model
69 |     self.kf = KalmanFilter(dim_x=7, dim_z=4)
70 |     self.kf.F = np.array([[1,0,0,0,1,0,0],[0,1,0,0,0,1,0],[0,0,1,0,0,0,1],[0,0,0,1,0,0,0],  [0,0,0,0,1,0,0],[0,0,0,0,0,1,0],[0,0,0,0,0,0,1]])
71 |     self.kf.H = np.array([[1,0,0,0,0,0,0],[0,1,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,0,1,0,0,0]])
72 | 
73 |     self.kf.R[2:,2:] *= 10.
74 |     self.kf.P[4:,4:] *= 1000. #give high uncertainty to the unobservable initial velocities
75 |     self.kf.P *= 10.
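    # State is [u, v, s, r, u', v', s']: box centre (u, v), scale/area s and aspect
    # ratio r, plus velocities for u, v and s (r is assumed constant). F above is the
    # constant-velocity transition matrix and H observes the first four states; the Q
    # scaling below damps process noise on the unobserved velocity components.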
76 | self.kf.Q[-1,-1] *= 0.01 77 | self.kf.Q[4:,4:] *= 0.01 78 | 79 | self.kf.x[:4] = convert_bbox_to_z(bbox) 80 | self.time_since_update = 0 81 | self.id = KalmanBoxTracker.count 82 | KalmanBoxTracker.count += 1 83 | self.history = [] 84 | self.hits = 0 85 | self.hit_streak = 0 86 | self.age = 0 87 | self.objclass = bbox[5] 88 | self.matches = {} 89 | 90 | def update(self,bbox): 91 | """ 92 | Updates the state vector with observed bbox. 93 | """ 94 | self.time_since_update = 0 95 | self.history = [] 96 | self.hits += 1 97 | self.hit_streak += 1 98 | self.kf.update(convert_bbox_to_z(bbox)) 99 | 100 | def predict(self): 101 | """ 102 | Advances the state vector and returns the predicted bounding box estimate. 103 | """ 104 | if((self.kf.x[6]+self.kf.x[2])<=0): 105 | self.kf.x[6] *= 0.0 106 | self.kf.predict() 107 | self.age += 1 108 | if(self.time_since_update>0): 109 | self.hit_streak = 0 110 | self.time_since_update += 1 111 | self.history.append(convert_x_to_bbox(self.kf.x)) 112 | return self.history[-1] 113 | 114 | def get_state(self): 115 | """ 116 | Returns the current bounding box estimate. 117 | """ 118 | return convert_x_to_bbox(self.kf.x) 119 | 120 | def change_matches(self, matches): 121 | self.matches = {} 122 | d, dets = matches 123 | d = d[0] 124 | dets = dets.tolist() 125 | self.matches[d] = dets 126 | #print("d: {}\ndets: {}".format(d, dets)) 127 | 128 | def set_velocity_vector(self, x1, y1, z1): 129 | """ 130 | Sets velocity vector based on old and new positions (x1, y1, z1) 131 | """ 132 | self.velocity_vector = [x1 - self.position[0], y1 - self.position[1], z1 - self.position[2]] 133 | 134 | def set_distance_3d(self, x1, y1, z1): 135 | """ 136 | Return 3D distance of object from old position to its new position (x1, y1, z1) 137 | """ 138 | self.distance_3d = math.sqrt((x1 - self.position[0])**2 + (y1 - self.position[1])**2 + (z1 - self.position[2])**2) 139 | 140 | def set_velocity(self, total_time): 141 | """ 142 | Set velocity based on 3D distance and total time between each frame 143 | """ 144 | self.velocity = self.distance_3d / (total_time) 145 | 146 | def set_speed(self, new_distance, total_time): 147 | """ 148 | Set linear speed of object with regards to camera 149 | """ 150 | self.speed = (self.distance - new_distance)/total_time 151 | 152 | def set_impact_time(self, distance): 153 | """ 154 | Calculate time until impact from robot to camera 155 | """ 156 | try: 157 | self.impact_time = distance / self.speed 158 | except: 159 | self.impact_time = False 160 | 161 | def set_distance(self, distance): 162 | """ 163 | Set distance of object from camera 164 | """ 165 | self.distance = distance 166 | 167 | def set_position(self, position): 168 | """ 169 | Set real-world coordinate position for object 170 | """ 171 | self.position = position 172 | 173 | def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3): 174 | """ 175 | Assigns detections to tracked object (both represented as bounding boxes) 176 | Returns 3 lists of matches, unmatched_detections and unmatched_trackers 177 | """ 178 | if(len(trackers)==0): 179 | return np.empty((0,2),dtype=int), np.arange(len(detections)), np.empty((0,5),dtype=int) 180 | iou_matrix = np.zeros((len(detections),len(trackers)),dtype=np.float32) 181 | 182 | for d,det in enumerate(detections): 183 | for t,trk in enumerate(trackers): 184 | iou_matrix[d,t] = iou(det,trk) 185 | matched_indices = linear_assignment(-iou_matrix) 186 | 187 | unmatched_detections = [] 188 | for d,det in enumerate(detections): 189 | if(d not 
in matched_indices[:,0]):
190 |       unmatched_detections.append(d)
191 |   unmatched_trackers = []
192 |   for t,trk in enumerate(trackers):
193 |     if(t not in matched_indices[:,1]):
194 |       unmatched_trackers.append(t)
195 | 
196 |   #filter out matched with low IOU
197 |   matches = []
198 |   for m in matched_indices:
199 |     if(iou_matrix[m[0],m[1]]<iou_threshold):
200 |       unmatched_detections.append(m[0])
201 |       unmatched_trackers.append(m[1])
202 |     else:
203 |       matches.append(m.reshape(1,2))
204 |   if(len(matches)==0):
205 |     matches = np.empty((0,2),dtype=int)
206 |   else:
207 |     matches = np.concatenate(matches,axis=0)
208 | 
209 |   return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
210 | 
211 | 
212 | class Sort(object):
213 |   def __init__(self,max_age=1,min_hits=3):
214 |     """
215 |     Sets key parameters for SORT
216 |     """
217 |     self.max_age = max_age
218 |     self.min_hits = min_hits
219 |     self.trackers = []
220 |     self.frame_count = 0
221 | 
222 |   def update(self,dets):
223 |     """
224 |     Params:
225 |       dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]
226 |     Requires: this method must be called once for each frame even with empty detections.
227 |     Returns a similar array, where the last column is the object ID.
228 |     NOTE: The number of objects returned may differ from the number of detections provided.
229 |     """
230 |     # NOTE: this span (lines 200-268) was garbled in the dump and has been reconstructed
231 |     # from the reference SORT implementation (https://github.com/abewley/sort).
232 |     # Exposing self.matched / self.unmatched (read by main_detectron2_*.py) and the
233 |     # change_matches() call are assumed to be this repo's modifications.
234 |     self.frame_count += 1
235 |     #get predicted locations from existing trackers.
236 |     trks = np.zeros((len(self.trackers),5))
237 |     to_del = []
238 |     ret = []
239 |     for t,trk in enumerate(trks):
240 |       pos = self.trackers[t].predict()[0]
241 |       trk[:] = [pos[0], pos[1], pos[2], pos[3], 0]
242 |       if(np.any(np.isnan(pos))):
243 |         to_del.append(t)
244 |     trks = np.ma.compress_rows(np.ma.masked_invalid(trks))
245 |     for t in reversed(to_del):
246 |       self.trackers.pop(t)
247 | 
248 |     # matched is an Nx2 array of [detection_index, tracker_index] pairs; unmatched
249 |     # holds the indices of detections that no existing tracker could be assigned to
250 |     self.matched, self.unmatched, unmatched_trks = associate_detections_to_trackers(dets,trks)
251 | 
252 |     #update matched trackers with assigned detections
253 |     for t,trk in enumerate(self.trackers):
254 |       if(t not in unmatched_trks):
255 |         d = self.matched[np.where(self.matched[:,1]==t)[0],0]
256 |         trk.update(dets[d,:][0])
257 |         trk.change_matches((d, dets[d,:][0]))
258 | 
259 |     #create and initialise new trackers for unmatched detections
260 |     for i in self.unmatched:
261 |       trk = KalmanBoxTracker(dets[i,:])
262 |       self.trackers.append(trk)
263 | 
264 |     # report trackers updated this frame with enough consecutive hits (or while the
265 |     # sequence is warming up); iterate in reverse so dead tracklets can be popped
266 |     i = len(self.trackers)
267 |     for trk in reversed(self.trackers):
268 |       d = trk.get_state()[0]
269 |       if((trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits)):
270 |         ret.append(np.concatenate((d,[trk.id+1], [trk.objclass])).reshape(1,-1)) # +1 as MOT benchmark requires positive
271 |       i -= 1
272 |       #remove dead tracklet
273 |       if(trk.time_since_update > self.max_age):
274 |         self.trackers.pop(i)
275 |     if(len(ret)>0):
276 |       #print(trk.objclass)
277 |       return np.concatenate(ret)
278 |     return np.empty((0,5))
279 | 
280 | def parse_args():
281 |     """Parse input arguments."""
282 |     parser = argparse.ArgumentParser(description='SORT demo')
283 |     parser.add_argument('--display', dest='display', help='Display online tracker output (slow) [False]',action='store_true')
284 |     args = parser.parse_args()
285 |     return args
286 | 
287 | if __name__ == '__main__':
288 |   # all train
289 |   sequences = ['PETS09-S2L1','TUD-Campus','TUD-Stadtmitte','ETH-Bahnhof','ETH-Sunnyday','ETH-Pedcross2','KITTI-13','KITTI-17','ADL-Rundle-6','ADL-Rundle-8','Venice-2']
290 |   args = parse_args()
291 |   display = args.display
292 |   phase = 'train'
293 |   total_time = 0.0
294 |   total_frames = 0
295 |   colours = np.random.rand(32,3) #used only for display
296 |   if(display):
297 |     if not os.path.exists('mot_benchmark'):
298 |       print('\n\tERROR: mot_benchmark link not found!\n\n    Create a symbolic link to the MOT benchmark\n    (https://motchallenge.net/data/2D_MOT_2015/#download). E.g.:\n\n    $ ln -s /path/to/MOT2015_challenge/2DMOT2015 mot_benchmark\n\n')
299 |       exit()
300 |     plt.ion()
301 |     fig = plt.figure()
302 | 
303 |   if not os.path.exists('output'):
304 |     os.makedirs('output')
305 | 
306 |   for seq in sequences:
307 |     mot_tracker = Sort() #create instance of the SORT tracker
308 |     seq_dets = np.loadtxt('data/%s/det.txt'%(seq),delimiter=',') #load detections
309 |     with open('output/%s.txt'%(seq),'w') as out_file:
310 |       print("Processing %s."%(seq))
311 |       for frame in range(int(seq_dets[:,0].max())):
312 |         frame += 1 #detection and frame numbers begin at 1
313 |         dets = seq_dets[seq_dets[:,0]==frame,2:7]
314 |         dets[:,2:4] += dets[:,0:2] #convert from [x1,y1,w,h] to [x1,y1,x2,y2]
315 |         total_frames += 1
316 | 
317 |         if(display):
318 |           ax1 = fig.add_subplot(111, aspect='equal')
319 |           fn = 'mot_benchmark/%s/%s/img1/%06d.jpg'%(phase,seq,frame)
320 |           im = io.imread(fn)
321 |           ax1.imshow(im)
322 |           plt.title(seq+' Tracked Targets')
323 | 
324 |         start_time = time.time()
325 |         trackers = mot_tracker.update(dets)
326 |         cycle_time = time.time() - start_time
327 |         total_time += cycle_time
328 | 
329 |         for d in trackers:
330 |           print('%d,%d,%.2f,%.2f,%.2f,%.2f,1,-1,-1,-1'%(frame,d[4],d[0],d[1],d[2]-d[0],d[3]-d[1]),file=out_file)
331 |           if(display):
332 |             d = d.astype(np.int32)
333 |             ax1.add_patch(patches.Rectangle((d[0],d[1]),d[2]-d[0],d[3]-d[1],fill=False,lw=3,ec=colours[d[4]%32,:]))
334 |             ax1.set_adjustable('box-forced')
335 | 
336 |         if(display):
337 |           fig.canvas.flush_events()
338 |           plt.draw()
339 |           ax1.cla()
340 | 
341 |   print("Total Tracking took: %.3f for %d frames or %.1f FPS"%(total_time,total_frames,total_frames/total_time))
342 |   if(display):
343 |     print("Note: to get real runtime results run without the option: --display")
--------------------------------------------------------------------------------