├── .gitignore ├── LICENSE.txt ├── README.md ├── __init__.py ├── configs ├── .gitignore ├── README.md ├── __init__.py ├── configAVD1.py ├── configAVD2.py └── configAVD3.py ├── evaluation ├── __init__.py ├── coco_det_eval.py ├── cocoapi │ ├── .gitignore │ ├── PythonAPI │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── pycocoDemo.ipynb │ │ ├── pycocoEvalDemo.ipynb │ │ ├── pycocotools │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ ├── _mask.pyx │ │ │ ├── coco.py │ │ │ ├── cocoeval.py │ │ │ └── mask.py │ │ └── setup.py │ ├── README.txt │ ├── __init__.py │ ├── common │ │ ├── gason.cpp │ │ ├── gason.h │ │ ├── maskApi.c │ │ └── maskApi.h │ └── license.txt ├── convert_AVDgt_to_COCOgt.py └── eval_by_object.py ├── model_defs ├── .gitignore ├── TDID.py ├── TDID_fast.py ├── __init__.py ├── anchors │ ├── .gitignore │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── bbox.pyx │ ├── bbox_transform.py │ ├── generate_anchors.py │ └── proposal_layer.py ├── make.sh ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── cpu_nms.pyx │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ ├── nms_wrapper.py │ └── py_cpu_nms.py └── setup.py ├── requirements.txt ├── test_tdid.py ├── test_tdid_det4class.py ├── train_tdid.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | Data/ 2 | *.pyc 3 | *.jpg 4 | *.swp 5 | 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 Phil Ammirato 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Target Driven Instance Detection 2 | 3 | This is an implementation of the technique described in [Target Driven Instance Detection](https://arxiv.org/abs/1803.04610). It is written in python for use with Pytorch. 4 | 5 | 6 | ## External Requirements 7 | * Python 2 (might work with Python 3) 8 | * [PyTorch](http://pytorch.org/) **(version 0.3.X) There are known errors with version 0.4** 9 | * [AVD Data](http://www.cs.unc.edu/~ammirato/active_vision_dataset_website/get_data.html) Parts 1, 2 and 3 10 | * [AVD processing code](https://github.com/ammirato/active_vision_dataset_processing) 11 | 12 | ## Installation 13 | These instructions will setup the code and data to run our experiments on the AVD dataset. 
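Before going through the steps below, it may help to know what a working setup looks like. The following sanity check is only a sketch (it is not part of this repository, and the paths are placeholders you must change); it checks the PyTorch version and that the data directories described in step 0 below are in place:

```python
# Sanity-check sketch (not part of the repo); adjust the placeholder paths.
import os
import torch
import torchvision

AVD_ROOT_DIR = '/path/to/AVD/'                # placeholder, see step 0
TARGET_IMAGE_DIR = '/path/to/target_images/'  # placeholder, see step 0

print(torch.__version__)          # expect 0.3.x; version 0.4 has known errors
print(torchvision.__version__)
print(torch.cuda.is_available())  # the nms/ code built in step 3 includes CUDA kernels, so a GPU is expected
assert os.path.isdir(AVD_ROOT_DIR), 'AVD data not found'
assert os.path.isdir(TARGET_IMAGE_DIR), 'target images not found'
# the configs expect the instance id map file inside AVD_ROOT_DIR (see ID_MAP_FNAME)
assert os.path.isfile(os.path.join(AVD_ROOT_DIR, 'all_instance_id_map.txt'))
```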
More instructions will be provided to run our other experiments or use your own data. 14 | 15 | 0. Dependencies and Data: 16 | 17 | - Make sure you have PyTorch (and torchvision) 18 | - Get the [AVD processing code](https://github.com/ammirato/active_vision_dataset_processing), and make sure it is included in your PYTHONPATH 19 | - Download the [AVD Data](http://www.cs.unc.edu/~ammirato/active_vision_dataset_website/get_data.html) into a path of your choosing, which we will refer to as `AVD_ROOT_DIR`. 20 | - Make sure to also get the [instance id map](https://drive.google.com/file/d/1UmhAr-l-CL3CeBq6U8V973jX5BPWkrlK/view?usp=sharing) and put it in the `AVD_ROOT_DIR` 21 | - Download the [target images](https://drive.google.com/file/d/1uV2I-SYWQvJb0PqzDdg8ESwRdQoVpSWr/view?usp=sharing) into a path of your choosing, which we will refer to as `TARGET_IMAGE_DIR`. 22 | 23 | 1. Get the code 24 | ``` 25 | git clone https://github.com/ammirato/target_driven_instance_detection.git 26 | ``` 27 | 28 | 2. Install the other requirements 29 | ``` 30 | cd target_driven_instance_detection/ 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | 3. Build the cython code for anchor boxes and non-max suppression 35 | ``` 36 | cd model_defs/ 37 | ./make.sh 38 | ``` 39 | 40 | 4. Build the coco evaluation cython code 41 | ``` 42 | cd ../evaluation/cocoapi/PythonAPI/ 43 | make all 44 | cd ../../../ 45 | ``` 46 | 47 | 5. Convert AVD annotations to COCO format yourself, or download the converted files 48 | 49 | **To download the files:** 50 | ``` 51 | mkdir Data 52 | cd Data 53 | ``` 54 | 55 | Download the tar [here](https://drive.google.com/file/d/1VgDBR5K1I-Tb6QVqyqVfGEXxcwKGHjQx/view?usp=sharing) 56 | 57 | `tar -xf tdid_gt_boxes.tar` 58 | 59 | **Or to convert yourself:** 60 | ``` 61 | cd evaluation/ 62 | #Update paths in `convert_AVDgt_to_COCOgt.py` with: 63 | #your AVD_ROOT_DIR 64 | #a path to save the annotations, which we will call VAL_GROUND_TRUTH_BOXES 65 | python convert_AVDgt_to_COCOgt.py 66 | 67 | #now update the scene_list in convert_AVDgt_to_COCOgt.py 68 | #to make the test set 69 | #change the path to save the annotations, which we will call TEST_GROUND_TRUTH_BOXES 70 | python convert_AVDgt_to_COCOgt.py 71 | 72 | ``` 73 | 74 | 75 | 6. Set paths in the `configs/configAVD2.py` file. See `configs/README.md` for details on config files. Make sure to update the config with your: 76 | 77 | - `AVD_ROOT_DIR` 78 | - `TARGET_IMAGE_DIR` 79 | - `VAL_GROUND_TRUTH_BOXES` 80 | - `TEST_GROUND_TRUTH_BOXES` 81 | 82 | 7. Start training! 83 | ``` 84 | #make sure you are in the root directory of the project, target_driven_instance_detection/ 85 | python train_tdid.py 86 | ``` 87 | 88 | 89 | ### Trained models 90 | [Here](https://drive.google.com/file/d/1tN2bFaNUp0hsiZIhmeD6oqXddFtdJoUW/view?usp=sharing) are models trained for each of the 3 splits of the AVD dataset. 91 | 92 | 93 | # Citation 94 | Please cite our paper if you find our work useful: 95 | ``` 96 | @article{ammiratoTDID18, 97 | title = {Target Driven Instance Detection}, 98 | author = {Ammirato, Phil and Fu, Cheng-Yang and Shvets, Mykhailo and Kosecka, Jana and Berg, Alexander C.}, 99 | journal = {arXiv:1803.04610}, 100 | year = {2018} 101 | } 102 | ``` 103 | 104 | 105 | 106 | # TODO 107 | ### Things to clean and add 108 | 1. Add data and configs for GMU to AVD experiment 109 | 2. Add data and configs for RGB-D Scenes one-shot classification experiment 110 | 3. Check det4class code 111 | 4. Clean eval by object 112 | 5. **Provide trained models** 113 | 6. 
make a note about downloading pretrained pytorch models 114 | 7. How to add your own data 115 | 116 | ### Improvements to system 117 | 1. How to choose target image, multiview targt image pooling thing 118 | 119 | ### Acknowledgements 120 | This code started as a modification of a Faster-RCNN Pytorch implementation [here](https://github.com/longcw/faster_rcnn_pytorch), and still uses some of that code. (In particular the nms code). 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/__init__.py -------------------------------------------------------------------------------- /configs/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /configs/README.md: -------------------------------------------------------------------------------- 1 | # Config Files 2 | Meant to be the only file changed when changing experiments. Defines all needed model training and testing parameters and paths. 3 | 4 | The parameters are defined in the following format: 5 | `name` - text definition. expected type/format 6 | 7 | 8 | 9 | 10 | * `ANCHOR_SCALES` - scale of anchor boxes to be used. [int,int,int] 11 | * `AUGMENT_TARGET_ILLUMINATION` - how often to change the illumination of target images. float [0,1] 12 | * `AUGMENT_TARGET_IMAGES` - how often to augment the target images. float [0,1] 13 | * `AVD_ROOT_DIR` - directory that holds all scene directories for the AVD. string 14 | * `BATCH_SIZE` - batch size for training. int 15 | * `CHOOSE_PRESENT_TARGET` - about how often the target object is in the scene image for training. float [0,1] 16 | * `CORR_WITH_POOLED` - whether or not to pool the target features to 1x1 before correlation. bool 17 | * `DATA_BASE_DIR` - optional, base directory that holds other directories. string 18 | * `DET4CLASS` - whether this is a classification experiment or not. bool 19 | * `DISPLAY_INTERVAL` - how often to print info during training. int 20 | * `EPS` - 21 | * `FEATURE_NET_NAME` - which architeture to use as the backbone network. string 22 | * `FRACTION_OF_NO_BOX_IMAGES` - fraction of images to include from training set that have no objects present. float [0,1] 23 | * `FULL_MODEL_LOAD_DIR` - where to load trained models from. string 24 | * `FULL_MODEL_LOAD_NAME` - name of saved model to load. string 25 | * `ID_MAP_FNAME` - name of file that has map from instance name to id. string 26 | * `ID_TO_NAME` 27 | * `LEARNING_RATE` - learning rate. float 28 | * `LOAD_FULL_MODEL` - whether or not to load a saved model. bool 29 | * `MAX_DETS_PER_TARGET` - maximum detections outputted for a single target/scene image pair. int 30 | * `MAX_NUM_EPOCHS` - maximum number of epochs for training. int 31 | * `MAX_OBJ_DIFFICULTY` - max object difficult as defined by AVD data loader. int 32 | * `META_SAVE_DIR` - where to save the meta (config) data used in training. str 33 | * `MIN_TARGET_SIZE` - minimum size of any dimension for a target images. int 34 | * `MODEL_BASE_SAVE_NAME` - name to use for saving model. string 35 | * `MOMENTUM` 36 | * `NAME_TO_ID` 37 | * `NMS_THRESH` - box score threshold for nms. float [0,1] 38 | * `NUM_TARGETS` - how many target images to use. 
int 39 | * `NUM_WORKERS` - how many workers to use when loading data. int 40 | * `OBJ_IDS_TO_EXCLUDE` - instances to not include as foreground during training. list of ints 41 | * `POST_NMS_TOP_N` - max number of anchor boxes to keep after nms. int 42 | * `PRELOAD_TARGET_IMAGES` - 43 | * `PRE_NMS_TOP_N` - max number of anchor boxes to keep before nms. int 44 | * `PROPOSAL_BATCH_SIZE` - max number of anchor boxes to use for the loss for one scene image. int 45 | * `PROPOSAL_BBOX_INSIDE_WEIGHTS` - 46 | * `PROPOSAL_CLOBBER_POSITIVES` - 47 | * `PROPOSAL_FG_FRACTION` - max fraction of proposals that can be foreground. float [0,1] 48 | * `PROPOSAL_MIN_BOX_SIZE` - minimum size of a proposal box after applying regression parameters. int 49 | * `PROPOSAL_NEGATIVE_OVERLAP` - max overlap of anchor box with gt target box s.t. anchor box can be given gt background label. float [0,1] 50 | * `PROPOSAL_POSITIVE_OVERLAP` - min overlap of anchor box with gt target box s.t. anchor box can be given gt foreground label. float [0,1] 51 | * `PROPOSAL_POSITIVE_WEIGHT` - 52 | * `PYTORCH_FEATURE_NET` - whether or not to use a pytorch implementation of the backbone feature extractor. bool 53 | * `RESIZE_IMG` - how often to resize scene images during training. float [0,1] 54 | * `RESIZE_IMG_FACTOR` - scaling factor to resize images during training. float 55 | * `SAVE_BY_EPOCH` - whether `SAVE_FREQ` refers to epochs (true) or steps (false). bool 56 | * `SAVE_FREQ` - how often to save the model during training. int 57 | * `SCORE_THRESH` - minimum score for outputting a box during inference. float [0,1] 58 | * `SNAPSHOT_SAVE_DIR` - where to save models during training. string 59 | * `TARGET_IMAGE_DIR` - where target images are stored. string 60 | * `TEST_FRACTION_OF_NO_BOX_IMAGES` - fraction of images to include from the testing set that have no objects present. float [0,1] 61 | * `TEST_GROUND_TRUTH_BOXES` - location of file that has annotations of the test set. string 62 | * `TEST_LIST` - list of scenes included in the test set. list of strings 63 | * `TEST_NMS_OVERLAP_THRESH` - 64 | * `TEST_OBJ_IDS` - object ids to include in the test set. list of ints 65 | * `TEST_ONE_AT_A_TIME` - whether to test one target/scene image pair at a time, or use the faster testing method. bool 66 | * `TEST_OUTPUT_DIR` - where to save results of testing. string 67 | * `TEST_RESIZE_BOXES_FACTOR` - scale to apply to each bounding box dimension, independent of `RESIZE_IMG_FACTOR` 68 | * `TEST_RESIZE_IMG_FACTOR` - scale for resizing images for testing. float 69 | * `TRAIN_LIST` - list of scenes included in the training set. list of strings 70 | * `TRAIN_OBJ_IDS` - object ids to include in the train set. list of ints 71 | * `USE_CC_FEATS` - whether to use the CC feats, or not. bool 72 | * `USE_DIFF_FEATS` - whether to use the DIFF feats, or not. bool 73 | * `USE_IMG_FEATS` - whether to use the IMG feats, or not. bool 74 | * `USE_PRETRAINED_WEIGHTS` - whether to use weights from a pytorch pretrained network for the backbone feature extractor, or not. bool 75 | * `VAL_FRACTION_OF_NO_BOX_IMAGES` - fraction of images to include from the validation set that have no objects present. float [0,1] 76 | * `VAL_GROUND_TRUTH_BOXES` - location of file that has annotations of the validation set. string 77 | * `VAL_LIST` - list of scenes included in the validation set. list of strings 78 | * `VAL_OBJ_IDS` - object ids to include in the validation set. 
list of ints 79 | * `WEIGHT_DECAY` - 80 | 81 | 82 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/configs/__init__.py -------------------------------------------------------------------------------- /configs/configAVD1.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import os 3 | 4 | class Config(): 5 | """ 6 | Holds all config parameters for training/testing. 7 | """ 8 | 9 | #Directories - MUST BE CHANGED for your environment 10 | DATA_BASE_DIR = '/net/bvisionserver3/playpen/ammirato/sandbox/code/target_driven_instance_detection/Data/' 11 | AVD_ROOT_DIR = '/net/bvisionserver3/playpen10/ammirato/Data/HalvedRohitData/' 12 | FULL_MODEL_LOAD_DIR= os.path.join(DATA_BASE_DIR, 'Models/') 13 | SNAPSHOT_SAVE_DIR= os.path.join(DATA_BASE_DIR , 'Models/') 14 | META_SAVE_DIR = os.path.join(DATA_BASE_DIR, 'ModelsMeta/') 15 | TARGET_IMAGE_DIR= os.path.join(DATA_BASE_DIR, 'AVD_and_BigBIRD_targets_v1/') 16 | TEST_OUTPUT_DIR = os.path.join(DATA_BASE_DIR, 'TestOutputs/') 17 | TEST_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR, 'GT/AVD_split1_test.json') 18 | VAL_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR ,'GT/AVD_part3_val.json') 19 | 20 | 21 | #Model Loading and saving 22 | FEATURE_NET_NAME= 'vgg16_bn' 23 | PYTORCH_FEATURE_NET= True 24 | USE_PRETRAINED_WEIGHTS = True 25 | FULL_MODEL_LOAD_NAME= 'TDID_AVD1_02_40_72201_0.90867_0.33934.h5' 26 | LOAD_FULL_MODEL= True 27 | MODEL_BASE_SAVE_NAME = 'TDID_AVD1_03' 28 | SAVE_FREQ = 5 29 | SAVE_BY_EPOCH = True 30 | 31 | 32 | #Training 33 | MAX_NUM_EPOCHS= 16 34 | BATCH_SIZE = 5 35 | LEARNING_RATE = .0001 36 | MOMENTUM = .9 37 | WEIGHT_DECAY = .0005 38 | DISPLAY_INTERVAL = 10 39 | NUM_WORKERS = 4 40 | RESIZE_IMG = 0 41 | RESIZE_IMG_FACTOR = .5 42 | CHOOSE_PRESENT_TARGET = .6 43 | DET4CLASS = False 44 | 45 | #Target Images 46 | PRELOAD_TARGET_IMAGES= False 47 | AUGMENT_TARGET_IMAGES= .9 48 | AUGMENT_TARGET_ILLUMINATION= .3 49 | MIN_TARGET_SIZE = 32 50 | 51 | #Training Data 52 | ID_MAP_FNAME= 'all_instance_id_map.txt' 53 | ID_TO_NAME = {} 54 | NAME_TO_ID = {} 55 | OBJ_IDS_TO_EXCLUDE = [8,18,32,33] 56 | 57 | TRAIN_OBJ_IDS=[cid for cid in range(1,33) if cid not in OBJ_IDS_TO_EXCLUDE] 58 | FRACTION_OF_NO_BOX_IMAGES = .1 59 | MAX_OBJ_DIFFICULTY= 4 60 | TRAIN_LIST= [ 61 | 'Home_002_1', 62 | 'Home_003_1', 63 | 'Home_003_2', 64 | 'Home_004_1', 65 | 'Home_004_2', 66 | 'Home_005_1', 67 | 'Home_005_2', 68 | 'Home_006_1', 69 | 'Home_014_1', 70 | 'Home_014_2', 71 | 'Office_001_1', 72 | ] 73 | 74 | VAL_OBJ_IDS = TRAIN_OBJ_IDS 75 | VAL_FRACTION_OF_NO_BOX_IMAGES = .01 76 | VAL_LIST= [ 77 | 'Home_007_1', 78 | 'Home_010_1', 79 | 'Home_011_1', 80 | ] 81 | 82 | ############################################## 83 | #Testing 84 | TEST_RESIZE_IMG_FACTOR = 0 85 | TEST_RESIZE_BOXES_FACTOR = 2 86 | MAX_DETS_PER_TARGET = 5 87 | SCORE_THRESH = .01 88 | TEST_NMS_OVERLAP_THRESH = .7 89 | 90 | TEST_OBJ_IDS= TRAIN_OBJ_IDS 91 | TEST_FRACTION_OF_NO_BOX_IMAGES = 1 92 | TEST_LIST = [ 93 | 'Home_001_1', 94 | 'Home_001_2', 95 | 'Home_008_1', 96 | ] 97 | TEST_ONE_AT_A_TIME = False 98 | ############################################### 99 | #Model paramters 100 | ANCHOR_SCALES = [1,2,4] 101 | NUM_TARGETS = 2 102 | CORR_WITH_POOLED = True 103 | USE_IMG_FEATS = False 104 | USE_DIFF_FEATS = True 
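# Explanatory note (added comment, not in the original config): USE_IMG_FEATS,
# USE_DIFF_FEATS, and USE_CC_FEATS select which of the IMG, DIFF, and CC feature
# types described in configs/README.md the model uses; this config enables the
# difference (DIFF) and cross-correlation (CC) features and disables the image
# (IMG) features.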
105 | USE_CC_FEATS = True 106 | 107 | PRE_NMS_TOP_N = 6000 108 | POST_NMS_TOP_N = 300 109 | NMS_THRESH = .7 110 | PROPOSAL_MIN_BOX_SIZE = 8 111 | PROPOSAL_CLOBBER_POSITIVES = False 112 | PROPOSAL_NEGATIVE_OVERLAP = .3 113 | PROPOSAL_POSITIVE_OVERLAP = .6 114 | PROPOSAL_FG_FRACTION = .5 115 | PROPOSAL_BATCH_SIZE = 300 116 | PROPOSAL_POSITIVE_WEIGHT = -1 117 | PROPOSAL_BBOX_INSIDE_WEIGHTS = [1,1,1,1] 118 | 119 | EPS = 1e-14 120 | 121 | 122 | 123 | def get_config(): 124 | 125 | cfg = Config() 126 | cfg.ID_TO_NAME = get_class_id_to_name_dict(cfg.AVD_ROOT_DIR, 127 | cfg.ID_MAP_FNAME) 128 | name_to_id = {} 129 | for cid in cfg.ID_TO_NAME.keys(): 130 | name_to_id[cfg.ID_TO_NAME[cid]] = cid 131 | cfg.NAME_TO_ID = name_to_id 132 | 133 | return cfg 134 | -------------------------------------------------------------------------------- /configs/configAVD2.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import os 3 | 4 | class Config(): 5 | """ 6 | Holds all config parameters for training/testing. 7 | """ 8 | 9 | #Directories - MUST BE CHANGED for your environment 10 | DATA_BASE_DIR = '/net/bvisionserver3/playpen/ammirato/sandbox/code/target_driven_instance_detection/Data/' 11 | AVD_ROOT_DIR = '/net/bvisionserver3/playpen10/ammirato/Data/HalvedRohitData/' 12 | FULL_MODEL_LOAD_DIR= os.path.join(DATA_BASE_DIR, 'Models/') 13 | SNAPSHOT_SAVE_DIR= os.path.join(DATA_BASE_DIR , 'Models/') 14 | META_SAVE_DIR = os.path.join(DATA_BASE_DIR, 'ModelsMeta/') 15 | TARGET_IMAGE_DIR= os.path.join(DATA_BASE_DIR, 'AVD_and_BigBIRD_targets_v1/') 16 | TEST_OUTPUT_DIR = os.path.join(DATA_BASE_DIR, 'TestOutputs/') 17 | TEST_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR, 'GT/AVD_split2_test.json') 18 | VAL_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR ,'GT/AVD_part3_val.json') 19 | 20 | 21 | #Model Loading and saving 22 | FEATURE_NET_NAME= 'vgg16_bn' 23 | PYTORCH_FEATURE_NET= True 24 | USE_PRETRAINED_WEIGHTS = True 25 | FULL_MODEL_LOAD_NAME= 'TDID_AVD2_03_15_26806_0.36337_0.35057.h5' 26 | LOAD_FULL_MODEL= True 27 | MODEL_BASE_SAVE_NAME = 'TDID_AVD2_04' 28 | SAVE_FREQ = 15 29 | SAVE_BY_EPOCH = True 30 | 31 | 32 | #Training 33 | MAX_NUM_EPOCHS= 16 34 | BATCH_SIZE = 5 35 | LEARNING_RATE = .0001 36 | MOMENTUM = .9 37 | WEIGHT_DECAY = .0005 38 | DISPLAY_INTERVAL = 10 39 | NUM_WORKERS = 4 40 | RESIZE_IMG = 0 41 | RESIZE_IMG_FACTOR = .5 42 | CHOOSE_PRESENT_TARGET = .6 43 | DET4CLASS = False 44 | 45 | #Target Images 46 | PRELOAD_TARGET_IMAGES= False 47 | AUGMENT_TARGET_IMAGES= .9 48 | AUGMENT_TARGET_ILLUMINATION= .3 49 | MIN_TARGET_SIZE = 32 50 | 51 | #Training Data 52 | ID_MAP_FNAME= 'all_instance_id_map.txt' 53 | ID_TO_NAME = {} 54 | NAME_TO_ID = {} 55 | OBJ_IDS_TO_EXCLUDE = [8,18,32,33] 56 | 57 | TRAIN_OBJ_IDS=[cid for cid in range(1,33) if cid not in OBJ_IDS_TO_EXCLUDE] 58 | FRACTION_OF_NO_BOX_IMAGES = .1 59 | MAX_OBJ_DIFFICULTY= 4 60 | TRAIN_LIST= [ 61 | 'Home_001_1', 62 | 'Home_001_2', 63 | 'Home_002_1', 64 | 'Home_004_1', 65 | 'Home_004_2', 66 | 'Home_005_1', 67 | 'Home_005_2', 68 | 'Home_006_1', 69 | 'Home_008_1', 70 | 'Home_014_1', 71 | 'Home_014_2', 72 | ] 73 | 74 | VAL_OBJ_IDS = TRAIN_OBJ_IDS 75 | VAL_FRACTION_OF_NO_BOX_IMAGES = .01 76 | VAL_LIST= [ 77 | 'Home_007_1', 78 | 'Home_010_1', 79 | 'Home_011_1', 80 | ] 81 | 82 | ############################################## 83 | #Testing 84 | TEST_RESIZE_IMG_FACTOR = 0 85 | TEST_RESIZE_BOXES_FACTOR = 2 86 | MAX_DETS_PER_TARGET = 5 87 | SCORE_THRESH = .01 88 | TEST_NMS_OVERLAP_THRESH = .7 89 | 90 | TEST_OBJ_IDS= 
TRAIN_OBJ_IDS 91 | TEST_FRACTION_OF_NO_BOX_IMAGES = 1 92 | TEST_LIST = [ 93 | 'Home_003_1', 94 | 'Home_003_2', 95 | 'Office_001_1', 96 | ] 97 | TEST_ONE_AT_A_TIME = False 98 | ############################################### 99 | #Model paramters 100 | ANCHOR_SCALES = [1,2,4] 101 | NUM_TARGETS = 2 102 | CORR_WITH_POOLED = True 103 | USE_IMG_FEATS = False 104 | USE_DIFF_FEATS = True 105 | USE_CC_FEATS = True 106 | 107 | PRE_NMS_TOP_N = 6000 108 | POST_NMS_TOP_N = 300 109 | NMS_THRESH = .7 110 | PROPOSAL_MIN_BOX_SIZE = 8 111 | PROPOSAL_CLOBBER_POSITIVES = False 112 | PROPOSAL_NEGATIVE_OVERLAP = .3 113 | PROPOSAL_POSITIVE_OVERLAP = .6 114 | PROPOSAL_FG_FRACTION = .5 115 | PROPOSAL_BATCH_SIZE = 300 116 | PROPOSAL_POSITIVE_WEIGHT = -1 117 | PROPOSAL_BBOX_INSIDE_WEIGHTS = [1,1,1,1] 118 | 119 | EPS = 1e-14 120 | 121 | 122 | 123 | def get_config(): 124 | 125 | cfg = Config() 126 | cfg.ID_TO_NAME = get_class_id_to_name_dict(cfg.AVD_ROOT_DIR, 127 | cfg.ID_MAP_FNAME) 128 | name_to_id = {} 129 | for cid in cfg.ID_TO_NAME.keys(): 130 | name_to_id[cfg.ID_TO_NAME[cid]] = cid 131 | cfg.NAME_TO_ID = name_to_id 132 | 133 | return cfg 134 | -------------------------------------------------------------------------------- /configs/configAVD3.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import os 3 | 4 | class Config(): 5 | """ 6 | Holds all config parameters for training/testing. 7 | """ 8 | 9 | #Directories - MUST BE CHANGED for your environment 10 | DATA_BASE_DIR = '/net/bvisionserver3/playpen/ammirato/sandbox/code/target_driven_instance_detection/Data/' 11 | AVD_ROOT_DIR = '/net/bvisionserver3/playpen10/ammirato/Data/HalvedRohitData/' 12 | FULL_MODEL_LOAD_DIR= os.path.join(DATA_BASE_DIR, 'Models/') 13 | SNAPSHOT_SAVE_DIR= os.path.join(DATA_BASE_DIR , 'Models/') 14 | META_SAVE_DIR = os.path.join(DATA_BASE_DIR, 'ModelsMeta/') 15 | TARGET_IMAGE_DIR= os.path.join(DATA_BASE_DIR, 'AVD_and_BigBIRD_targets_v1/') 16 | TEST_OUTPUT_DIR = os.path.join(DATA_BASE_DIR, 'TestOutputs/') 17 | TEST_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR, 'GT/AVD_split3_test.json') 18 | VAL_GROUND_TRUTH_BOXES = os.path.join(DATA_BASE_DIR ,'GT/AVD_part3_val.json') 19 | 20 | 21 | #Model Loading and saving 22 | FEATURE_NET_NAME= 'vgg16_bn' 23 | PYTORCH_FEATURE_NET= True 24 | USE_PRETRAINED_WEIGHTS = True 25 | FULL_MODEL_LOAD_NAME= 'TDID_AVD3_01_40_79081_0.95197_0.32499.h5' 26 | LOAD_FULL_MODEL= True 27 | MODEL_BASE_SAVE_NAME = 'TDID_AVD3_02' 28 | SAVE_FREQ = 5 29 | SAVE_BY_EPOCH = True 30 | 31 | 32 | #Training 33 | MAX_NUM_EPOCHS= 16 34 | BATCH_SIZE = 5 35 | LEARNING_RATE = .0001 36 | MOMENTUM = .9 37 | WEIGHT_DECAY = .0005 38 | DISPLAY_INTERVAL = 10 39 | NUM_WORKERS = 4 40 | RESIZE_IMG = 0 41 | RESIZE_IMG_FACTOR = .5 42 | CHOOSE_PRESENT_TARGET = .6 43 | DET4CLASS = False 44 | 45 | #Target Images 46 | PRELOAD_TARGET_IMAGES= False 47 | AUGMENT_TARGET_IMAGES= .9 48 | AUGMENT_TARGET_ILLUMINATION= .3 49 | MIN_TARGET_SIZE = 32 50 | 51 | #Training Data 52 | ID_MAP_FNAME= 'all_instance_id_map.txt' 53 | ID_TO_NAME = {} 54 | NAME_TO_ID = {} 55 | OBJ_IDS_TO_EXCLUDE = [8,18,32,33] 56 | 57 | TRAIN_OBJ_IDS=[cid for cid in range(1,33) if cid not in OBJ_IDS_TO_EXCLUDE] 58 | FRACTION_OF_NO_BOX_IMAGES = .1 59 | MAX_OBJ_DIFFICULTY= 4 60 | TRAIN_LIST= [ 61 | 'Home_001_1', 62 | 'Home_001_2', 63 | 'Home_003_1', 64 | 'Home_003_2', 65 | 'Home_004_1', 66 | 'Home_004_2', 67 | 'Home_005_1', 68 | 'Home_005_2', 69 | 'Home_006_1', 70 | 'Home_008_1', 71 | 'Office_001_1', 72 | ] 73 | 74 | VAL_OBJ_IDS = 
TRAIN_OBJ_IDS 75 | VAL_FRACTION_OF_NO_BOX_IMAGES = .01 76 | VAL_LIST= [ 77 | 'Home_007_1', 78 | 'Home_010_1', 79 | 'Home_011_1', 80 | ] 81 | 82 | ############################################## 83 | #Testing 84 | TEST_RESIZE_IMG_FACTOR = 0 85 | TEST_RESIZE_BOXES_FACTOR = 2 86 | MAX_DETS_PER_TARGET = 5 87 | SCORE_THRESH = .01 88 | TEST_NMS_OVERLAP_THRESH = .7 89 | 90 | TEST_OBJ_IDS= TRAIN_OBJ_IDS 91 | TEST_FRACTION_OF_NO_BOX_IMAGES = 1 92 | TEST_LIST = [ 93 | 'Home_014_1', 94 | 'Home_014_2', 95 | 'Home_002_1', 96 | ] 97 | TEST_ONE_AT_A_TIME = False 98 | ############################################### 99 | #Model paramters 100 | ANCHOR_SCALES = [1,2,4] 101 | NUM_TARGETS = 2 102 | CORR_WITH_POOLED = True 103 | USE_IMG_FEATS = False 104 | USE_DIFF_FEATS = True 105 | USE_CC_FEATS = True 106 | 107 | PRE_NMS_TOP_N = 6000 108 | POST_NMS_TOP_N = 300 109 | NMS_THRESH = .7 110 | PROPOSAL_MIN_BOX_SIZE = 8 111 | PROPOSAL_CLOBBER_POSITIVES = False 112 | PROPOSAL_NEGATIVE_OVERLAP = .3 113 | PROPOSAL_POSITIVE_OVERLAP = .6 114 | PROPOSAL_FG_FRACTION = .5 115 | PROPOSAL_BATCH_SIZE = 300 116 | PROPOSAL_POSITIVE_WEIGHT = -1 117 | PROPOSAL_BBOX_INSIDE_WEIGHTS = [1,1,1,1] 118 | 119 | EPS = 1e-14 120 | 121 | 122 | 123 | def get_config(): 124 | 125 | cfg = Config() 126 | cfg.ID_TO_NAME = get_class_id_to_name_dict(cfg.AVD_ROOT_DIR, 127 | cfg.ID_MAP_FNAME) 128 | name_to_id = {} 129 | for cid in cfg.ID_TO_NAME.keys(): 130 | name_to_id[cfg.ID_TO_NAME[cid]] = cid 131 | cfg.NAME_TO_ID = name_to_id 132 | 133 | return cfg 134 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/evaluation/__init__.py -------------------------------------------------------------------------------- /evaluation/coco_det_eval.py: -------------------------------------------------------------------------------- 1 | from .cocoapi.PythonAPI.pycocotools.coco import COCO 2 | from .cocoapi.PythonAPI.pycocotools.cocoeval import COCOeval 3 | import numpy as np 4 | 5 | 6 | def coco_det_eval(gt_path, det_path, catIds, 7 | iouThrs=.5, 8 | maxDets=[1,10,100]): 9 | ''' 10 | Performs coco detection mAP evaluation 11 | 12 | Example: 13 | coco_det_eval('/path/to/ground_truth.json','/path/to/detection.json') 14 | 15 | Input parameters: 16 | gt_path: (str) path to ground truth bounding box json file 17 | det_path: (str) path to detection output json file 18 | catIds: (list of int) class ids to evaluate 19 | 20 | iouThrs: (int) iou threshold for a correct detection Default: .5 21 | maxDets (optional): (list of int) Default: [1,10,100] 22 | 23 | Returns: 24 | (float) m_ap result 25 | ''' 26 | 27 | #initialize COCO ground truth api 28 | cocoGt=COCO(gt_path) 29 | #initialize COCO detections api 30 | cocoDt=cocoGt.loadRes(det_path) 31 | 32 | # setup parameters 33 | annType = 'bbox' 34 | cocoEval = COCOeval(cocoGt,cocoDt,annType) 35 | cocoEval.params.iouThrs = np.array([iouThrs]) 36 | cocoEval.params.maxDets = maxDets 37 | cocoEval.params.catIds = catIds 38 | #Areas as defined by AVD dataset 39 | cocoEval.params.areaRng = [[0, 10000000000.0], [416, 10000000000.0 ], [0, 416], [416, 1250], [1250, 3750], [3750, 7500], [7500,10000000000.0]] 40 | cocoEval.params.areaRngLbl = ['all', 'valid', 'l0', 'l1', 'l2', 'l3', 'l4'] 41 | cocoEval.params.useSegs = [0] 42 | 43 | #run evaluation 44 | cocoEval.evaluate() 45 | 
cocoEval.accumulate() 46 | cocoEval.summarize() 47 | 48 | return cocoEval.stats[1] 49 | 50 | 51 | -------------------------------------------------------------------------------- /evaluation/cocoapi/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools locally 3 | python setup.py build_ext --inplace 4 | rm -rf build 5 | 6 | install: 7 | # install pycocotools to the Python site-packages 8 | python setup.py build_ext install 9 | rm -rf build -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/evaluation/cocoapi/PythonAPI/__init__.py -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocoEvalDemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "from pycocotools.coco import COCO\n", 14 | "from pycocotools.cocoeval import COCOeval\n", 15 | "import numpy as np\n", 16 | "import skimage.io as io\n", 17 | "import pylab\n", 18 | "pylab.rcParams['figure.figsize'] = (10.0, 8.0)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "Running demo for *bbox* results.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "annType = ['segm','bbox','keypoints']\n", 38 | "annType = annType[1] #specify type here\n", 39 | "prefix = 'person_keypoints' if annType=='keypoints' else 'instances'\n", 40 | "print 'Running demo for *%s* results.'%(annType)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "loading annotations into memory...\n", 55 | "Done (t=8.01s)\n", 56 | "creating index...\n", 57 | "index created!\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "#initialize COCO ground truth api\n", 63 | "dataDir='../'\n", 64 | "dataType='val2014'\n", 65 | "annFile = '%s/annotations/%s_%s.json'%(dataDir,prefix,dataType)\n", 66 | "cocoGt=COCO(annFile)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "Loading and preparing results... 
\n", 81 | "DONE (t=0.05s)\n", 82 | "creating index...\n", 83 | "index created!\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "#initialize COCO detections api\n", 89 | "resFile='%s/results/%s_%s_fake%s100_results.json'\n", 90 | "resFile = resFile%(dataDir, prefix, dataType, annType)\n", 91 | "cocoDt=cocoGt.loadRes(resFile)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "imgIds=sorted(cocoGt.getImgIds())\n", 103 | "imgIds=imgIds[0:100]\n", 104 | "imgId = imgIds[np.random.randint(100)]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 6, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Running per image evaluation... \n", 119 | "DONE (t=0.46s).\n", 120 | "Accumulating evaluation results... \n", 121 | "DONE (t=0.38s).\n", 122 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.505\n", 123 | " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.697\n", 124 | " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.573\n", 125 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.586\n", 126 | " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.519\n", 127 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501\n", 128 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.387\n", 129 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.594\n", 130 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.595\n", 131 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.640\n", 132 | " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.566\n", 133 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.564\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# running evaluation\n", 139 | "cocoEval = COCOeval(cocoGt,cocoDt,annType)\n", 140 | "cocoEval.params.imgIds = imgIds\n", 141 | "cocoEval.evaluate()\n", 142 | "cocoEval.accumulate()\n", 143 | "cocoEval.summarize()" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 2", 150 | "language": "python", 151 | "name": "python2" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 2 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython2", 163 | "version": "2.7.10" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 0 168 | } 169 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocotools/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.so 3 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocotools/_mask.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language 
= c 2 | # distutils: sources = ../common/maskApi.c 3 | 4 | #************************************************************************** 5 | # Microsoft COCO Toolbox. version 2.0 6 | # Data, paper, and tutorials available at: http://mscoco.org/ 7 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 8 | # Licensed under the Simplified BSD License [see coco/license.txt] 9 | #************************************************************************** 10 | 11 | __author__ = 'tsungyi' 12 | 13 | import sys 14 | PYTHON_VERSION = sys.version_info[0] 15 | 16 | # import both Python-level and C-level symbols of Numpy 17 | # the API uses Numpy to interface C and Python 18 | import numpy as np 19 | cimport numpy as np 20 | from libc.stdlib cimport malloc, free 21 | 22 | # intialized Numpy. must do. 23 | np.import_array() 24 | 25 | # import numpy C function 26 | # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management 27 | cdef extern from "numpy/arrayobject.h": 28 | void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) 29 | 30 | # Declare the prototype of the C functions in MaskApi.h 31 | cdef extern from "maskApi.h": 32 | ctypedef unsigned int uint 33 | ctypedef unsigned long siz 34 | ctypedef unsigned char byte 35 | ctypedef double* BB 36 | ctypedef struct RLE: 37 | siz h, 38 | siz w, 39 | siz m, 40 | uint* cnts, 41 | void rlesInit( RLE **R, siz n ) 42 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) 43 | void rleDecode( const RLE *R, byte *mask, siz n ) 44 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) 45 | void rleArea( const RLE *R, siz n, uint *a ) 46 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) 47 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) 48 | void rleToBbox( const RLE *R, BB bb, siz n ) 49 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) 50 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) 51 | char* rleToString( const RLE *R ) 52 | void rleFrString( RLE *R, char *s, siz h, siz w ) 53 | 54 | # python class to wrap RLE array in C 55 | # the class handles the memory allocation and deallocation 56 | cdef class RLEs: 57 | cdef RLE *_R 58 | cdef siz _n 59 | 60 | def __cinit__(self, siz n =0): 61 | rlesInit(&self._R, n) 62 | self._n = n 63 | 64 | # free the RLE array here 65 | def __dealloc__(self): 66 | if self._R is not NULL: 67 | for i in range(self._n): 68 | free(self._R[i].cnts) 69 | free(self._R) 70 | def __getattr__(self, key): 71 | if key == 'n': 72 | return self._n 73 | raise AttributeError(key) 74 | 75 | # python class to wrap Mask array in C 76 | # the class handles the memory allocation and deallocation 77 | cdef class Masks: 78 | cdef byte *_mask 79 | cdef siz _h 80 | cdef siz _w 81 | cdef siz _n 82 | 83 | def __cinit__(self, h, w, n): 84 | self._mask = malloc(h*w*n* sizeof(byte)) 85 | self._h = h 86 | self._w = w 87 | self._n = n 88 | # def __dealloc__(self): 89 | # the memory management of _mask has been passed to np.ndarray 90 | # it doesn't need to be freed here 91 | 92 | # called when passing into np.array() and return an np.ndarray in column-major order 93 | def __array__(self): 94 | cdef np.npy_intp shape[1] 95 | shape[0] = self._h*self._w*self._n 96 | # Create a 1D array, and reshape it to fortran/Matlab column-major array 97 | ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') 98 | # The _mask allocated by Masks is now handled by ndarray 99 | PyArray_ENABLEFLAGS(ndarray, 
np.NPY_OWNDATA) 100 | return ndarray 101 | 102 | # internal conversion from Python RLEs object to compressed RLE format 103 | def _toString(RLEs Rs): 104 | cdef siz n = Rs.n 105 | cdef bytes py_string 106 | cdef char* c_string 107 | objs = [] 108 | for i in range(n): 109 | c_string = rleToString( &Rs._R[i] ) 110 | py_string = c_string 111 | objs.append({ 112 | 'size': [Rs._R[i].h, Rs._R[i].w], 113 | 'counts': py_string 114 | }) 115 | free(c_string) 116 | return objs 117 | 118 | # internal conversion from compressed RLE format to Python RLEs object 119 | def _frString(rleObjs): 120 | cdef siz n = len(rleObjs) 121 | Rs = RLEs(n) 122 | cdef bytes py_string 123 | cdef char* c_string 124 | for i, obj in enumerate(rleObjs): 125 | if PYTHON_VERSION == 2: 126 | py_string = str(obj['counts']).encode('utf8') 127 | elif PYTHON_VERSION == 3: 128 | py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] 129 | else: 130 | raise Exception('Python version must be 2 or 3') 131 | c_string = py_string 132 | rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) 133 | return Rs 134 | 135 | # encode mask to RLEs objects 136 | # list of RLE string can be generated by RLEs member function 137 | def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): 138 | h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] 139 | cdef RLEs Rs = RLEs(n) 140 | rleEncode(Rs._R,mask.data,h,w,n) 141 | objs = _toString(Rs) 142 | return objs 143 | 144 | # decode mask from compressed list of RLE string or RLEs object 145 | def decode(rleObjs): 146 | cdef RLEs Rs = _frString(rleObjs) 147 | h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n 148 | masks = Masks(h, w, n) 149 | rleDecode(Rs._R, masks._mask, n); 150 | return np.array(masks) 151 | 152 | def merge(rleObjs, intersect=0): 153 | cdef RLEs Rs = _frString(rleObjs) 154 | cdef RLEs R = RLEs(1) 155 | rleMerge(Rs._R, R._R, Rs._n, intersect) 156 | obj = _toString(R)[0] 157 | return obj 158 | 159 | def area(rleObjs): 160 | cdef RLEs Rs = _frString(rleObjs) 161 | cdef uint* _a = malloc(Rs._n* sizeof(uint)) 162 | rleArea(Rs._R, Rs._n, _a) 163 | cdef np.npy_intp shape[1] 164 | shape[0] = Rs._n 165 | a = np.array((Rs._n, ), dtype=np.uint8) 166 | a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) 167 | PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) 168 | return a 169 | 170 | # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 171 | def iou( dt, gt, pyiscrowd ): 172 | def _preproc(objs): 173 | if len(objs) == 0: 174 | return objs 175 | if type(objs) == np.ndarray: 176 | if len(objs.shape) == 1: 177 | objs = objs.reshape((objs[0], 1)) 178 | # check if it's Nx4 bbox 179 | if not len(objs.shape) == 2 or not objs.shape[1] == 4: 180 | raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') 181 | objs = objs.astype(np.double) 182 | elif type(objs) == list: 183 | # check if list is in box format and convert it to np.ndarray 184 | isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) 185 | isrle = np.all(np.array([type(obj) == dict for obj in objs])) 186 | if isbox: 187 | objs = np.array(objs, dtype=np.double) 188 | if len(objs.shape) == 1: 189 | objs = objs.reshape((1,objs.shape[0])) 190 | elif isrle: 191 | objs = _frString(objs) 192 | else: 193 | raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') 194 | else: 195 | raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') 196 | return objs 197 | def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 198 | rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) 199 | def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 200 | bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) 201 | def _len(obj): 202 | cdef siz N = 0 203 | if type(obj) == RLEs: 204 | N = obj.n 205 | elif len(obj)==0: 206 | pass 207 | elif type(obj) == np.ndarray: 208 | N = obj.shape[0] 209 | return N 210 | # convert iscrowd to numpy array 211 | cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) 212 | # simple type checking 213 | cdef siz m, n 214 | dt = _preproc(dt) 215 | gt = _preproc(gt) 216 | m = _len(dt) 217 | n = _len(gt) 218 | if m == 0 or n == 0: 219 | return [] 220 | if not type(dt) == type(gt): 221 | raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') 222 | 223 | # define local variables 224 | cdef double* _iou = 0 225 | cdef np.npy_intp shape[1] 226 | # check type and assign iou function 227 | if type(dt) == RLEs: 228 | _iouFun = _rleIou 229 | elif type(dt) == np.ndarray: 230 | _iouFun = _bbIou 231 | else: 232 | raise Exception('input data type not allowed.') 233 | _iou = malloc(m*n* sizeof(double)) 234 | iou = np.zeros((m*n, ), dtype=np.double) 235 | shape[0] = m*n 236 | iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) 237 | PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) 238 | _iouFun(dt, gt, iscrowd, m, n, iou) 239 | return iou.reshape((m,n), order='F') 240 | 241 | def toBbox( rleObjs ): 242 | cdef RLEs Rs = _frString(rleObjs) 243 | cdef siz n = Rs.n 244 | cdef BB _bb = malloc(4*n* sizeof(double)) 245 | rleToBbox( Rs._R, _bb, n ) 246 | cdef np.npy_intp shape[1] 247 | shape[0] = 4*n 248 | bb = np.array((1,4*n), dtype=np.double) 249 | bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) 250 | PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) 251 | return bb 252 | 253 | def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): 254 | cdef siz n = bb.shape[0] 255 | Rs = RLEs(n) 256 | rleFrBbox( Rs._R, bb.data, h, w, n ) 257 | objs = _toString(Rs) 258 | return objs 259 | 260 | def frPoly( poly, siz h, siz w ): 261 | cdef np.ndarray[np.double_t, ndim=1] np_poly 262 | n = len(poly) 263 | Rs = RLEs(n) 264 | for i, p in enumerate(poly): 265 | np_poly = np.array(p, dtype=np.double, order='F') 266 | rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) 267 | objs = _toString(Rs) 268 | return objs 269 | 270 | def frUncompressedRLE(ucRles, siz h, siz w): 271 | cdef np.ndarray[np.uint32_t, ndim=1] cnts 272 | cdef RLE R 273 | cdef uint *data 274 | n = len(ucRles) 275 | objs = [] 276 | for i in range(n): 277 | Rs = RLEs(1) 278 | cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) 279 | # time for malloc can be saved here but it's fine 280 | data = malloc(len(cnts)* sizeof(uint)) 281 | for j in range(len(cnts)): 282 | data[j] = cnts[j] 283 | R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) 284 | Rs._R[0] = R 285 | objs.append(_toString(Rs)[0]) 286 | return objs 287 | 288 | def frPyObjects(pyobj, h, w): 289 | # encode rle from a list of python objects 290 | if type(pyobj) == np.ndarray: 291 | objs = frBbox(pyobj, h, w) 292 | elif type(pyobj) == 
list and len(pyobj[0]) == 4: 293 | objs = frBbox(pyobj, h, w) 294 | elif type(pyobj) == list and len(pyobj[0]) > 4: 295 | objs = frPoly(pyobj, h, w) 296 | elif type(pyobj) == list and type(pyobj[0]) == dict \ 297 | and 'counts' in pyobj[0] and 'size' in pyobj[0]: 298 | objs = frUncompressedRLE(pyobj, h, w) 299 | # encode rle from single python object 300 | elif type(pyobj) == list and len(pyobj) == 4: 301 | objs = frBbox([pyobj], h, w)[0] 302 | elif type(pyobj) == list and len(pyobj) > 4: 303 | objs = frPoly([pyobj], h, w)[0] 304 | elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: 305 | objs = frUncompressedRLE([pyobj], h, w)[0] 306 | else: 307 | raise Exception('input type is not supported.') 308 | return objs 309 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocotools/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '2.0' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # annToMask - Convert segmentation in an annotation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load algorithm results and create API for accessing them. 33 | # download - Download COCO images from mscoco.org server. 34 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 35 | # Help on each functions can be accessed by: "help COCO>function". 36 | 37 | # See also COCO>decodeMask, 38 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 39 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 40 | # COCO>loadImgs, COCO>annToMask, COCO>showAnns 41 | 42 | # Microsoft COCO Toolbox. version 2.0 43 | # Data, paper, and tutorials available at: http://mscoco.org/ 44 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 
45 | # Licensed under the Simplified BSD License [see bsd.txt] 46 | 47 | import json 48 | import time 49 | #import matplotlib.pyplot as plt 50 | #from matplotlib.collections import PatchCollection 51 | #from matplotlib.patches import Polygon 52 | import numpy as np 53 | import copy 54 | import itertools 55 | from . import mask as maskUtils 56 | import os 57 | from collections import defaultdict 58 | import sys 59 | PYTHON_VERSION = sys.version_info[0] 60 | if PYTHON_VERSION == 2: 61 | from urllib import urlretrieve 62 | elif PYTHON_VERSION == 3: 63 | from urllib.request import urlretrieve 64 | 65 | 66 | def _isArrayLike(obj): 67 | return hasattr(obj, '__iter__') and hasattr(obj, '__len__') 68 | 69 | 70 | class COCO: 71 | def __init__(self, annotation_file=None): 72 | """ 73 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 74 | :param annotation_file (str): location of annotation file 75 | :param image_folder (str): location to the folder that hosts images. 76 | :return: 77 | """ 78 | # load dataset 79 | self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() 80 | self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) 81 | if not annotation_file == None: 82 | print('loading annotations into memory...') 83 | tic = time.time() 84 | dataset = json.load(open(annotation_file, 'r')) 85 | assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) 86 | print('Done (t={:0.2f}s)'.format(time.time()- tic)) 87 | self.dataset = dataset 88 | self.createIndex() 89 | 90 | def createIndex(self): 91 | # create index 92 | print('creating index...') 93 | anns, cats, imgs = {}, {}, {} 94 | imgToAnns,catToImgs = defaultdict(list),defaultdict(list) 95 | if 'annotations' in self.dataset: 96 | for ann in self.dataset['annotations']: 97 | imgToAnns[ann['image_id']].append(ann) 98 | anns[ann['id']] = ann 99 | 100 | if 'images' in self.dataset: 101 | for img in self.dataset['images']: 102 | imgs[img['id']] = img 103 | 104 | if 'categories' in self.dataset: 105 | for cat in self.dataset['categories']: 106 | cats[cat['id']] = cat 107 | 108 | if 'annotations' in self.dataset and 'categories' in self.dataset: 109 | for ann in self.dataset['annotations']: 110 | catToImgs[ann['category_id']].append(ann['image_id']) 111 | 112 | print('index created!') 113 | 114 | # create class members 115 | self.anns = anns 116 | self.imgToAnns = imgToAnns 117 | self.catToImgs = catToImgs 118 | self.imgs = imgs 119 | self.cats = cats 120 | 121 | def info(self): 122 | """ 123 | Print information about the annotation file. 124 | :return: 125 | """ 126 | for key, value in self.dataset['info'].items(): 127 | print('{}: {}'.format(key, value)) 128 | 129 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 130 | """ 131 | Get ann ids that satisfy given filter conditions. default skips that filter 132 | :param imgIds (int array) : get anns for given imgs 133 | catIds (int array) : get anns for given cats 134 | areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) 135 | iscrowd (boolean) : get anns for given crowd label (False or True) 136 | :return: ids (int array) : integer array of ann ids 137 | """ 138 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 139 | catIds = catIds if _isArrayLike(catIds) else [catIds] 140 | 141 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 142 | anns = self.dataset['annotations'] 143 | else: 144 | if not len(imgIds) == 0: 145 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 146 | anns = list(itertools.chain.from_iterable(lists)) 147 | else: 148 | anns = self.dataset['annotations'] 149 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 150 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 151 | if not iscrowd == None: 152 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 153 | else: 154 | ids = [ann['id'] for ann in anns] 155 | return ids 156 | 157 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 158 | """ 159 | filtering parameters. default skips that filter. 160 | :param catNms (str array) : get cats for given cat names 161 | :param supNms (str array) : get cats for given supercategory names 162 | :param catIds (int array) : get cats for given cat ids 163 | :return: ids (int array) : integer array of cat ids 164 | """ 165 | catNms = catNms if _isArrayLike(catNms) else [catNms] 166 | supNms = supNms if _isArrayLike(supNms) else [supNms] 167 | catIds = catIds if _isArrayLike(catIds) else [catIds] 168 | 169 | if len(catNms) == len(supNms) == len(catIds) == 0: 170 | cats = self.dataset['categories'] 171 | else: 172 | cats = self.dataset['categories'] 173 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 174 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 175 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 176 | ids = [cat['id'] for cat in cats] 177 | return ids 178 | 179 | def getImgIds(self, imgIds=[], catIds=[]): 180 | ''' 181 | Get img ids that satisfy given filter conditions. 182 | :param imgIds (int array) : get imgs for given ids 183 | :param catIds (int array) : get imgs with all given cats 184 | :return: ids (int array) : integer array of img ids 185 | ''' 186 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 187 | catIds = catIds if _isArrayLike(catIds) else [catIds] 188 | 189 | if len(imgIds) == len(catIds) == 0: 190 | ids = self.imgs.keys() 191 | else: 192 | ids = set(imgIds) 193 | for i, catId in enumerate(catIds): 194 | if i == 0 and len(ids) == 0: 195 | ids = set(self.catToImgs[catId]) 196 | else: 197 | ids &= set(self.catToImgs[catId]) 198 | return list(ids) 199 | 200 | def loadAnns(self, ids=[]): 201 | """ 202 | Load anns with the specified ids. 203 | :param ids (int array) : integer ids specifying anns 204 | :return: anns (object array) : loaded ann objects 205 | """ 206 | if _isArrayLike(ids): 207 | return [self.anns[id] for id in ids] 208 | elif type(ids) == int: 209 | return [self.anns[ids]] 210 | 211 | def loadCats(self, ids=[]): 212 | """ 213 | Load cats with the specified ids. 
214 | :param ids (int array) : integer ids specifying cats 215 | :return: cats (object array) : loaded cat objects 216 | """ 217 | if _isArrayLike(ids): 218 | return [self.cats[id] for id in ids] 219 | elif type(ids) == int: 220 | return [self.cats[ids]] 221 | 222 | def loadImgs(self, ids=[]): 223 | """ 224 | Load anns with the specified ids. 225 | :param ids (int array) : integer ids specifying img 226 | :return: imgs (object array) : loaded img objects 227 | """ 228 | if _isArrayLike(ids): 229 | return [self.imgs[id] for id in ids] 230 | elif type(ids) == int: 231 | return [self.imgs[ids]] 232 | 233 | def showAnns(self, anns): 234 | """ 235 | Display the specified annotations. 236 | :param anns (array of object): annotations to display 237 | :return: None 238 | """ 239 | if len(anns) == 0: 240 | return 0 241 | if 'segmentation' in anns[0] or 'keypoints' in anns[0]: 242 | datasetType = 'instances' 243 | elif 'caption' in anns[0]: 244 | datasetType = 'captions' 245 | else: 246 | raise Exception('datasetType not supported') 247 | if datasetType == 'instances': 248 | ax = plt.gca() 249 | ax.set_autoscale_on(False) 250 | polygons = [] 251 | color = [] 252 | for ann in anns: 253 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 254 | if 'segmentation' in ann: 255 | if type(ann['segmentation']) == list: 256 | # polygon 257 | for seg in ann['segmentation']: 258 | poly = np.array(seg).reshape((int(len(seg)/2), 2)) 259 | polygons.append(Polygon(poly)) 260 | color.append(c) 261 | else: 262 | # mask 263 | t = self.imgs[ann['image_id']] 264 | if type(ann['segmentation']['counts']) == list: 265 | rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) 266 | else: 267 | rle = [ann['segmentation']] 268 | m = maskUtils.decode(rle) 269 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 270 | if ann['iscrowd'] == 1: 271 | color_mask = np.array([2.0,166.0,101.0])/255 272 | if ann['iscrowd'] == 0: 273 | color_mask = np.random.random((1, 3)).tolist()[0] 274 | for i in range(3): 275 | img[:,:,i] = color_mask[i] 276 | ax.imshow(np.dstack( (img, m*0.5) )) 277 | if 'keypoints' in ann and type(ann['keypoints']) == list: 278 | # turn skeleton into zero-based index 279 | sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 280 | kp = np.array(ann['keypoints']) 281 | x = kp[0::3] 282 | y = kp[1::3] 283 | v = kp[2::3] 284 | for sk in sks: 285 | if np.all(v[sk]>0): 286 | plt.plot(x[sk],y[sk], linewidth=3, color=c) 287 | plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) 288 | plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) 289 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) 290 | ax.add_collection(p) 291 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 292 | ax.add_collection(p) 293 | elif datasetType == 'captions': 294 | for ann in anns: 295 | print(ann['caption']) 296 | 297 | def loadRes(self, resFile): 298 | """ 299 | Load result file and return a result api object. 
300 | :param resFile (str) : file name of result file 301 | :return: res (obj) : result api object 302 | """ 303 | res = COCO() 304 | res.dataset['images'] = [img for img in self.dataset['images']] 305 | 306 | print('Loading and preparing results...') 307 | tic = time.time() 308 | if type(resFile) == str or type(resFile) == unicode: 309 | anns = json.load(open(resFile)) 310 | elif type(resFile) == np.ndarray: 311 | anns = self.loadNumpyAnnotations(resFile) 312 | else: 313 | anns = resFile 314 | assert type(anns) == list, 'results in not an array of objects' 315 | annsImgIds = [ann['image_id'] for ann in anns] 316 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 317 | 'Results do not correspond to current coco set' 318 | if 'caption' in anns[0]: 319 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 320 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 321 | for id, ann in enumerate(anns): 322 | ann['id'] = id+1 323 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 324 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 325 | for id, ann in enumerate(anns): 326 | bb = ann['bbox'] 327 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 328 | if not 'segmentation' in ann: 329 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 330 | ann['area'] = bb[2]*bb[3] 331 | ann['id'] = id+1 332 | ann['iscrowd'] = 0 333 | elif 'segmentation' in anns[0]: 334 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 335 | for id, ann in enumerate(anns): 336 | # now only support compressed RLE format as segmentation results 337 | ann['area'] = maskUtils.area(ann['segmentation']) 338 | if not 'bbox' in ann: 339 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 340 | ann['id'] = id+1 341 | ann['iscrowd'] = 0 342 | elif 'keypoints' in anns[0]: 343 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 344 | for id, ann in enumerate(anns): 345 | s = ann['keypoints'] 346 | x = s[0::3] 347 | y = s[1::3] 348 | x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) 349 | ann['area'] = (x1-x0)*(y1-y0) 350 | ann['id'] = id + 1 351 | ann['bbox'] = [x0,y0,x1-x0,y1-y0] 352 | print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 353 | 354 | res.dataset['annotations'] = anns 355 | res.createIndex() 356 | return res 357 | 358 | def download(self, tarDir = None, imgIds = [] ): 359 | ''' 360 | Download COCO images from mscoco.org server. 
361 | :param tarDir (str): COCO results directory name 362 | imgIds (list): images to be downloaded 363 | :return: 364 | ''' 365 | if tarDir is None: 366 | print('Please specify target directory') 367 | return -1 368 | if len(imgIds) == 0: 369 | imgs = self.imgs.values() 370 | else: 371 | imgs = self.loadImgs(imgIds) 372 | N = len(imgs) 373 | if not os.path.exists(tarDir): 374 | os.makedirs(tarDir) 375 | for i, img in enumerate(imgs): 376 | tic = time.time() 377 | fname = os.path.join(tarDir, img['file_name']) 378 | if not os.path.exists(fname): 379 | urlretrieve(img['coco_url'], fname) 380 | print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) 381 | 382 | def loadNumpyAnnotations(self, data): 383 | """ 384 | Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} 385 | :param data (numpy.ndarray) 386 | :return: annotations (python nested list) 387 | """ 388 | print('Converting ndarray to lists...') 389 | assert(type(data) == np.ndarray) 390 | print(data.shape) 391 | assert(data.shape[1] == 7) 392 | N = data.shape[0] 393 | ann = [] 394 | for i in range(N): 395 | if i % 1000000 == 0: 396 | print('{}/{}'.format(i,N)) 397 | ann += [{ 398 | 'image_id' : int(data[i, 0]), 399 | 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], 400 | 'score' : data[i, 5], 401 | 'category_id': int(data[i, 6]), 402 | }] 403 | return ann 404 | 405 | def annToRLE(self, ann): 406 | """ 407 | Convert annotation which can be polygons, uncompressed RLE to RLE. 408 | :return: binary mask (numpy 2D array) 409 | """ 410 | t = self.imgs[ann['image_id']] 411 | h, w = t['height'], t['width'] 412 | segm = ann['segmentation'] 413 | if type(segm) == list: 414 | # polygon -- a single object might consist of multiple parts 415 | # we merge all parts into one mask rle code 416 | rles = maskUtils.frPyObjects(segm, h, w) 417 | rle = maskUtils.merge(rles) 418 | elif type(segm['counts']) == list: 419 | # uncompressed RLE 420 | rle = maskUtils.frPyObjects(segm, h, w) 421 | else: 422 | # rle 423 | rle = ann['segmentation'] 424 | return rle 425 | 426 | def annToMask(self, ann): 427 | """ 428 | Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 429 | :return: binary mask (numpy 2D array) 430 | """ 431 | rle = self.annToRLE(ann) 432 | m = maskUtils.decode(rle) 433 | return m 434 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | 4 | import evaluation.cocoapi.PythonAPI.pycocotools._mask as _mask 5 | #import _mask as _mask 6 | 7 | 8 | # Interface for manipulating masks stored in RLE format. 9 | # 10 | # RLE is a simple yet efficient format for storing binary masks. RLE 11 | # first divides a vector (or vectorized image) into a series of piecewise 12 | # constant regions and then for each piece simply stores the length of 13 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 14 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 15 | # (note that the odd counts are always the numbers of zeros). Instead of 16 | # storing the counts directly, additional compression is achieved with a 17 | # variable bitrate representation based on a common scheme called LEB128. 18 | # 19 | # Compression is greatest given large piecewise constant regions. 
20 | # Specifically, the size of the RLE is proportional to the number of 21 | # *boundaries* in M (or for an image the number of boundaries in the y 22 | # direction). Assuming fairly simple shapes, the RLE representation is 23 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 24 | # is substantially lower, especially for large simple objects (large n). 25 | # 26 | # Many common operations on masks can be computed directly using the RLE 27 | # (without need for decoding). This includes computations such as area, 28 | # union, intersection, etc. All of these operations are linear in the 29 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 30 | # of the object. Computing these operations on the original mask is O(n). 31 | # Thus, using the RLE can result in substantial computational savings. 32 | # 33 | # The following API functions are defined: 34 | # encode - Encode binary masks using RLE. 35 | # decode - Decode binary masks encoded via RLE. 36 | # merge - Compute union or intersection of encoded masks. 37 | # iou - Compute intersection over union between masks. 38 | # area - Compute area of encoded masks. 39 | # toBbox - Get bounding boxes surrounding encoded masks. 40 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 41 | # 42 | # Usage: 43 | # Rs = encode( masks ) 44 | # masks = decode( Rs ) 45 | # R = merge( Rs, intersect=false ) 46 | # o = iou( dt, gt, iscrowd ) 47 | # a = area( Rs ) 48 | # bbs = toBbox( Rs ) 49 | # Rs = frPyObjects( [pyObjects], h, w ) 50 | # 51 | # In the API the following formats are used: 52 | # Rs - [dict] Run-length encoding of binary masks 53 | # R - dict Run-length encoding of binary mask 54 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 55 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 56 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 57 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 58 | # dt,gt - May be either bounding boxes or encoded masks 59 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 60 | # 61 | # Finally, a note about the intersection over union (iou) computation. 62 | # The standard iou of a ground truth (gt) and detected (dt) object is 63 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 64 | # For "crowd" regions, we use a modified criteria. If a gt object is 65 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 66 | # Choosing gt' in the crowd gt that best matches the dt can be done using 67 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 68 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 69 | # For crowd gt regions we use this modified criteria above for the iou. 70 | # 71 | # To compile run "python setup.py build_ext --inplace" 72 | # Please do not contact us for help with compiling. 73 | # 74 | # Microsoft COCO Toolbox. version 2.0 75 | # Data, paper, and tutorials available at: http://mscoco.org/ 76 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
77 | # Licensed under the Simplified BSD License [see coco/license.txt] 78 | 79 | iou = _mask.iou 80 | merge = _mask.merge 81 | frPyObjects = _mask.frPyObjects 82 | 83 | def encode(bimask): 84 | if len(bimask.shape) == 3: 85 | return _mask.encode(bimask) 86 | elif len(bimask.shape) == 2: 87 | h, w = bimask.shape 88 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 89 | 90 | def decode(rleObjs): 91 | if type(rleObjs) == list: 92 | return _mask.decode(rleObjs) 93 | else: 94 | return _mask.decode([rleObjs])[:,:,0] 95 | 96 | def area(rleObjs): 97 | if type(rleObjs) == list: 98 | return _mask.area(rleObjs) 99 | else: 100 | return _mask.area([rleObjs])[0] 101 | 102 | def toBbox(rleObjs): 103 | if type(rleObjs) == list: 104 | return _mask.toBbox(rleObjs) 105 | else: 106 | return _mask.toBbox([rleObjs])[0] 107 | -------------------------------------------------------------------------------- /evaluation/cocoapi/PythonAPI/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | from distutils.extension import Extension 4 | import numpy as np 5 | 6 | # To compile and install locally run "python setup.py build_ext --inplace" 7 | # To install library to Python site-packages run "python setup.py build_ext install" 8 | 9 | ext_modules = [ 10 | Extension( 11 | 'pycocotools._mask', 12 | sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'], 13 | include_dirs = [np.get_include(), '../common'], 14 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 15 | ) 16 | ] 17 | 18 | setup(name='pycocotools', 19 | packages=['pycocotools'], 20 | package_dir = {'pycocotools': 'pycocotools'}, 21 | version='2.0', 22 | ext_modules= 23 | cythonize(ext_modules) 24 | ) -------------------------------------------------------------------------------- /evaluation/cocoapi/README.txt: -------------------------------------------------------------------------------- 1 | COCO API - http://cocodataset.org/ 2 | 3 | COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation. This package provides Matlab, Python, and Lua APIs that assists in loading, parsing, and visualizing the annotations in COCO. Please visit http://cocodataset.org/ for more information on COCO, including for the data, paper, and tutorials. The exact format of the annotations is also described on the COCO website. The Matlab and Python APIs are complete, the Lua API provides only basic functionality. 4 | 5 | In addition to this API, please download both the COCO images and annotations in order to run the demos and use the API. Both are available on the project website. 6 | -Please download, unzip, and place the images in: coco/images/ 7 | -Please download and place the annotations in: coco/annotations/ 8 | For substantially more details on the API please see http://cocodataset.org/#download. 9 | 10 | After downloading the images and annotations, run the Matlab, Python, or Lua demos for example usage. 
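For quick orientation, the mask API described above can be exercised with a short Python sketch. This is an illustrative example, not part of the toolbox; the deep import path matches this repository's vendored copy and assumes the `_mask` extension has already been compiled (see the install note below):

```python
import numpy as np
# this import path follows this repository's layout; for a standard install,
# `from pycocotools import mask as maskUtils` is the usual form
from evaluation.cocoapi.PythonAPI.pycocotools import mask as maskUtils

# a tiny binary mask; encode() expects uint8 data in column-major (Fortran) order
m = np.asfortranarray(np.array([[0, 0, 1],
                                [0, 1, 1],
                                [0, 1, 1]], dtype=np.uint8))

rle = maskUtils.encode(m)        # compressed RLE dict: {'size': [h, w], 'counts': ...}
print(maskUtils.area(rle))       # 5  (number of foreground pixels)
print(maskUtils.toBbox(rle))     # [x, y, w, h] box around the mask
decoded = maskUtils.decode(rle)  # back to the original h x w uint8 mask
assert (decoded == m).all()
```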
11 | 12 | To install: 13 | -For Python, run "make" under coco/PythonAPI 14 | -------------------------------------------------------------------------------- /evaluation/cocoapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/evaluation/cocoapi/__init__.py -------------------------------------------------------------------------------- /evaluation/cocoapi/common/gason.cpp: -------------------------------------------------------------------------------- 1 | // https://github.com/vivkin/gason - pulled January 10, 2016 2 | #include "gason.h" 3 | #include 4 | 5 | #define JSON_ZONE_SIZE 4096 6 | #define JSON_STACK_SIZE 32 7 | 8 | const char *jsonStrError(int err) { 9 | switch (err) { 10 | #define XX(no, str) \ 11 | case JSON_##no: \ 12 | return str; 13 | JSON_ERRNO_MAP(XX) 14 | #undef XX 15 | default: 16 | return "unknown"; 17 | } 18 | } 19 | 20 | void *JsonAllocator::allocate(size_t size) { 21 | size = (size + 7) & ~7; 22 | 23 | if (head && head->used + size <= JSON_ZONE_SIZE) { 24 | char *p = (char *)head + head->used; 25 | head->used += size; 26 | return p; 27 | } 28 | 29 | size_t allocSize = sizeof(Zone) + size; 30 | Zone *zone = (Zone *)malloc(allocSize <= JSON_ZONE_SIZE ? JSON_ZONE_SIZE : allocSize); 31 | if (zone == nullptr) 32 | return nullptr; 33 | zone->used = allocSize; 34 | if (allocSize <= JSON_ZONE_SIZE || head == nullptr) { 35 | zone->next = head; 36 | head = zone; 37 | } else { 38 | zone->next = head->next; 39 | head->next = zone; 40 | } 41 | return (char *)zone + sizeof(Zone); 42 | } 43 | 44 | void JsonAllocator::deallocate() { 45 | while (head) { 46 | Zone *next = head->next; 47 | free(head); 48 | head = next; 49 | } 50 | } 51 | 52 | static inline bool isspace(char c) { 53 | return c == ' ' || (c >= '\t' && c <= '\r'); 54 | } 55 | 56 | static inline bool isdelim(char c) { 57 | return c == ',' || c == ':' || c == ']' || c == '}' || isspace(c) || !c; 58 | } 59 | 60 | static inline bool isdigit(char c) { 61 | return c >= '0' && c <= '9'; 62 | } 63 | 64 | static inline bool isxdigit(char c) { 65 | return (c >= '0' && c <= '9') || ((c & ~' ') >= 'A' && (c & ~' ') <= 'F'); 66 | } 67 | 68 | static inline int char2int(char c) { 69 | if (c <= '9') 70 | return c - '0'; 71 | return (c & ~' ') - 'A' + 10; 72 | } 73 | 74 | static double string2double(char *s, char **endptr) { 75 | char ch = *s; 76 | if (ch == '-') 77 | ++s; 78 | 79 | double result = 0; 80 | while (isdigit(*s)) 81 | result = (result * 10) + (*s++ - '0'); 82 | 83 | if (*s == '.') { 84 | ++s; 85 | 86 | double fraction = 1; 87 | while (isdigit(*s)) { 88 | fraction *= 0.1; 89 | result += (*s++ - '0') * fraction; 90 | } 91 | } 92 | 93 | if (*s == 'e' || *s == 'E') { 94 | ++s; 95 | 96 | double base = 10; 97 | if (*s == '+') 98 | ++s; 99 | else if (*s == '-') { 100 | ++s; 101 | base = 0.1; 102 | } 103 | 104 | unsigned int exponent = 0; 105 | while (isdigit(*s)) 106 | exponent = (exponent * 10) + (*s++ - '0'); 107 | 108 | double power = 1; 109 | for (; exponent; exponent >>= 1, base *= base) 110 | if (exponent & 1) 111 | power *= base; 112 | 113 | result *= power; 114 | } 115 | 116 | *endptr = s; 117 | return ch == '-' ? 
-result : result; 118 | } 119 | 120 | static inline JsonNode *insertAfter(JsonNode *tail, JsonNode *node) { 121 | if (!tail) 122 | return node->next = node; 123 | node->next = tail->next; 124 | tail->next = node; 125 | return node; 126 | } 127 | 128 | static inline JsonValue listToValue(JsonTag tag, JsonNode *tail) { 129 | if (tail) { 130 | auto head = tail->next; 131 | tail->next = nullptr; 132 | return JsonValue(tag, head); 133 | } 134 | return JsonValue(tag, nullptr); 135 | } 136 | 137 | int jsonParse(char *s, char **endptr, JsonValue *value, JsonAllocator &allocator) { 138 | JsonNode *tails[JSON_STACK_SIZE]; 139 | JsonTag tags[JSON_STACK_SIZE]; 140 | char *keys[JSON_STACK_SIZE]; 141 | JsonValue o; 142 | int pos = -1; 143 | bool separator = true; 144 | JsonNode *node; 145 | *endptr = s; 146 | 147 | while (*s) { 148 | while (isspace(*s)) { 149 | ++s; 150 | if (!*s) break; 151 | } 152 | *endptr = s++; 153 | switch (**endptr) { 154 | case '-': 155 | if (!isdigit(*s) && *s != '.') { 156 | *endptr = s; 157 | return JSON_BAD_NUMBER; 158 | } 159 | case '0': 160 | case '1': 161 | case '2': 162 | case '3': 163 | case '4': 164 | case '5': 165 | case '6': 166 | case '7': 167 | case '8': 168 | case '9': 169 | o = JsonValue(string2double(*endptr, &s)); 170 | if (!isdelim(*s)) { 171 | *endptr = s; 172 | return JSON_BAD_NUMBER; 173 | } 174 | break; 175 | case '"': 176 | o = JsonValue(JSON_STRING, s); 177 | for (char *it = s; *s; ++it, ++s) { 178 | int c = *it = *s; 179 | if (c == '\\') { 180 | c = *++s; 181 | switch (c) { 182 | case '\\': 183 | case '"': 184 | case '/': 185 | *it = c; 186 | break; 187 | case 'b': 188 | *it = '\b'; 189 | break; 190 | case 'f': 191 | *it = '\f'; 192 | break; 193 | case 'n': 194 | *it = '\n'; 195 | break; 196 | case 'r': 197 | *it = '\r'; 198 | break; 199 | case 't': 200 | *it = '\t'; 201 | break; 202 | case 'u': 203 | c = 0; 204 | for (int i = 0; i < 4; ++i) { 205 | if (isxdigit(*++s)) { 206 | c = c * 16 + char2int(*s); 207 | } else { 208 | *endptr = s; 209 | return JSON_BAD_STRING; 210 | } 211 | } 212 | if (c < 0x80) { 213 | *it = c; 214 | } else if (c < 0x800) { 215 | *it++ = 0xC0 | (c >> 6); 216 | *it = 0x80 | (c & 0x3F); 217 | } else { 218 | *it++ = 0xE0 | (c >> 12); 219 | *it++ = 0x80 | ((c >> 6) & 0x3F); 220 | *it = 0x80 | (c & 0x3F); 221 | } 222 | break; 223 | default: 224 | *endptr = s; 225 | return JSON_BAD_STRING; 226 | } 227 | } else if ((unsigned int)c < ' ' || c == '\x7F') { 228 | *endptr = s; 229 | return JSON_BAD_STRING; 230 | } else if (c == '"') { 231 | *it = 0; 232 | ++s; 233 | break; 234 | } 235 | } 236 | if (!isdelim(*s)) { 237 | *endptr = s; 238 | return JSON_BAD_STRING; 239 | } 240 | break; 241 | case 't': 242 | if (!(s[0] == 'r' && s[1] == 'u' && s[2] == 'e' && isdelim(s[3]))) 243 | return JSON_BAD_IDENTIFIER; 244 | o = JsonValue(JSON_TRUE); 245 | s += 3; 246 | break; 247 | case 'f': 248 | if (!(s[0] == 'a' && s[1] == 'l' && s[2] == 's' && s[3] == 'e' && isdelim(s[4]))) 249 | return JSON_BAD_IDENTIFIER; 250 | o = JsonValue(JSON_FALSE); 251 | s += 4; 252 | break; 253 | case 'n': 254 | if (!(s[0] == 'u' && s[1] == 'l' && s[2] == 'l' && isdelim(s[3]))) 255 | return JSON_BAD_IDENTIFIER; 256 | o = JsonValue(JSON_NULL); 257 | s += 3; 258 | break; 259 | case ']': 260 | if (pos == -1) 261 | return JSON_STACK_UNDERFLOW; 262 | if (tags[pos] != JSON_ARRAY) 263 | return JSON_MISMATCH_BRACKET; 264 | o = listToValue(JSON_ARRAY, tails[pos--]); 265 | break; 266 | case '}': 267 | if (pos == -1) 268 | return JSON_STACK_UNDERFLOW; 269 | if (tags[pos] != JSON_OBJECT) 
270 | return JSON_MISMATCH_BRACKET; 271 | if (keys[pos] != nullptr) 272 | return JSON_UNEXPECTED_CHARACTER; 273 | o = listToValue(JSON_OBJECT, tails[pos--]); 274 | break; 275 | case '[': 276 | if (++pos == JSON_STACK_SIZE) 277 | return JSON_STACK_OVERFLOW; 278 | tails[pos] = nullptr; 279 | tags[pos] = JSON_ARRAY; 280 | keys[pos] = nullptr; 281 | separator = true; 282 | continue; 283 | case '{': 284 | if (++pos == JSON_STACK_SIZE) 285 | return JSON_STACK_OVERFLOW; 286 | tails[pos] = nullptr; 287 | tags[pos] = JSON_OBJECT; 288 | keys[pos] = nullptr; 289 | separator = true; 290 | continue; 291 | case ':': 292 | if (separator || keys[pos] == nullptr) 293 | return JSON_UNEXPECTED_CHARACTER; 294 | separator = true; 295 | continue; 296 | case ',': 297 | if (separator || keys[pos] != nullptr) 298 | return JSON_UNEXPECTED_CHARACTER; 299 | separator = true; 300 | continue; 301 | case '\0': 302 | continue; 303 | default: 304 | return JSON_UNEXPECTED_CHARACTER; 305 | } 306 | 307 | separator = false; 308 | 309 | if (pos == -1) { 310 | *endptr = s; 311 | *value = o; 312 | return JSON_OK; 313 | } 314 | 315 | if (tags[pos] == JSON_OBJECT) { 316 | if (!keys[pos]) { 317 | if (o.getTag() != JSON_STRING) 318 | return JSON_UNQUOTED_KEY; 319 | keys[pos] = o.toString(); 320 | continue; 321 | } 322 | if ((node = (JsonNode *) allocator.allocate(sizeof(JsonNode))) == nullptr) 323 | return JSON_ALLOCATION_FAILURE; 324 | tails[pos] = insertAfter(tails[pos], node); 325 | tails[pos]->key = keys[pos]; 326 | keys[pos] = nullptr; 327 | } else { 328 | if ((node = (JsonNode *) allocator.allocate(sizeof(JsonNode) - sizeof(char *))) == nullptr) 329 | return JSON_ALLOCATION_FAILURE; 330 | tails[pos] = insertAfter(tails[pos], node); 331 | } 332 | tails[pos]->value = o; 333 | } 334 | return JSON_BREAKING_BAD; 335 | } 336 | -------------------------------------------------------------------------------- /evaluation/cocoapi/common/gason.h: -------------------------------------------------------------------------------- 1 | // https://github.com/vivkin/gason - pulled January 10, 2016 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | enum JsonTag { 9 | JSON_NUMBER = 0, 10 | JSON_STRING, 11 | JSON_ARRAY, 12 | JSON_OBJECT, 13 | JSON_TRUE, 14 | JSON_FALSE, 15 | JSON_NULL = 0xF 16 | }; 17 | 18 | struct JsonNode; 19 | 20 | #define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL 21 | #define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL 22 | #define JSON_VALUE_TAG_MASK 0xF 23 | #define JSON_VALUE_TAG_SHIFT 47 24 | 25 | union JsonValue { 26 | uint64_t ival; 27 | double fval; 28 | 29 | JsonValue(double x) 30 | : fval(x) { 31 | } 32 | JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) { 33 | assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK); 34 | ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload; 35 | } 36 | bool isDouble() const { 37 | return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK; 38 | } 39 | JsonTag getTag() const { 40 | return isDouble() ? 
JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK); 41 | } 42 | uint64_t getPayload() const { 43 | assert(!isDouble()); 44 | return ival & JSON_VALUE_PAYLOAD_MASK; 45 | } 46 | double toNumber() const { 47 | assert(getTag() == JSON_NUMBER); 48 | return fval; 49 | } 50 | char *toString() const { 51 | assert(getTag() == JSON_STRING); 52 | return (char *)getPayload(); 53 | } 54 | JsonNode *toNode() const { 55 | assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT); 56 | return (JsonNode *)getPayload(); 57 | } 58 | }; 59 | 60 | struct JsonNode { 61 | JsonValue value; 62 | JsonNode *next; 63 | char *key; 64 | }; 65 | 66 | struct JsonIterator { 67 | JsonNode *p; 68 | 69 | void operator++() { 70 | p = p->next; 71 | } 72 | bool operator!=(const JsonIterator &x) const { 73 | return p != x.p; 74 | } 75 | JsonNode *operator*() const { 76 | return p; 77 | } 78 | JsonNode *operator->() const { 79 | return p; 80 | } 81 | }; 82 | 83 | inline JsonIterator begin(JsonValue o) { 84 | return JsonIterator{o.toNode()}; 85 | } 86 | inline JsonIterator end(JsonValue) { 87 | return JsonIterator{nullptr}; 88 | } 89 | 90 | #define JSON_ERRNO_MAP(XX) \ 91 | XX(OK, "ok") \ 92 | XX(BAD_NUMBER, "bad number") \ 93 | XX(BAD_STRING, "bad string") \ 94 | XX(BAD_IDENTIFIER, "bad identifier") \ 95 | XX(STACK_OVERFLOW, "stack overflow") \ 96 | XX(STACK_UNDERFLOW, "stack underflow") \ 97 | XX(MISMATCH_BRACKET, "mismatch bracket") \ 98 | XX(UNEXPECTED_CHARACTER, "unexpected character") \ 99 | XX(UNQUOTED_KEY, "unquoted key") \ 100 | XX(BREAKING_BAD, "breaking bad") \ 101 | XX(ALLOCATION_FAILURE, "allocation failure") 102 | 103 | enum JsonErrno { 104 | #define XX(no, str) JSON_##no, 105 | JSON_ERRNO_MAP(XX) 106 | #undef XX 107 | }; 108 | 109 | const char *jsonStrError(int err); 110 | 111 | class JsonAllocator { 112 | struct Zone { 113 | Zone *next; 114 | size_t used; 115 | } *head = nullptr; 116 | 117 | public: 118 | JsonAllocator() = default; 119 | JsonAllocator(const JsonAllocator &) = delete; 120 | JsonAllocator &operator=(const JsonAllocator &) = delete; 121 | JsonAllocator(JsonAllocator &&x) : head(x.head) { 122 | x.head = nullptr; 123 | } 124 | JsonAllocator &operator=(JsonAllocator &&x) { 125 | head = x.head; 126 | x.head = nullptr; 127 | return *this; 128 | } 129 | ~JsonAllocator() { 130 | deallocate(); 131 | } 132 | void *allocate(size_t size); 133 | void deallocate(); 134 | }; 135 | 136 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator); 137 | -------------------------------------------------------------------------------- /evaluation/cocoapi/common/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 173 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 174 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 175 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 176 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 177 | } else for( d=0; d<=dy; d++ ) { 178 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 179 | } 180 | } 181 | /* get points along y-boundary and downsample */ 182 | free(x); free(y); k=m; m=0; double xd, yd; 183 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 184 | for( j=1; jw-1 ) continue; 187 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 189 | x[m]=(int) xd; y[m]=(int) yd; m++; 190 | } 191 | /* compute rle encoding given y-boundary points */ 192 | k=m; a=malloc(sizeof(uint)*(k+1)); 193 | for( j=0; j0) b[m++]=a[j++]; else { 199 | j++; if(jm, p=0; long x; int more; 206 | char *s=malloc(sizeof(char)*m*6); 207 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 209 | while( more ) { 210 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 211 | if(more) c |= 0x20; c+=48; s[p++]=c; 212 | } 213 | } 214 | s[p]=0; return s; 215 | } 216 | 217 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 218 | siz m=0, p=0, k; long x; int more; uint *cnts; 219 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 220 | while( s[p] ) { 221 | x=0; k=0; more=1; 222 | while( more ) { 223 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 224 | more = c & 0x20; p++; k++; 225 | if(!more && (c & 0x10)) x |= -1 << 5*k; 226 | } 227 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 228 | } 229 | rleInit(R,h,w,m,cnts); free(cnts); 230 | } 231 | -------------------------------------------------------------------------------- /evaluation/cocoapi/common/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /evaluation/cocoapi/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /evaluation/convert_AVDgt_to_COCOgt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | ''' 5 | Convert AVD annotations into MSCOCO format for evaluation. 6 | 7 | Combines annotations from multiple scenes into a single file for use 8 | with the MSCOCO evaluation code. 9 | ''' 10 | 11 | #AVD_root_path = '/net/bvisionserver3/playpen10/ammirato/Data/RohitData/' 12 | AVD_root_path = '/playpen/ammirato/Data/RohitData/' 13 | save_path = '../Data/GT/' 14 | save_name = 'home0031.json' 15 | scene_list = [ 16 | #'Home_001_1', 17 | #'Home_001_2', 18 | #'Home_002_1', 19 | 'Home_003_1', 20 | #'Home_003_2', 21 | #'Home_004_1', 22 | #'Home_004_2', 23 | #'Home_005_1', 24 | #'Home_005_2', 25 | #'Home_006_1', 26 | #'Home_008_1', 27 | #'Home_014_1', 28 | #'Home_014_2', 29 | #'Office_001_1', 30 | 31 | # 'Home_101_1', 32 | # 'Home_102_1', 33 | # 'Home_103_1', 34 | # 'Home_104_1', 35 | # 'Home_105_1', 36 | # 'Home_106_1', 37 | # 'Home_107_1', 38 | # 'Home_108_1', 39 | # 'Home_109_1', 40 | 41 | 42 | ] 43 | 44 | 45 | if not(os.path.isdir(save_path)): 46 | os.makedirs(save_path) 47 | 48 | #first make categories dict 49 | map_file = open(os.path.join(AVD_root_path,'instance_id_map.txt'),'r') 50 | categories = [] 51 | for line in map_file: 52 | line = str.split(line) 53 | cid = int(line[1]) 54 | name = line[0] 55 | categories.append({'id':cid, 'name':name}) 56 | 57 | img_anns = [] 58 | box_anns = [] 59 | box_id_counter = 0 60 | 61 | cids = [] 62 | for scene in scene_list: 63 | scene_path = os.path.join(AVD_root_path,scene) 64 | annotations = json.load(open(os.path.join(scene_path,'annotations.json'))) 65 | 66 | for img_name in annotations.keys(): 67 | 68 | img_ind = int(img_name[:-4]) 69 | 70 | pre_boxid_counter = box_id_counter 71 | boxes = annotations[img_name]['bounding_boxes'] 72 | for box in boxes: 73 | 74 | xmin = box[0] 75 | ymin = box[1] 76 | width = box[2]-box[0] +1 77 | height = box[3]-box[1] +1 78 | iscrowd = 0 79 | if max(width, height) <= 25 or min(width,height) <= 15: 80 | iscrowd=1 81 | if box[5] >= 5: 82 | iscrowd=1 83 | 84 | area = width*height 85 | cid = box[4] 86 | 87 | cids.append(cid) 88 | box_anns.append({'area':area,'bbox':[xmin,ymin,width,height], 89 | 'category_id':cid,'image_id':img_ind, 90 | 'iscrowd':iscrowd,'segmentation':[], 91 | 'id':box_id_counter}) 92 | box_id_counter += 1 93 | img_anns.append({'file_name':img_name, 'id':img_ind, 'height':540, 'width':960}) 94 | coco_anns = {'images':img_anns, 'annotations':box_anns,'categories':categories} 95 | 96 | with open(os.path.join(save_path,save_name), 'w') as outfile: 97 | json.dump(coco_anns, outfile) 98 | 99 | 100 | 101 | 
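The script above writes a standard COCO-format ground truth file, so it can be sanity-checked with the vendored pycocotools before running evaluation. A minimal sketch (illustrative only; the path matches the `save_path`/`save_name` defaults above, and the detection entry is hypothetical):

```python
from evaluation.cocoapi.PythonAPI.pycocotools.coco import COCO

# load the ground truth written by convert_AVDgt_to_COCOgt.py
coco_gt = COCO('../Data/GT/home0031.json')
print('{} images, {} boxes, {} categories'.format(
    len(coco_gt.getImgIds()), len(coco_gt.getAnnIds()), len(coco_gt.getCatIds())))

# detection results handed to coco_gt.loadRes() (as in eval_by_object.py below)
# are a JSON list of entries like this, with bbox in [x, y, width, height]:
example_det = {'image_id': 101110000,   # int(img_name[:-4]), matching the convention above
               'category_id': 5,
               'bbox': [100.0, 150.0, 60.0, 80.0],
               'score': 0.97}
```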
-------------------------------------------------------------------------------- /evaluation/eval_by_object.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from pycocotools.coco import COCO 3 | from pycocotools.cocoeval import COCOeval 4 | import numpy as np 5 | 6 | 7 | annType ='bbox' 8 | 9 | #initialize COCO ground truth api 10 | cocoGt=COCO('/net/bvisionserver3/playpen10/ammirato/Data/RohitCOCOgt/avd_all.json') 11 | #initialize COCO detections api 12 | det_bp = '/net/bvisionserver3/playpen10/ammirato/Data/Detection/recorded_models_and_meta/test_outputs/' 13 | cocoDt=cocoGt.loadRes(det_bp + 'TDID_GMUsynth2AVD_05_12.json') 14 | 15 | #catIds =[1050, 1052, 1053, 1054, 1055, 1270, 1143, 1243, 1244, 1245, 1247, 1252, 1255, 1256, 1257, 1004, 1005, 1007, 1140, 1142,1271, 1272] 16 | #catIds =[1050, 1052, 1053, 1054, 1055, 1270, 1143, 1243, 1244, 1245, 1247, 1252, 1255, 1256, 1257, 1004, 1005, 1007, 1140, 1142,1271, 1272] 17 | #catIds = [1270,1271,1272,1140,1142,1143,1004,1005,1007,1252,1255,1256,1257,1243,1244,1245,1247,1050,1052,1053,1054,1055] 18 | #catIds = [5,50,10,12,14,79,28,94,96,18,21] 19 | catIds = [5,10,12,14,21,28] 20 | 21 | 22 | # running evaluation 23 | cocoEval = COCOeval(cocoGt,cocoDt,annType) 24 | #cocoEval.params.imgIds = imgIds 25 | cocoEval.params.iouThrs = np.array([0.5]) 26 | cocoEval.params.areaRng = [[0, 10000000000.0], [416, 10000000000.0 ], [0, 416], [416, 3700], [3700, 3750], [3750, 7500], [7500,10000000000.0]] 27 | cocoEval.params.areaRngLbl = ['all', 'valid', 'l0', 'l1', 'l2', 'l3', 'l4'] 28 | cocoEval.params.maxDets = [1, 100, 500] 29 | cocoEval.params.useSegs = [0] 30 | 31 | for cid in catIds: 32 | print('\n\n {}'.format(cid)) 33 | cocoEval.params.catIds = [cid] 34 | cocoEval.evaluate() 35 | cocoEval.accumulate() 36 | cocoEval.summarize() 37 | 38 | -------------------------------------------------------------------------------- /model_defs/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /model_defs/TDID.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.autograd import Variable 4 | import torchvision.models as models 5 | import cv2 6 | import numpy as np 7 | import sys 8 | 9 | from .anchors.proposal_layer import proposal_layer as proposal_layer_py 10 | from .anchors.anchor_target_layer import anchor_target_layer as anchor_target_layer_py 11 | from utils import * 12 | 13 | class TDID(torch.nn.Module): 14 | ''' 15 | Target Driven Instance Detection network. 16 | 17 | Detects a single target object in a scene image. Fully convolutional. 
18 | 19 | Input parameters: 20 | cfg: (Config) a config instance from configs/ 21 | ''' 22 | 23 | def __init__(self, cfg): 24 | super(TDID, self).__init__() 25 | self.cfg = cfg 26 | self.anchor_scales = cfg.ANCHOR_SCALES 27 | 28 | self.features,self._feat_stride,self.num_feature_channels = \ 29 | self.get_feature_net(cfg.FEATURE_NET_NAME) 30 | self.embedding_conv = self.get_embedding_conv(cfg) 31 | self.corr_conv = Conv2d(cfg.NUM_TARGETS*self.num_feature_channels, 32 | self.num_feature_channels, 3, 33 | relu=True, same_padding=True) 34 | self.diff_conv = Conv2d(cfg.NUM_TARGETS*self.num_feature_channels, 35 | self.num_feature_channels, 3, 36 | relu=True, same_padding=True) 37 | #for getting output size of score and bbbox convs 38 | # 3 = number of anchor aspect ratios 39 | # 2 = number of classes (background, target) 40 | # 4 = number of bounding box parameters 41 | self.score_conv = Conv2d(512, len(self.anchor_scales) * 3 * 2, 1, relu=False, same_padding=False) 42 | self.bbox_conv = Conv2d(512, len(self.anchor_scales) * 3 * 4, 1, relu=False, same_padding=False) 43 | 44 | # loss 45 | self.class_cross_entropy_loss = None 46 | self.box_regression_loss = None 47 | self.roi_cross_entropy_loss = None 48 | 49 | @property 50 | def loss(self): 51 | ''' 52 | Get loss of last forward pass through the network 53 | ''' 54 | return self.class_cross_entropy_loss + self.box_regression_loss * 10 55 | 56 | def forward(self, target_data, img_data, img_info, gt_boxes=None, 57 | features_given=False): 58 | ''' 59 | Forward pass through TDID network. 60 | 61 | B = batch size 62 | C = number of channels 63 | H = height 64 | W = width 65 | 66 | Input parameters: 67 | target_data: (torch.FloatTensor) (B*2)xCxHxW tensor of target data 68 | img_data: (torch.FloatTensor) BxCxHxW tensor of scene image data 69 | img_info: (tuple) shape of original scene image 70 | 71 | gt_boxes (optional): (ndarray) ground truth bounding boxes for this 72 | scene/target pair. Must be provided for training 73 | not used for testing. Default: None 74 | features_given (optional): (bool) If True, target_data and img_data 75 | are assumed to be feature maps. The feature 76 | extraction portion of the forward pass 77 | is skipped. 
Default: False 78 | 79 | Returns: 80 | scores: (torch.autograd.variable.Variable) Bxcfg.PROPOSAL_BATCH_SIZEx1 81 | rois: (torch.autograd.variable.Variable) Bxcfg.PROPOSAL_BATCH_SIZEx4 82 | 83 | ''' 84 | if features_given: 85 | img_features = img_data 86 | target_features = target_data 87 | else: 88 | img_features = self.features(img_data) 89 | target_features = self.features(target_data) 90 | 91 | 92 | all_corrs = [] 93 | all_diffs = [] 94 | for batch_ind in range(img_features.size()[0]): 95 | img_ind = np_to_variable(np.asarray([batch_ind]), 96 | is_cuda=True, dtype=torch.LongTensor) 97 | cur_img_feats = torch.index_select(img_features,0,img_ind) 98 | 99 | cur_diffs = [] 100 | cur_corrs = [] 101 | for target_type in range(self.cfg.NUM_TARGETS): 102 | target_ind = np_to_variable(np.asarray([batch_ind* 103 | self.cfg.NUM_TARGETS+target_type]), 104 | is_cuda=True,dtype=torch.LongTensor) 105 | cur_target_feats = torch.index_select(target_features,0, 106 | target_ind[0]) 107 | cur_target_feats = cur_target_feats.view(-1,1, 108 | cur_target_feats.size()[2], 109 | cur_target_feats.size()[3]) 110 | pooled_target_feats = F.max_pool2d(cur_target_feats, 111 | (cur_target_feats.size()[2], 112 | cur_target_feats.size()[3])) 113 | 114 | cur_diffs.append(cur_img_feats - 115 | pooled_target_feats.permute(1,0,2,3).expand_as(cur_img_feats)) 116 | if self.cfg.CORR_WITH_POOLED: 117 | cur_corrs.append(F.conv2d(cur_img_feats, 118 | pooled_target_feats, 119 | groups=self.num_feature_channels)) 120 | else: 121 | target_conv_padding = (max(0,int( 122 | target_features.size()[2]/2)), 123 | max(0,int( 124 | target_features.size()[3]/2))) 125 | cur_corrs.append(F.conv2d(cur_img_feats,cur_target_feats, 126 | padding=target_conv_padding, 127 | groups=self.num_feature_channels)) 128 | 129 | 130 | cur_corrs = torch.cat(cur_corrs,1) 131 | cur_corrs = self.select_to_match_dimensions(cur_corrs,cur_img_feats) 132 | all_corrs.append(cur_corrs) 133 | all_diffs.append(torch.cat(cur_diffs,1)) 134 | 135 | corr = self.corr_conv(torch.cat(all_corrs,0)) 136 | diff = self.diff_conv(torch.cat(all_diffs,0)) 137 | 138 | if self.cfg.USE_IMG_FEATS and self.cfg.USE_DIFF_FEATS: 139 | if self.cfg.USE_CC_FEATS: 140 | concat_feats = torch.cat([corr,img_features, diff],1) 141 | else: 142 | concat_feats = torch.cat([img_features, diff],1) 143 | elif self.cfg.USE_IMG_FEATS: 144 | if self.cfg.USE_CC_FEATS: 145 | concat_feats = torch.cat([corr,img_features],1) 146 | else: 147 | concat_feats = torch.cat([img_features],1) 148 | elif self.cfg.USE_DIFF_FEATS: 149 | if self.cfg.USE_CC_FEATS: 150 | concat_feats = torch.cat([corr,diff],1) 151 | else: 152 | concat_feats = torch.cat([diff],1) 153 | else: 154 | concat_feats = corr 155 | 156 | embedding_feats = self.embedding_conv(concat_feats) 157 | class_score = self.score_conv(embedding_feats) 158 | class_score_reshape = self.reshape_layer(class_score, 2) 159 | class_prob = F.softmax(class_score_reshape) 160 | class_prob_reshape = self.reshape_layer(class_prob, len(self.anchor_scales)*3*2) 161 | 162 | bbox_pred = self.bbox_conv(embedding_feats) 163 | 164 | # proposal layer 165 | rois, scores, anchor_inds, labels = self.proposal_layer( 166 | class_prob_reshape, 167 | bbox_pred, 168 | img_info, 169 | self.cfg, 170 | self._feat_stride, 171 | self.anchor_scales, 172 | gt_boxes) 173 | 174 | if self.training: 175 | assert gt_boxes is not None 176 | anchor_data = self.anchor_target_layer(class_score,gt_boxes, 177 | img_info, self.cfg, 178 | self._feat_stride, 179 | self.anchor_scales) 180 | 
self.class_cross_entropy_loss, self.box_regression_loss = \ 181 | self.build_loss(class_score_reshape, bbox_pred, anchor_data) 182 | 183 | self.roi_cross_entropy_loss = self.build_roi_loss(class_score, 184 | scores,anchor_inds, labels) 185 | 186 | return scores, rois 187 | 188 | 189 | 190 | def build_loss(self, class_score_reshape, bbox_pred, anchor_data): 191 | ''' 192 | Compute loss of a batch from a single forward pass 193 | 194 | Input parameters: 195 | class_score_reshape: (torch.FloatTensor) 196 | bbox_pred: (torch.FloatTensor) 197 | anchor_data: (ndarray) 198 | 199 | Returns: 200 | cross_entropy: (torch.autograd.variable.Variable) classifcation loss 201 | loss_box: (torch.autograd.variable.Variable) bbox regression loss 202 | 203 | ''' 204 | # classification loss 205 | class_score = class_score_reshape.permute(0, 2, 3, 1).contiguous().view(-1, 2) 206 | 207 | anchor_label = anchor_data[0].view(-1) 208 | keep = Variable(anchor_label.data.ne(-1).nonzero().squeeze()).cuda() 209 | class_score = torch.index_select(class_score, 0, keep) 210 | anchor_label = torch.index_select(anchor_label, 0, keep) 211 | 212 | fg_cnt = torch.sum(anchor_label.data.ne(0)) 213 | 214 | # box loss 215 | bbox_targets = anchor_data[1] 216 | bbox_inside_weights = anchor_data[2] 217 | bbox_outside_weights = anchor_data[3] 218 | bbox_targets = torch.mul(bbox_targets, bbox_inside_weights) 219 | bbox_pred = torch.mul(bbox_pred, bbox_inside_weights) 220 | 221 | cross_entropy = F.cross_entropy(class_score,anchor_label, size_average=False) 222 | loss_box = F.smooth_l1_loss(bbox_pred, bbox_targets, size_average=False) / (fg_cnt + 1e-4) 223 | return cross_entropy, loss_box 224 | 225 | 226 | def build_roi_loss(self, class_score, scores, anchor_inds, labels): 227 | ''' 228 | Compute classifcation loss of specified anchor boxes 229 | 230 | Input paramters: 231 | 232 | 233 | Returns: 234 | 235 | ''' 236 | 237 | class_score = class_score.permute(0, 2, 3, 1) 238 | bg_scores = torch.index_select(class_score,3,np_to_variable(np.arange(0,9),is_cuda=True, dtype=torch.LongTensor)) 239 | fg_scores = torch.index_select(class_score,3,np_to_variable(np.arange(9,18),is_cuda=True, dtype=torch.LongTensor)) 240 | bg_scores = bg_scores.contiguous().view(-1,1) 241 | fg_scores = fg_scores.contiguous().view(-1,1) 242 | class_score = torch.cat([bg_scores, fg_scores],1) 243 | class_score = torch.index_select(class_score, 0, anchor_inds.view(-1)) 244 | 245 | labels = labels.view(-1) 246 | roi_cross_entropy = F.cross_entropy(class_score, labels, size_average=False) 247 | return roi_cross_entropy 248 | 249 | 250 | 251 | 252 | 253 | @staticmethod 254 | def reshape_layer(x, d): 255 | ''' 256 | Reshape a tensor to have second dimension d, changing 3rd dimension 257 | 258 | Input parameters: 259 | x: (torch.autograd.variable.Variable) 260 | d: (int) 261 | 262 | Returns: 263 | (torch.autograd.variable.Variable) 264 | 265 | ''' 266 | 267 | input_shape = x.size() 268 | # b c w h 269 | x = x.view( 270 | input_shape[0], 271 | int(d), 272 | int(float(input_shape[1] * input_shape[2]) / float(d)), 273 | input_shape[3] 274 | ) 275 | return x 276 | 277 | @staticmethod 278 | def select_to_match_dimensions(a,b): 279 | ''' 280 | Select elements from first tensor so it's size matches second tensor. 
281 | 282 | Input parameters: 283 | a: (torch.autograd.variable.Variable) 284 | b: (torch.autograd.variable.Variable) 285 | 286 | Returns: 287 | (torch.autograd.variable.Variable) 288 | 289 | ''' 290 | 291 | if a.size()[2] > b.size()[2]: 292 | a = torch.index_select(a, 2, 293 | np_to_variable(np.arange(0, 294 | b.size()[2]).astype(np.int32), 295 | is_cuda=True,dtype=torch.LongTensor)) 296 | if a.size()[3] > b.size()[3]: 297 | a = torch.index_select(a, 3, 298 | np_to_variable(np.arange(0, 299 | b.size()[3]).astype(np.int32), 300 | is_cuda=True,dtype=torch.LongTensor)) 301 | return a 302 | 303 | 304 | @staticmethod 305 | def proposal_layer(class_prob_reshape, bbox_pred, img_info, cfg, _feat_stride, anchor_scales, gt_boxes=None): 306 | ''' 307 | Get top scoring detections 308 | 309 | Wrapper for proposal_layer_py. 310 | 311 | Input parameters: 312 | class_prob_reshape: (torch.autograd.variable.Variable) 313 | bbox_pred: (torch.autograd.variable.Variable) 314 | img_info: (tuple) 315 | cfg: (Config) from ../configs 316 | _feat_stride: (int) 317 | anchor_scales: (list of int) 318 | 319 | gt_boxes (optional): (ndarray) Defatul: None 320 | 321 | 322 | ''' 323 | 324 | #convert to numpy 325 | class_prob_reshape = class_prob_reshape.data.cpu().numpy() 326 | bbox_pred = bbox_pred.data.cpu().numpy() 327 | 328 | rois, scores, anchor_inds, labels = proposal_layer_py( 329 | class_prob_reshape, 330 | bbox_pred, 331 | img_info, cfg, 332 | _feat_stride=_feat_stride, 333 | anchor_scales=anchor_scales, 334 | gt_boxes=gt_boxes) 335 | #convert to pytorch 336 | rois = np_to_variable(rois, is_cuda=True) 337 | anchor_inds = np_to_variable(anchor_inds, is_cuda=True, 338 | dtype=torch.LongTensor) 339 | labels = np_to_variable(labels, is_cuda=True, 340 | dtype=torch.LongTensor) 341 | scores = np_to_variable(scores, is_cuda=True) 342 | return rois, scores, anchor_inds, labels 343 | 344 | 345 | @staticmethod 346 | def anchor_target_layer(class_score, gt_boxes, img_info, 347 | cfg, _feat_stride, anchor_scales): 348 | ''' 349 | Assigns fg/bg label to anchor boxes. 
350 | 351 | 352 | Input parameters: 353 | class_score: (torch.autograd.variable.Variable) 354 | gt_boxes: (ndarray) 355 | img_info: (tuple of int) 356 | cfg: (Config) from ../configs 357 | _feat_stride: (int) 358 | anchor_scales: (list of int) 359 | 360 | Returns: 361 | labels: (torch.autograd.variable.Variable) 362 | bbox_targets: (torch.autograd.variable.Variable) 363 | bbox_inside_weights:(torch.autograd.variable.Variable) 364 | bbox_outside_weights:(torch.autograd.variable.Variable) 365 | ''' 366 | class_score = class_score.data.cpu().numpy() 367 | labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = \ 368 | anchor_target_layer_py(class_score, gt_boxes, img_info, 369 | cfg, _feat_stride, anchor_scales) 370 | 371 | labels = np_to_variable(labels, is_cuda=True, dtype=torch.LongTensor) 372 | bbox_targets = np_to_variable(bbox_targets, is_cuda=True) 373 | bbox_inside_weights = np_to_variable(bbox_inside_weights, is_cuda=True) 374 | bbox_outside_weights = np_to_variable(bbox_outside_weights, is_cuda=True) 375 | 376 | return labels, bbox_targets, bbox_inside_weights, bbox_outside_weights 377 | 378 | def get_features(self, img_data): 379 | img_data = np_to_variable(img_data, is_cuda=True) 380 | img_data = img_data.permute(0, 3, 1, 2) 381 | features = self.features(img_data) 382 | 383 | return features 384 | 385 | 386 | @staticmethod 387 | def get_feature_net(net_name): 388 | ''' 389 | Get the object representing the desired feature extraction network 390 | 391 | Note: only the part of the network considered useful for feature 392 | extraction is returned. i.e. everythnig but the fully 393 | connected layers of AlexNet. 394 | 395 | Input parameters: 396 | net_name: (str) the name of the desired network 397 | 398 | 399 | Availble net names: 400 | vgg16_bn 401 | squeezenet1_1 402 | resenet101 403 | alexnet 404 | ''' 405 | if net_name == 'vgg16_bn': 406 | fnet = models.vgg16_bn(pretrained=False) 407 | return torch.nn.Sequential(*list(fnet.features.children())[:-1]), 16, 512 408 | elif net_name == 'squeezenet1_1': 409 | fnet = models.squeezenet1_1(pretrained=False) 410 | return torch.nn.Sequential(*list(fnet.features.children())[:-1]), 16, 512 411 | elif net_name == 'resnet101': 412 | fnet = models.resnet101(pretrained=False) 413 | return torch.nn.Sequential(*list(fnet.children())[:-2]), 32, 2048 414 | elif net_name == 'alexnet': 415 | fnet = models.alexnet(pretrained=False) 416 | return torch.nn.Sequential(*list(fnet.features.children())), 17, 256 417 | else: 418 | raise NotImplementedError 419 | 420 | def get_embedding_conv(self,cfg): 421 | ''' 422 | Get a Conv2D layer for the TDID embedding based on the config paprams 423 | 424 | Input parameters: 425 | cfg: (Config) from ../configs/ 426 | 427 | ''' 428 | if cfg.USE_IMG_FEATS and cfg.USE_DIFF_FEATS: 429 | if cfg.USE_CC_FEATS: 430 | return Conv2d(3*self.num_feature_channels, 431 | 512, 3, relu=False, same_padding=True) 432 | else: 433 | return Conv2d(2*self.num_feature_channels, 434 | 512, 3, relu=False, same_padding=True) 435 | elif cfg.USE_IMG_FEATS: 436 | if cfg.USE_CC_FEATS: 437 | return Conv2d(2*self.num_feature_channels, 438 | 512, 3, relu=False, same_padding=True) 439 | else: 440 | return Conv2d(self.num_feature_channels, 441 | 512, 3, relu=False, same_padding=True) 442 | elif cfg.USE_DIFF_FEATS: 443 | if cfg.USE_CC_FEATS: 444 | return Conv2d(2*self.num_feature_channels, 445 | 512, 3, relu=False, same_padding=True) 446 | else: 447 | return Conv2d(self.num_feature_channels, 448 | 512, 3, relu=False, same_padding=True) 449 | 
else: 450 | return Conv2d(self.num_feature_channels, 451 | 512, 3, relu=False, same_padding=True) 452 | 453 | 454 | -------------------------------------------------------------------------------- /model_defs/TDID_fast.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import torchvision.models as models 6 | 7 | import cv2 8 | import numpy as np 9 | import sys 10 | import time 11 | 12 | from instance_detection.utils.timer import Timer 13 | from rpn_msr.proposal_layer import proposal_layer as proposal_layer_py 14 | from rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py 15 | 16 | import network 17 | from network import Conv2d, FC 18 | 19 | 20 | class TDID(nn.Module): 21 | groups=512 22 | 23 | def __init__(self, cfg): 24 | super(TDID, self).__init__() 25 | self.cfg = cfg 26 | self.anchor_scales = cfg.ANCHOR_SCALES 27 | 28 | self.features,self._feat_stride,self.num_feature_channels = \ 29 | self.get_feature_net(cfg.FEATURE_NET_NAME) 30 | 31 | self.groups = self.num_feature_channels 32 | self.conv1 = self.get_conv1(cfg) 33 | self.cc_conv = Conv2d(cfg.NUM_TARGETS*self.num_feature_channels, 34 | self.num_feature_channels, 3, 35 | relu=True, same_padding=True) 36 | self.diff_conv = Conv2d(cfg.NUM_TARGETS*self.num_feature_channels, 37 | self.num_feature_channels, 3, 38 | relu=True, same_padding=True) 39 | self.score_conv = Conv2d(512, len(self.anchor_scales) * 3 * 2, 1, relu=False, same_padding=False) 40 | self.bbox_conv = Conv2d(512, len(self.anchor_scales) * 3 * 4, 1, relu=False, same_padding=False) 41 | 42 | # loss 43 | self.roi_cross_entropy = None 44 | self.cross_entropy = None 45 | self.loss_box = None 46 | 47 | self.timer = Timer() 48 | self.time_info = {'img_features':0} 49 | self.time = 0 50 | 51 | @property 52 | def loss(self): 53 | #return self.roi_cross_entropy + self.cross_entropy + self.loss_box * 10 54 | return self.cross_entropy + self.loss_box * 10 55 | #return self.roi_cross_entropy 56 | 57 | def forward(self, target_data, im_data, gt_boxes=None, features_given=False, im_info=None, return_timing_info=False): 58 | 59 | 60 | #self.timer.tic() 61 | self.time = time.clock() 62 | #get image features 63 | if features_given: 64 | img_features = im_data 65 | target_features = target_data 66 | else: 67 | img_features = self.features(im_data) 68 | target_features = self.features(target_data) 69 | 70 | #featrues timing end 71 | padding = (max(0,int(target_features.size()[2]/2)), 72 | max(0,int(target_features.size()[3]/2))) 73 | ccs = [] 74 | diffs = [] 75 | 76 | sample_img = img_features 77 | diff = [] 78 | cc = [] 79 | sample_target1 = target_features[0,:,:,:].unsqueeze(0) 80 | sample_target2 = target_features[1,:,:,:].unsqueeze(0) 81 | 82 | sample_target1 = sample_target1.permute((1,0,2,3)) 83 | sample_target2 = sample_target2.permute((1,0,2,3)) 84 | 85 | tf_pooled1 = F.max_pool2d(sample_target1,(sample_target1.size()[2], 86 | sample_target1.size()[3])) 87 | tf_pooled2 = F.max_pool2d(sample_target2,(sample_target2.size()[2], 88 | sample_target2.size()[3])) 89 | 90 | diff.append(sample_img - tf_pooled1.permute(1,0,2,3).expand_as(sample_img)) 91 | diff.append(sample_img - tf_pooled2.permute(1,0,2,3).expand_as(sample_img)) 92 | cc.append(F.conv2d(sample_img,tf_pooled1,groups=self.groups)) 93 | cc.append(F.conv2d(sample_img,tf_pooled2,groups=self.groups)) 94 | 95 | #pooll/diff/corr timing end 96 | 97 | cc = 
torch.cat(cc,1) 98 | diffs = torch.cat(diff,1) 99 | 100 | cc = self.cc_conv(cc) 101 | diffs = self.diff_conv(diffs) 102 | cc = torch.cat([cc,diffs],1) 103 | rpn_conv1 = self.conv1(cc) 104 | 105 | #rpnconv timing end 106 | 107 | 108 | # rpn score 109 | rpn_cls_score = self.score_conv(rpn_conv1) 110 | rpn_cls_score_reshape = self.reshape_layer(rpn_cls_score, 2) 111 | rpn_cls_prob = F.softmax(rpn_cls_score_reshape) 112 | rpn_cls_prob_reshape = self.reshape_layer(rpn_cls_prob, len(self.anchor_scales)*3*2) 113 | 114 | # rpn boxes 115 | rpn_bbox_pred = self.bbox_conv(rpn_conv1) 116 | 117 | 118 | #score/reg timgin end 119 | 120 | # proposal layer 121 | rois,scores, anchor_inds, labels = self.proposal_layer(rpn_cls_prob_reshape, 122 | rpn_bbox_pred, 123 | im_info, 124 | self.cfg, 125 | self._feat_stride, 126 | self.anchor_scales, 127 | gt_boxes) 128 | 129 | #rois = network.np_to_variable(np.zeros((1,300,4)),is_cuda=False) 130 | #scores = network.np_to_variable(np.zeros((1,300,1)),is_cuda=False) 131 | #anchor_inds =network.np_to_variable(np.zeros((1,300,1)),is_cuda=False,dtype=torch.cuda.LongTensor) 132 | #labels = network.np_to_variable(np.zeros((1,300)),is_cuda=False,dtype=torch.cuda.LongTensor) 133 | #rois = np.zeros((1,300,4)) 134 | #scores = np.zeros((1,300,1)) 135 | #anchor_inds =np.zeros((1,300,1)) 136 | #labels = np.zeros((1,300)) 137 | #self.time_info['img_features'] = self.timer.toc(average=False) 138 | self.time_info['img_features'] = time.clock() - self.time 139 | #prop timing end 140 | 141 | #return target_features, features, rois, scores 142 | if return_timing_info: 143 | return scores.data.cpu().numpy(), rois.data.cpu().numpy(), self.time_info 144 | else: 145 | return scores, rois 146 | 147 | 148 | 149 | def build_loss(self, rpn_cls_score_reshape, rpn_bbox_pred, rpn_data): 150 | # classification loss 151 | rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(-1, 2) 152 | 153 | rpn_label = rpn_data[0].view(-1) 154 | rpn_keep = Variable(rpn_label.data.ne(-1).nonzero().squeeze()).cuda() 155 | rpn_cls_score = torch.index_select(rpn_cls_score, 0, rpn_keep) 156 | rpn_label = torch.index_select(rpn_label, 0, rpn_keep) 157 | 158 | fg_cnt = torch.sum(rpn_label.data.ne(0)) 159 | 160 | # box loss 161 | rpn_bbox_targets = rpn_data[1] 162 | rpn_bbox_inside_weights = rpn_data[2] 163 | rpn_bbox_outside_weights = rpn_data[3] 164 | rpn_bbox_targets = torch.mul(rpn_bbox_targets, rpn_bbox_inside_weights) 165 | rpn_bbox_pred = torch.mul(rpn_bbox_pred, rpn_bbox_inside_weights) 166 | 167 | rpn_cross_entropy = F.cross_entropy(rpn_cls_score, rpn_label, size_average=False) 168 | rpn_loss_box = F.smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, size_average=False) / (fg_cnt + 1e-4) 169 | return rpn_cross_entropy, rpn_loss_box 170 | 171 | 172 | def build_roi_loss(self, rpn_cls_score_reshape, rpn_cls_prob_reshape, scores, anchor_inds, labels): 173 | 174 | batch_size = rpn_cls_score_reshape.size()[0] 175 | rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1)#.contiguous().view(-1, 2) 176 | bg_scores = torch.index_select(rpn_cls_score,3,network.np_to_variable(np.arange(0,9),is_cuda=True, dtype=torch.LongTensor)) 177 | fg_scores = torch.index_select(rpn_cls_score,3,network.np_to_variable(np.arange(9,18),is_cuda=True, dtype=torch.LongTensor)) 178 | bg_scores = bg_scores.contiguous().view(-1,1) 179 | fg_scores = fg_scores.contiguous().view(-1,1) 180 | 181 | rpn_cls_score = torch.cat([bg_scores, fg_scores],1) 182 | 183 | rpn_cls_score = torch.index_select(rpn_cls_score, 0, 
anchor_inds.view(-1)) 184 | labels = labels.view(-1) 185 | 186 | roi_cross_entropy = F.cross_entropy(rpn_cls_score, labels, size_average=False) 187 | 188 | return roi_cross_entropy 189 | 190 | 191 | @staticmethod 192 | def reshape_layer(x, d): 193 | input_shape = x.size() 194 | # b c w h 195 | x = x.view( 196 | input_shape[0], 197 | int(d), 198 | int(float(input_shape[1] * input_shape[2]) / float(d)), 199 | input_shape[3] 200 | ) 201 | return x 202 | 203 | @staticmethod 204 | def select_to_match_dimensions(a,b): 205 | if a.size()[2] > b.size()[2]: 206 | a = torch.index_select(a, 2, 207 | network.np_to_variable(np.arange(0, 208 | b.size()[2]).astype(np.int32), 209 | is_cuda=True,dtype=torch.LongTensor)) 210 | if a.size()[3] > b.size()[3]: 211 | a = torch.index_select(a, 3, 212 | network.np_to_variable(np.arange(0, 213 | b.size()[3]).astype(np.int32), 214 | is_cuda=True,dtype=torch.LongTensor)) 215 | return a 216 | 217 | 218 | @staticmethod 219 | def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg, _feat_stride, anchor_scales, gt_boxes=None): 220 | 221 | #convert to numpy 222 | rpn_cls_prob_reshape = rpn_cls_prob_reshape.data.cpu().numpy() 223 | rpn_bbox_pred = rpn_bbox_pred.data.cpu().numpy() 224 | 225 | rois, scores, anchor_inds, labels = proposal_layer_py(rpn_cls_prob_reshape, 226 | rpn_bbox_pred, 227 | im_info, cfg, 228 | _feat_stride=_feat_stride, 229 | anchor_scales=anchor_scales, 230 | gt_boxes=gt_boxes) 231 | rois = network.np_to_variable(rois, is_cuda=True) 232 | anchor_inds = network.np_to_variable(anchor_inds, is_cuda=True, 233 | dtype=torch.LongTensor) 234 | labels = network.np_to_variable(labels, is_cuda=True, 235 | dtype=torch.LongTensor) 236 | #just get fg scores, make bg scores 0 237 | scores = network.np_to_variable(scores, is_cuda=True) 238 | return rois, scores, anchor_inds, labels 239 | 240 | 241 | @staticmethod 242 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, 243 | cfg, _feat_stride, anchor_scales): 244 | """ 245 | rpn_cls_score: for pytorch (1, Ax2, H, W) bg/fg scores of previous conv layer 246 | gt_boxes: (G, 5) vstack of [x1, y1, x2, y2, class] 247 | gt_ishard: (G, 1), 1 or 0 indicates difficult or not 248 | dontcare_areas: (D, 4), some areas may contains small objs but no labelling. 
D may be 0 249 | im_info: a list of [image_height, image_width, scale_ratios] 250 | _feat_stride: the downsampling ratio of feature map to the original input image 251 | anchor_scales: the scales to the basic_anchor (basic anchor is [16, 16]) 252 | ---------- 253 | Returns 254 | ---------- 255 | rpn_labels : (1, 1, HxA, W), for each anchor, 0 denotes bg, 1 fg, -1 dontcare 256 | rpn_bbox_targets: (1, 4xA, H, W), distances of the anchors to the gt_boxes (may contain some transform) 257 | that are the regression objectives 258 | rpn_bbox_inside_weights: (1, 4xA, H, W) weights of each box, mainly accepts hyper param in cfg 259 | rpn_bbox_outside_weights: (1, 4xA, H, W) used to balance the fg/bg, 260 | because the numbers of bgs and fgs may be significantly different 261 | """ 262 | rpn_cls_score = rpn_cls_score.data.cpu().numpy() 263 | rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \ 264 | anchor_target_layer_py(rpn_cls_score, gt_boxes, im_info, 265 | cfg, _feat_stride, anchor_scales) 266 | 267 | rpn_labels = network.np_to_variable(rpn_labels, is_cuda=True, dtype=torch.LongTensor) 268 | rpn_bbox_targets = network.np_to_variable(rpn_bbox_targets, is_cuda=True) 269 | rpn_bbox_inside_weights = network.np_to_variable(rpn_bbox_inside_weights, is_cuda=True) 270 | rpn_bbox_outside_weights = network.np_to_variable(rpn_bbox_outside_weights, is_cuda=True) 271 | 272 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 273 | 274 | def get_features(self, im_data): 275 | im_data = network.np_to_variable(im_data, is_cuda=True) 276 | im_data = im_data.permute(0, 3, 1, 2) 277 | features = self.features(im_data) 278 | 279 | return features 280 | 281 | 282 | @staticmethod 283 | def get_feature_net(net_name): 284 | if net_name == 'vgg16_bn': 285 | fnet = models.vgg16_bn(pretrained=False) 286 | return torch.nn.Sequential(*list(fnet.features.children())[:-1]), 16, 512 287 | elif net_name == 'squeezenet1_1': 288 | fnet = models.squeezenet1_1(pretrained=False) 289 | return torch.nn.Sequential(*list(fnet.features.children())[:-1]), 16, 512 290 | elif net_name == 'resnet101': 291 | fnet = models.resnet101(pretrained=False) 292 | return torch.nn.Sequential(*list(fnet.children())[:-2]), 32, 2048 293 | else: 294 | print 'feature net type not supported!'
295 | sys.exit() 296 | 297 | def get_conv1(self,cfg): 298 | if cfg.USE_IMG_FEATS and cfg.USE_DIFF_FEATS: 299 | if cfg.USE_CC_FEATS: 300 | return Conv2d(3*self.num_feature_channels, 301 | 512, 3, relu=False, same_padding=True) 302 | else: 303 | return Conv2d(2*self.num_feature_channels, 304 | 512, 3, relu=False, same_padding=True) 305 | elif cfg.USE_IMG_FEATS: 306 | if cfg.USE_CC_FEATS: 307 | return Conv2d(2*self.num_feature_channels, 308 | 512, 3, relu=False, same_padding=True) 309 | else: 310 | return Conv2d(self.num_feature_channels, 311 | 512, 3, relu=False, same_padding=True) 312 | elif cfg.USE_DIFF_FEATS: 313 | if cfg.USE_CC_FEATS: 314 | return Conv2d(2*self.num_feature_channels, 315 | 512, 3, relu=False, same_padding=True) 316 | else: 317 | return Conv2d(self.num_feature_channels, 318 | 512, 3, relu=False, same_padding=True) 319 | else: 320 | return Conv2d(self.num_feature_channels, 321 | 512, 3, relu=False, same_padding=True) 322 | 323 | Conv2d(3*self.num_feature_channels, 324 | 512, 3, relu=False, same_padding=True) 325 | -------------------------------------------------------------------------------- /model_defs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/model_defs/__init__.py -------------------------------------------------------------------------------- /model_defs/anchors/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /model_defs/anchors/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cython_bbox 2 | 3 | -------------------------------------------------------------------------------- /model_defs/anchors/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | # Edited by Phil Ammirato, UNC-Chapel Hill 8 | 9 | import os 10 | import yaml 11 | import numpy as np 12 | import numpy.random as npr 13 | 14 | from .generate_anchors import generate_anchors 15 | from .cython_bbox import bbox_overlaps, bbox_intersections 16 | from .bbox_transform import bbox_transform 17 | 18 | def anchor_target_layer(cls_score, gt_boxes, img_info, cfg, _feat_stride=16, 19 | anchor_scales=[2, 4, 8,]): 20 | ''' 21 | Produces anchor classification labels and bounding-box regression targets. 22 | 23 | Input parameters: 24 | cls_score: (ndarray) network output score map 25 | gt_boxes: (ndarray) ground truth bounding boxes 26 | img_info: (tuple of int) 27 | cfg: (Config) 28 | 29 | _feat_stride(optional): (int) scaling factor between input feature 30 | map (class_prob_reshape) and original image. 
31 | Default: 16 32 | anchor_scales (optional): (list of int) scale for size of anchor boxes 33 | Default: [2,4,8] 34 | 35 | Returns: 36 | all_labels : (ndarray) labels assigned to each anchor box 37 | all_bbox_targets: (ndarray) box parameter targets for each anchor 38 | all_bbox_inside_weights: (ndarray) to be removed 39 | all_bbox_outside_weights: (ndarray) to be removed 40 | 41 | ''' 42 | # Algorithm: 43 | # 44 | # for each (H, W) location i 45 | # generate 9 anchor boxes centered on cell i 46 | # apply predicted bbox deltas at cell i to each of the 9 anchors 47 | # filter out-of-image anchors 48 | # measure GT overlap 49 | 50 | batch_size = cls_score.shape[0] 51 | 52 | _anchors = generate_anchors(scales=np.array(anchor_scales)) 53 | _num_anchors = _anchors.shape[0] 54 | 55 | # allow boxes to sit over the edge by a small amount 56 | _allowed_border = 0 57 | 58 | # map of shape (..., H, W) 59 | # pytorch (bs, c, h, w) 60 | height, width = cls_score.shape[2:4] 61 | 62 | # 1. Generate proposals from bbox deltas and shifted anchors 63 | shift_x = np.arange(0, width) * _feat_stride 64 | shift_y = np.arange(0, height) * _feat_stride 65 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) # in W H order 66 | # K is H x W 67 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 68 | shift_x.ravel(), shift_y.ravel())).transpose() 69 | # add A anchors (1, A, 4) to 70 | # cell K shifts (K, 1, 4) to get 71 | # shift anchors (K, A, 4) 72 | # reshape to (K*A, 4) shifted anchors 73 | A = _num_anchors 74 | K = shifts.shape[0] 75 | all_anchors = (_anchors.reshape((1, A, 4)) + 76 | shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 77 | all_anchors = all_anchors.reshape((K * A, 4)) 78 | total_anchors = int(K * A) 79 | 80 | # only keep anchors inside the image 81 | inds_inside = np.where( 82 | (all_anchors[:, 0] >= -_allowed_border) & 83 | (all_anchors[:, 1] >= -_allowed_border) & 84 | (all_anchors[:, 2] < img_info[1] + _allowed_border) & # width 85 | (all_anchors[:, 3] < img_info[0] + _allowed_border) # height 86 | )[0] 87 | 88 | # keep only inside anchors 89 | anchors = all_anchors[inds_inside, :] 90 | 91 | all_labels = None 92 | all_bbox_targets = None 93 | all_bbox_inside_weights = None 94 | all_bbox_outside_weights = None 95 | 96 | for batch_ind in range(batch_size): 97 | 98 | # label: 1 is positive, 0 is negative, -1 is dont care 99 | labels = np.empty((len(inds_inside),), dtype=np.float32) 100 | labels.fill(-1) 101 | 102 | #get rid of background gt_boxes 103 | gt_box = np.expand_dims(gt_boxes[batch_ind,:], 0) 104 | if gt_box[0,-1] == 0: 105 | #if target is not present(no gt box) all boxes are bg (0) 106 | labels.fill(0) 107 | else: 108 | # overlaps between the anchors and the gt boxes 109 | # overlaps (ex, gt), shape is A x G 110 | overlaps = bbox_overlaps( 111 | np.ascontiguousarray(anchors, dtype=np.float), 112 | np.ascontiguousarray(gt_box, dtype=np.float)) 113 | argmax_overlaps = overlaps.argmax(axis=1) # (A) 114 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] 115 | gt_argmax_overlaps = overlaps.argmax(axis=0) # G 116 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 117 | np.arange(overlaps.shape[1])] 118 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 119 | 120 | if not cfg.PROPOSAL_CLOBBER_POSITIVES: 121 | # assign bg labels first so that positive labels can clobber them 122 | labels[max_overlaps < cfg.PROPOSAL_NEGATIVE_OVERLAP] = 0 123 | 124 | # fg label: for each gt, anchor with highest overlap 125 | labels[gt_argmax_overlaps] = 1 126 | # fg label: 
above threshold IOU 127 | labels[max_overlaps >= cfg.PROPOSAL_POSITIVE_OVERLAP] = 1 128 | 129 | if cfg.PROPOSAL_CLOBBER_POSITIVES: 130 | # assign bg labels last so that negative labels can clobber positives 131 | labels[max_overlaps < cfg.PROPOSAL_NEGATIVE_OVERLAP] = 0 132 | 133 | 134 | # subsample positive labels if we have too many 135 | num_fg = int(cfg.PROPOSAL_FG_FRACTION * cfg.PROPOSAL_BATCH_SIZE) 136 | fg_inds = np.where(labels == 1)[0] 137 | if len(fg_inds) > num_fg: 138 | disable_inds = npr.choice( 139 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 140 | labels[disable_inds] = -1 141 | 142 | # subsample negative labels if we have too many 143 | num_bg = cfg.PROPOSAL_BATCH_SIZE - np.sum(labels == 1) 144 | bg_inds = np.where(labels == 0)[0] 145 | if len(bg_inds) > num_bg: 146 | 147 | disable_inds = npr.choice( 148 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 149 | labels[disable_inds] = -1 150 | 151 | #if the gt_box is a dummy bg, there are no bbox_targets 152 | if gt_box[0,-1] == 0: 153 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 154 | else: 155 | bbox_targets = _compute_targets(anchors, gt_box[argmax_overlaps, :]) 156 | 157 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 158 | bbox_inside_weights[labels == 1, :] = np.array(cfg.PROPOSAL_BBOX_INSIDE_WEIGHTS) 159 | 160 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 161 | if cfg.PROPOSAL_POSITIVE_WEIGHT < 0: 162 | # uniform weighting of examples (given non-uniform sampling) 163 | positive_weights = np.ones((1, 4)) 164 | negative_weights = np.zeros((1, 4)) 165 | else: 166 | assert ((cfg.PROPOSAL_POSITIVE_WEIGHT > 0) & 167 | (cfg.PROPOSAL_POSITIVE_WEIGHT < 1)) 168 | positive_weights = (cfg.PROPOSAL_POSITIVE_WEIGHT / 169 | (np.sum(labels == 1)) + 1) 170 | negative_weights = ((1.0 - cfg.PROPOSAL_POSITIVE_WEIGHT) / 171 | (np.sum(labels == 0)) + 1) 172 | bbox_outside_weights[labels == 1, :] = positive_weights 173 | bbox_outside_weights[labels == 0, :] = negative_weights 174 | 175 | # map up to original set of anchors 176 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 177 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 178 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 179 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 180 | 181 | # labels 182 | labels = labels.reshape((1, height, width, A)) 183 | labels = labels.transpose(0, 3, 1, 2) 184 | labels = labels.reshape((1, 1, A * height, width)).transpose(0, 2, 3, 1) 185 | #labels = labels 186 | 187 | # bbox_targets 188 | bbox_targets = bbox_targets \ 189 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 190 | 191 | bbox_targets = bbox_targets 192 | # bbox_inside_weights 193 | bbox_inside_weights = bbox_inside_weights \ 194 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 195 | 196 | bbox_inside_weights = bbox_inside_weights 197 | 198 | # bbox_outside_weights 199 | bbox_outside_weights = bbox_outside_weights \ 200 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 201 | 202 | bbox_outside_weights = bbox_outside_weights 203 | 204 | #finished one batch, update data structs 205 | if all_labels is None: 206 | all_labels = labels 207 | all_bbox_targets = bbox_targets 208 | all_bbox_inside_weights = bbox_inside_weights 209 | all_bbox_outside_weights = bbox_outside_weights 210 | else: 211 | all_labels = np.concatenate((all_labels,labels),0) 212 | 
all_bbox_targets = np.concatenate((all_bbox_targets, 213 | bbox_targets), 0) 214 | all_bbox_inside_weights = np.concatenate((all_bbox_inside_weights, 215 | bbox_inside_weights),0) 216 | all_bbox_outside_weights = np.concatenate((all_bbox_outside_weights, 217 | bbox_outside_weights),0) 218 | 219 | return all_labels, all_bbox_targets, all_bbox_inside_weights, all_bbox_outside_weights 220 | 221 | 222 | def _unmap(data, count, inds, fill=0): 223 | """ Unmap a subset of item (data) back to the original set of items (of 224 | size count) """ 225 | if len(data.shape) == 1: 226 | ret = np.empty((count,), dtype=np.float32) 227 | ret.fill(fill) 228 | ret[inds] = data 229 | else: 230 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32) 231 | ret.fill(fill) 232 | ret[inds, :] = data 233 | return ret 234 | 235 | 236 | def _compute_targets(ex_rois, gt_rois): 237 | """Compute bounding-box regression targets for an image.""" 238 | 239 | assert ex_rois.shape[0] == gt_rois.shape[0] 240 | assert ex_rois.shape[1] == 4 241 | assert gt_rois.shape[1] == 5 242 | 243 | return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) 244 | -------------------------------------------------------------------------------- /model_defs/anchors/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps(np.ndarray[DTYPE_t, ndim=2] boxes, 16 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 17 | return bbox_overlaps_c(boxes, query_boxes) 18 | 19 | cdef np.ndarray[DTYPE_t, ndim=2] bbox_overlaps_c( 20 | np.ndarray[DTYPE_t, ndim=2] boxes, 21 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 22 | """ 23 | Parameters 24 | ---------- 25 | boxes: (N, 4) ndarray of float 26 | query_boxes: (K, 4) ndarray of float 27 | Returns 28 | ------- 29 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 30 | """ 31 | cdef unsigned int N = boxes.shape[0] 32 | cdef unsigned int K = query_boxes.shape[0] 33 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 34 | cdef DTYPE_t iw, ih, box_area 35 | cdef DTYPE_t ua 36 | cdef unsigned int k, n 37 | for k in range(K): 38 | box_area = ( 39 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 40 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 41 | ) 42 | for n in range(N): 43 | iw = ( 44 | min(boxes[n, 2], query_boxes[k, 2]) - 45 | max(boxes[n, 0], query_boxes[k, 0]) + 1 46 | ) 47 | if iw > 0: 48 | ih = ( 49 | min(boxes[n, 3], query_boxes[k, 3]) - 50 | max(boxes[n, 1], query_boxes[k, 1]) + 1 51 | ) 52 | if ih > 0: 53 | ua = float( 54 | (boxes[n, 2] - boxes[n, 0] + 1) * 55 | (boxes[n, 3] - boxes[n, 1] + 1) + 56 | box_area - iw * ih 57 | ) 58 | overlaps[n, k] = iw * ih / ua 59 | return overlaps 60 | 61 | 62 | def bbox_intersections( 63 | np.ndarray[DTYPE_t, ndim=2] boxes, 64 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 65 | return bbox_intersections_c(boxes, query_boxes) 66 | 67 | 68 | cdef np.ndarray[DTYPE_t, ndim=2] bbox_intersections_c( 69 | np.ndarray[DTYPE_t, ndim=2] boxes, 70 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 71 | """ 72 | For each query box compute the intersection ratio covered by boxes 73 
| ---------- 74 | Parameters 75 | ---------- 76 | boxes: (N, 4) ndarray of float 77 | query_boxes: (K, 4) ndarray of float 78 | Returns 79 | ------- 80 | overlaps: (N, K) ndarray of intersec between boxes and query_boxes 81 | """ 82 | cdef unsigned int N = boxes.shape[0] 83 | cdef unsigned int K = query_boxes.shape[0] 84 | cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE) 85 | cdef DTYPE_t iw, ih, box_area 86 | cdef DTYPE_t ua 87 | cdef unsigned int k, n 88 | for k in range(K): 89 | box_area = ( 90 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 91 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 92 | ) 93 | for n in range(N): 94 | iw = ( 95 | min(boxes[n, 2], query_boxes[k, 2]) - 96 | max(boxes[n, 0], query_boxes[k, 0]) + 1 97 | ) 98 | if iw > 0: 99 | ih = ( 100 | min(boxes[n, 3], query_boxes[k, 3]) - 101 | max(boxes[n, 1], query_boxes[k, 1]) + 1 102 | ) 103 | if ih > 0: 104 | intersec[n, k] = iw * ih / box_area 105 | return intersec -------------------------------------------------------------------------------- /model_defs/anchors/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | #edited by Phil Ammirato 8 | 9 | import numpy as np 10 | from sympy.physics.paulialgebra import delta 11 | 12 | 13 | def bbox_transform(ex_rois, gt_rois): 14 | """ 15 | computes the distance from ground-truth boxes to the given boxes, normed by their size 16 | :param ex_rois: n * 4 numpy array, given boxes 17 | :param gt_rois: n * 4 numpy array, ground-truth boxes 18 | :return: deltas: n * 4 numpy array, ground-truth boxes 19 | """ 20 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 21 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 22 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 23 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 24 | 25 | # assert np.min(ex_widths) > 0.1 and np.min(ex_heights) > 0.1, \ 26 | # 'Invalid boxes found: {} {}'. 
\ 27 | # format(ex_rois[np.argmin(ex_widths), :], ex_rois[np.argmin(ex_heights), :]) 28 | 29 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 30 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 31 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 32 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 33 | 34 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 35 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 36 | targets_dw = np.log(gt_widths / ex_widths) 37 | targets_dh = np.log(gt_heights / ex_heights) 38 | 39 | targets = np.vstack( 40 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 41 | return targets 42 | 43 | 44 | def bbox_transform_inv(boxes, deltas): 45 | if boxes.shape[0] == 0: 46 | return np.zeros((0,), dtype=deltas.dtype) 47 | 48 | boxes = boxes.astype(deltas.dtype, copy=False) 49 | 50 | widths = boxes[:,:, 2] - boxes[:,:, 0] + 1.0 51 | heights = boxes[:,:, 3] - boxes[:,:, 1] + 1.0 52 | ctr_x = boxes[:,:, 0] + 0.5 * widths 53 | ctr_y = boxes[:,:, 1] + 0.5 * heights 54 | 55 | dx = deltas[:,:, 0::4] 56 | dy = deltas[:,:, 1::4] 57 | dw = deltas[:,:, 2::4] 58 | dh = deltas[:,:, 3::4] 59 | 60 | pred_ctr_x = dx * widths[:,:, np.newaxis] + ctr_x[:,:, np.newaxis] 61 | pred_ctr_y = dy * heights[:,:, np.newaxis] + ctr_y[:,:, np.newaxis] 62 | pred_w = np.exp(dw) * widths[:,:, np.newaxis] 63 | pred_h = np.exp(dh) * heights[:,:, np.newaxis] 64 | 65 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 66 | # x1 67 | pred_boxes[:,:, 0::4] = pred_ctr_x - 0.5 * pred_w 68 | # y1 69 | pred_boxes[:,:, 1::4] = pred_ctr_y - 0.5 * pred_h 70 | # x2 71 | pred_boxes[:,:, 2::4] = pred_ctr_x + 0.5 * pred_w 72 | # y2 73 | pred_boxes[:,:, 3::4] = pred_ctr_y + 0.5 * pred_h 74 | 75 | return pred_boxes 76 | 77 | 78 | def clip_boxes(boxes, im_shape): 79 | """ 80 | Clip boxes to image boundaries. 
81 | """ 82 | if boxes.shape[0] == 0: 83 | return boxes 84 | 85 | # x1 >= 0 86 | boxes[:,:, 0::4] = np.maximum(np.minimum(boxes[:,:, 0::4], im_shape[1] - 1), 0) 87 | # y1 >= 0 88 | boxes[:,:, 1::4] = np.maximum(np.minimum(boxes[:,:, 1::4], im_shape[0] - 1), 0) 89 | # x2 < im_shape[1] 90 | boxes[:,:, 2::4] = np.maximum(np.minimum(boxes[:,:, 2::4], im_shape[1] - 1), 0) 91 | # y2 < im_shape[0] 92 | boxes[:,:, 3::4] = np.maximum(np.minimum(boxes[:,:, 3::4], im_shape[0] - 1), 0) 93 | return boxes 94 | 95 | 96 | -------------------------------------------------------------------------------- /model_defs/anchors/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6)): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 42 | """ 43 | 44 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 45 | ratio_anchors = _ratio_enum(base_anchor, ratios) 46 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 47 | for i in range(ratio_anchors.shape[0])]) 48 | return anchors 49 | 50 | def _whctrs(anchor): 51 | """ 52 | Return width, height, x center, and y center for an anchor (window). 53 | """ 54 | 55 | w = anchor[2] - anchor[0] + 1 56 | h = anchor[3] - anchor[1] + 1 57 | x_ctr = anchor[0] + 0.5 * (w - 1) 58 | y_ctr = anchor[1] + 0.5 * (h - 1) 59 | return w, h, x_ctr, y_ctr 60 | 61 | def _mkanchors(ws, hs, x_ctr, y_ctr): 62 | """ 63 | Given a vector of widths (ws) and heights (hs) around a center 64 | (x_ctr, y_ctr), output a set of anchors (windows). 65 | """ 66 | 67 | ws = ws[:, np.newaxis] 68 | hs = hs[:, np.newaxis] 69 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 70 | y_ctr - 0.5 * (hs - 1), 71 | x_ctr + 0.5 * (ws - 1), 72 | y_ctr + 0.5 * (hs - 1))) 73 | return anchors 74 | 75 | def _ratio_enum(anchor, ratios): 76 | """ 77 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 78 | """ 79 | 80 | w, h, x_ctr, y_ctr = _whctrs(anchor) 81 | size = w * h 82 | size_ratios = size / ratios 83 | ws = np.round(np.sqrt(size_ratios)) 84 | hs = np.round(ws * ratios) 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | 88 | def _scale_enum(anchor, scales): 89 | """ 90 | Enumerate a set of anchors for each scale wrt an anchor. 
91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | from IPython import embed; embed() 104 | -------------------------------------------------------------------------------- /model_defs/anchors/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | # Edited by Phil Ammirato, UNC-Chapel Hill 8 | 9 | import numpy as np 10 | import yaml 11 | 12 | from .generate_anchors import generate_anchors 13 | from .bbox_transform import bbox_transform_inv, clip_boxes 14 | from ..nms.nms_wrapper import nms 15 | from .cython_bbox import bbox_overlaps, bbox_intersections 16 | 17 | 18 | 19 | 20 | def proposal_layer(class_prob_reshape, bbox_pred, img_info, cfg, _feat_stride=16, 21 | anchor_scales=[2, 4, 8],gt_boxes=None): 22 | ''' 23 | Outputs object detection proposals 24 | 25 | Applys estimated bounding-box transformations to a set of 26 | regular boxes (called "anchors"). 27 | 28 | fg = foreground (the target object) 29 | bg = background (not the target) 30 | 31 | 32 | Input parameters: 33 | 34 | class_prob_reshape: (ndarray) 35 | bbox_pred: (ndarray) 36 | img_info: (tuple of int) 37 | cfg: (Config) 38 | 39 | _feat_stride(optional): (int) scaling factor between input feature 40 | map (class_prob_reshape) and original image. 41 | Default: 16 42 | anchor_scales (optional): (list of int) scale for size of anchor boxes 43 | Default: [2,4,8] 44 | gt_boxes (optional): (ndarray) If not None, return value all_labels 45 | will have fg/bg label of each anchor box. If None 46 | all_labels will be meaningless. Default: None 47 | 48 | Returns: 49 | all_proposals: (ndarray) The proposed bounding boxes 50 | all_scores: (ndarray) The fg/bg score for each bounding box 51 | all_anchor_inds: (ndarray) The index of the anchor box that 52 | corresponds to the proposed bounding box 53 | all_labels: (ndarray) ground truth fg/bg label for each proposed 54 | bounding box. 55 | 56 | # Algorithm: 57 | # 58 | # for each (H, W) location i 59 | # generate A anchor boxes centered on cell i 60 | # apply predicted bbox deltas at cell i to each of the A anchors 61 | # clip predicted boxes to image 62 | # remove predicted boxes with either height or width < threshold 63 | # sort all (proposal, score) pairs by score from highest to lowest 64 | # take top cfg.PRE_NMS_TOP_N proposals before NMS 65 | # apply NMS with threshold 0.7 to remaining proposals 66 | # take after_nms_topN proposals after NMS 67 | # return the top proposals (-> RoIs top, scores top) 68 | 69 | ''' 70 | 71 | batch_size = class_prob_reshape.shape[0] 72 | _anchors = generate_anchors(scales=np.array(anchor_scales)) 73 | _num_anchors = _anchors.shape[0] 74 | 75 | # the first set of _num_anchors channels are bg probs 76 | # the second set are the fg probs, which we want 77 | scores = class_prob_reshape[:, _num_anchors:, :, :] 78 | bbox_deltas = bbox_pred 79 | 80 | # 1. 
Generate proposals from bbox deltas and shifted anchors 81 | height, width = scores.shape[-2:] 82 | 83 | # Enumerate all shifts 84 | shift_x = np.arange(0, width) * _feat_stride 85 | shift_y = np.arange(0, height) * _feat_stride 86 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 87 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 88 | shift_x.ravel(), shift_y.ravel())).transpose() 89 | 90 | # Enumerate all shifted anchors: 91 | # 92 | # add A anchors (1, A, 4) to 93 | # cell K shifts (K, 1, 4) to get 94 | # shift anchors (K, A, 4) 95 | # reshape to (K*A, 4) shifted anchors 96 | A = _num_anchors 97 | K = shifts.shape[0] 98 | anchors = _anchors.reshape((1, A, 4)) + \ 99 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 100 | anchors = anchors.reshape((K * A, 4)) 101 | anchors = np.tile(anchors, (batch_size,1,1)) 102 | 103 | # Transpose and reshape predicted bbox transformations to get them 104 | # into the same order as the anchors: 105 | # 106 | # bbox deltas will be (1, 4 * A, H, W) format 107 | # transpose to (1, H, W, 4 * A) 108 | # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) 109 | # in slowest to fastest order 110 | bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((batch_size,-1, 4)) 111 | 112 | # Same story for the scores: 113 | # 114 | # scores are (1, A, H, W) format 115 | # transpose to (1, H, W, A) 116 | # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) 117 | scores = scores.transpose((0, 2, 3, 1)).reshape((batch_size,-1)) 118 | 119 | # Convert anchors into proposals via bbox transformations 120 | proposals = bbox_transform_inv(anchors, bbox_deltas) 121 | 122 | # 2. clip predicted boxes to image 123 | proposals = clip_boxes(proposals, img_info[:2]) 124 | 125 | # 3. remove predicted boxes with either height or width < threshold 126 | # (NOTE: convert min_size to input image scale stored in img_info[2]) 127 | lose = _filter_boxes(proposals, cfg.PROPOSAL_MIN_BOX_SIZE * img_info[2]) 128 | proposals[lose[0],lose[1],:] = 0 129 | scores[lose[0],lose[1]] = 0 130 | 131 | # 4. sort all (proposal, score) pairs by score from highest to lowest 132 | # 5. take top cfg.PRE_NMS_TOP_N (e.g. 6000) 133 | order = scores.argsort(1)[:,::-1] 134 | anchor_inds = np.tile(np.arange(order.shape[1]),(batch_size,1)) 135 | if cfg.PRE_NMS_TOP_N > 0: 136 | order = order[:,:cfg.PRE_NMS_TOP_N] 137 | b_select = np.arange(batch_size) 138 | proposals = np.take(proposals,order,axis=1)[b_select,b_select,:,:] 139 | scores = np.take(scores,order,axis=1)[b_select,b_select,:] 140 | anchor_inds = np.take(anchor_inds,order,axis=1)[b_select,b_select,:] 141 | 142 | all_proposals = None 143 | all_scores = None 144 | all_anchor_inds = None 145 | all_labels = None 146 | 147 | for batch_ind in range(batch_size): 148 | 149 | b_proposals = proposals[batch_ind,:,:] 150 | b_scores = np.expand_dims(scores[batch_ind,:], 1) 151 | b_anchor_inds = (np.expand_dims(anchor_inds[batch_ind,:],1) + 152 | batch_ind*anchors.shape[1]) 153 | 154 | # 6. apply nms (e.g. threshold = 0.7) 155 | # 7. take after_nms_topN (e.g. 300) 156 | # 8. 
return the top proposals (-> RoIs top) 157 | keep = nms(np.hstack((b_proposals, b_scores)), cfg.NMS_THRESH) 158 | if cfg.POST_NMS_TOP_N > 0: 159 | keep = keep[:cfg.POST_NMS_TOP_N] 160 | 161 | b_proposals = b_proposals[keep, :] 162 | b_scores = b_scores[keep] 163 | b_anchor_inds = b_anchor_inds[keep] 164 | 165 | assert(b_anchor_inds.shape[0] == b_scores.shape[0]) 166 | 167 | if b_proposals.shape[0] == 0: 168 | b_proposals = np.zeros((1,4)) 169 | b_scores = np.zeros((1,1)) 170 | b_anchor_inds = np.zeros(1) 171 | 172 | #match anchor inds with gt boxes 173 | b_labels = -1*np.ones(b_anchor_inds.size) 174 | if gt_boxes is not None: 175 | b_labels.fill(0) 176 | #get rid of background gt_boxes 177 | gt_box = np.expand_dims(gt_boxes[batch_ind,:],axis=0) 178 | if gt_box[0,-1] == 0:#this is a bg box 179 | b_labels.fill(0) 180 | else: 181 | # overlaps between the anchors and the gt boxes 182 | # overlaps (ex, gt), shape is A x G 183 | overlaps = bbox_overlaps( 184 | np.ascontiguousarray(b_proposals, dtype=np.float), 185 | np.ascontiguousarray(gt_box, dtype=np.float)) 186 | argmax_overlaps = overlaps.argmax(axis=1) # (A) 187 | max_overlaps = overlaps[np.arange(len(b_anchor_inds)), argmax_overlaps] 188 | gt_argmax_overlaps = overlaps.argmax(axis=0) # G 189 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 190 | np.arange(overlaps.shape[1])] 191 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 192 | 193 | if not cfg.PROPOSAL_CLOBBER_POSITIVES: 194 | # assign bg labels first so that positive labels can clobber them 195 | #labels[max_overlaps < cfg.TRAIN.PROPOSAL_NEGATIVE_OVERLAP] = 0 196 | b_labels[max_overlaps < .2] = 0 197 | 198 | # fg label: for each gt, anchor with highest overlap 199 | b_labels[gt_argmax_overlaps] = 1 200 | # fg label: above threshold IOU 201 | b_labels[max_overlaps >= .5] = 1 202 | 203 | if True:#cfg.TRAIN.PROPOSAL_CLOBBER_POSITIVES: 204 | # assign bg labels last so that negative labels can clobber positives 205 | b_labels[max_overlaps < .2] = 0 206 | 207 | #finshed with one batch, update data structs 208 | if all_proposals is None: 209 | all_proposals = np.expand_dims(b_proposals, axis=0) 210 | all_scores = np.expand_dims(b_scores, axis=0) 211 | all_anchor_inds = np.expand_dims(b_anchor_inds, axis=0) 212 | all_labels = np.expand_dims(b_labels, axis=0) 213 | else: 214 | all_proposals = _append_and_pad(all_proposals,b_proposals) 215 | all_scores = _append_and_pad(all_scores,b_scores) 216 | all_anchor_inds = _append_and_pad(all_anchor_inds,b_anchor_inds) 217 | all_labels = _append_and_pad(all_labels,b_labels) 218 | 219 | return all_proposals, all_scores,all_anchor_inds,all_labels 220 | 221 | 222 | def _append_and_pad(all_batches, single_batch): 223 | """ appends a1 to a2 at axis 0, padding the shorter of a1,a2""" 224 | if all_batches.shape[1] < single_batch.shape[0]: 225 | num_to_add = single_batch.shape[0] - all_batches.shape[1] 226 | all_batches = _pad_to_match(all_batches, num_to_add, axis=1) 227 | elif all_batches.shape[1] > single_batch.shape[0]: 228 | num_to_add = all_batches.shape[1] - single_batch.shape[0] 229 | single_batch = _pad_to_match(single_batch,num_to_add, axis=0) 230 | single_batch = np.expand_dims(single_batch,0) 231 | 232 | return np.concatenate((all_batches,single_batch)) 233 | 234 | 235 | def _pad_to_match(to_pad, num_to_add, axis=0): 236 | pad_dims = [] 237 | for dim, dim_size in enumerate(to_pad.shape): 238 | if (dim==axis): 239 | pad_dims.append(num_to_add) 240 | else: 241 | pad_dims.append(dim_size) 242 | padding = np.zeros(pad_dims) 243 | 
return np.concatenate((to_pad,padding),axis=axis) 244 | 245 | 246 | 247 | def _filter_boxes(boxes, min_size): 248 | """Remove all boxes with any side smaller than min_size.""" 249 | ws = boxes[:,:, 2] - boxes[:,:, 0] + 1 250 | hs = boxes[:,:, 3] - boxes[:,:, 1] + 1 251 | lose = np.where((ws < min_size) & (hs < min_size)) 252 | return lose 253 | 254 | 255 | 256 | 257 | -------------------------------------------------------------------------------- /model_defs/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | python setup.py build_ext --inplace 6 | 7 | rm -rf build/ 8 | -------------------------------------------------------------------------------- /model_defs/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /model_defs/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ammirato/target_driven_instance_detection/be0d5fbd4c60cbd1f2ff483547449e703e1d3f56/model_defs/nms/__init__.py -------------------------------------------------------------------------------- /model_defs/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + 
areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /model_defs/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /model_defs/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /model_defs/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | 
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | 
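The CPU, GPU, and pure-Python NMS implementations in this directory all perform the same greedy IoU suppression: keep the highest-scoring box, discard every remaining box whose IoU with it exceeds the threshold, and repeat. As a rough usage sketch (not part of the repository), the toy example below exercises `py_cpu_nms` from `model_defs/nms/py_cpu_nms.py`; the box coordinates, the 0.5 threshold, and the assumption that `model_defs/` is on `PYTHONPATH` are illustrative only.

```python
import numpy as np
from nms.py_cpu_nms import py_cpu_nms  # assumes model_defs/ is on PYTHONPATH

# Each row is [x1, y1, x2, y2, score].
dets = np.array([[ 10,  10,  50,  50, 0.9],   # highest score, always kept
                 [ 12,  12,  52,  52, 0.8],   # IoU ~0.83 with the first box -> suppressed
                 [100, 100, 140, 140, 0.7]],  # no overlap -> kept
                dtype=np.float32)

keep = py_cpu_nms(dets, thresh=0.5)
print(keep)  # expected: [0, 2]
```

The `nms` wrapper in the next file offers the same style of call but dispatches to the compiled GPU kernel above.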
-------------------------------------------------------------------------------- /model_defs/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | #from faster_rcnn.nms.cpu_nms import cpu_nms 9 | #from faster_rcnn.nms.gpu_nms import gpu_nms 10 | from .cpu_nms import cpu_nms 11 | from .gpu_nms import gpu_nms 12 | 13 | 14 | def nms(dets, thresh, force_cpu=False): 15 | """Dispatch to either CPU or GPU NMS implementations.""" 16 | 17 | if dets.shape[0] == 0: 18 | return [] 19 | if True: #cfg.USE_GPU_NMS and not force_cpu: 20 | return gpu_nms(dets, thresh, device_id=0) 21 | else: 22 | return cpu_nms(dets, thresh) 23 | -------------------------------------------------------------------------------- /model_defs/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /model_defs/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDA_HOME env variable. 
If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDA_HOME' in os.environ: 38 | home = os.environ['CUDA_HOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | #for k, v in cudaconfig.iteritems(): 53 | for k in cudaconfig.keys(): 54 | v = cudaconfig[k] 55 | if not os.path.exists(v): 56 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 57 | 58 | return cudaconfig 59 | 60 | 61 | CUDA = locate_cuda() 62 | 63 | # Obtain the numpy include directory. This logic works across numpy versions. 64 | try: 65 | numpy_include = np.get_include() 66 | except AttributeError: 67 | numpy_include = np.get_numpy_include() 68 | 69 | 70 | def customize_compiler_for_nvcc(self): 71 | """inject deep into distutils to customize how the dispatch 72 | to gcc/nvcc works. 73 | 74 | If you subclass UnixCCompiler, it's not trivial to get your subclass 75 | injected in, and still have the right customizations (i.e. 76 | distutils.sysconfig.customize_compiler) run on it. So instead of going 77 | the OO route, I have this. Note, it's kindof like a wierd functional 78 | subclassing going on.""" 79 | 80 | # tell the compiler it can processes .cu 81 | self.src_extensions.append('.cu') 82 | 83 | # save references to the default compiler_so and _comple methods 84 | default_compiler_so = self.compiler_so 85 | super = self._compile 86 | 87 | # now redefine the _compile method. This gets executed for each 88 | # object but distutils doesn't have the ability to change compilers 89 | # based on source extension: we add it. 
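    # In effect: '.cu' sources are handed to nvcc with the 'nvcc' entry of extra_postargs,
    # while every other source keeps the host compiler and uses the 'gcc' entry; extra_postargs
    # is the per-compiler extra_compile_args dict passed to each Extension in ext_modules below,
    # and the default compiler_so is restored once the object file has been built.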
90 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 91 | print(extra_postargs) 92 | if os.path.splitext(src)[1] == '.cu': 93 | # use the cuda for .cu files 94 | self.set_executable('compiler_so', CUDA['nvcc']) 95 | # use only a subset of the extra_postargs, which are 1-1 translated 96 | # from the extra_compile_args in the Extension class 97 | postargs = extra_postargs['nvcc'] 98 | else: 99 | postargs = extra_postargs['gcc'] 100 | 101 | super(obj, src, ext, cc_args, postargs, pp_opts) 102 | # reset the default compiler_so, which we might have changed for cuda 103 | self.compiler_so = default_compiler_so 104 | 105 | # inject our redefined _compile method into the class 106 | self._compile = _compile 107 | 108 | 109 | # run the customize_compiler 110 | class custom_build_ext(build_ext): 111 | def build_extensions(self): 112 | customize_compiler_for_nvcc(self.compiler) 113 | build_ext.build_extensions(self) 114 | 115 | 116 | ext_modules = [ 117 | Extension( 118 | "anchors.cython_bbox", 119 | ["anchors/bbox.pyx"], 120 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 121 | include_dirs=[numpy_include] 122 | ), 123 | Extension( 124 | "nms.cpu_nms", 125 | ["nms/cpu_nms.pyx"], 126 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 127 | include_dirs=[numpy_include] 128 | ), 129 | Extension('nms.gpu_nms', 130 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 131 | library_dirs=[CUDA['lib64']], 132 | libraries=['cudart'], 133 | language='c++', 134 | runtime_library_dirs=[CUDA['lib64']], 135 | # this syntax is specific to this build system 136 | # we're only going to use certain compiler args with nvcc and not with gcc 137 | # the implementation of this trick is in customize_compiler() below 138 | extra_compile_args={'gcc': ["-Wno-unused-function"], 139 | 'nvcc': ['-arch=sm_35', 140 | '--ptxas-options=-v', 141 | '-c', 142 | '--compiler-options', 143 | "'-fPIC'"]}, 144 | include_dirs=[numpy_include, CUDA['include']] 145 | ), 146 | ] 147 | 148 | setup( 149 | name='tdid', 150 | ext_modules=ext_modules, 151 | # inject our custom trigger 152 | cmdclass={'build_ext': custom_build_ext}, 153 | ) 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | opencv-python 3 | sympy 4 | matplotlib 5 | h5py 6 | -------------------------------------------------------------------------------- /test_tdid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchvision.models as models 4 | import cv2 5 | #import cPickle 6 | import numpy as np 7 | import importlib 8 | import json 9 | 10 | from model_defs.TDID import TDID 11 | from model_defs.nms.nms_wrapper import nms 12 | from utils import * 13 | 14 | import active_vision_dataset_processing.data_loading.active_vision_dataset as AVD 15 | 16 | 17 | def im_detect(net, target_data,im_data, im_info, features_given=True): 18 | """ 19 | Detect single target object in a single scene image. 20 | 21 | Input Parameters: 22 | net: (TDID) the network 23 | target_data: (torch Variable) target images 24 | im_data: (torch Variable) scene_image 25 | im_info: (tuple) (height,width,channels) of im_data 26 | 27 | features_given(optional): (bool) if true, target_data and im_data 28 | are feature maps from net.features, 29 | not images. 
Default: True 30 | 31 | 32 | Returns: 33 | scores (ndarray): N x 2 array of class scores 34 | (N boxes, classes={background,target}) 35 | boxes (ndarray): N x 4 array of predicted bounding boxes 36 | """ 37 | 38 | cls_prob, rois = net(target_data, im_data, im_info, 39 | features_given=features_given) 40 | scores = cls_prob.data.cpu().numpy()[0,:,:] 41 | zs = np.zeros((scores.size, 1)) 42 | scores = np.concatenate((zs,scores),1) 43 | boxes = rois.data.cpu().numpy()[0,:, :] 44 | 45 | return scores, boxes 46 | 47 | 48 | def test_net(model_name, net, dataloader, target_images, chosen_ids, cfg, 49 | max_dets_per_target=5, score_thresh=0.1, 50 | output_dir=None): 51 | """ 52 | Test a TDID network. 53 | 54 | Input Parameters: 55 | model_name: (string) name of model for saving results 56 | net: (TDID) the network 57 | dataloader: (torch DataLoader) dataloader for test set 58 | target_images: (dict) holds paths to target images 59 | chosen_ids: (list) list of object ids to test on 60 | cfg: (Config) config file 61 | 62 | max_dets_per_target (optional): (int) maximum number of detections 63 | outputted for a single target/scene 64 | image pair. Default: 5. 65 | score_thresh (optional): (float) minimum score a box must have to be 66 | outputted. Default: .1 67 | output_dir (optional): (str) full path of directory to save results in 68 | if None, nothing will be saved. 69 | Default: None. 70 | 71 | 72 | """ 73 | results = [] 74 | num_images = len(dataloader) 75 | id_to_name = cfg.ID_TO_NAME 76 | # timers 77 | _t = {'im_detect': Timer(), 'misc': Timer()} 78 | 79 | if output_dir is not None: 80 | if not(os.path.isdir(output_dir)): 81 | os.makedirs(output_dir) 82 | det_file = os.path.join(output_dir, model_name+'.json') 83 | 84 | #load targets, maybe compute features 85 | target_features_dict = {} 86 | target_data_dict = {} 87 | for id_ind,t_id in enumerate(chosen_ids): 88 | target_name = id_to_name[t_id] 89 | if target_name == 'background': 90 | continue 91 | target_data = [] 92 | for t_type,_ in enumerate(target_images[target_name]): 93 | img_ind = np.random.choice(np.arange( 94 | len(target_images[target_name][t_type]))) 95 | target_img = cv2.imread(target_images[target_name][t_type][img_ind]) 96 | target_img = normalize_image(target_img,cfg) 97 | target_data.append(target_img) 98 | 99 | target_data = match_and_concat_images_list(target_data) 100 | target_data = np_to_variable(target_data, is_cuda=True) 101 | target_data = target_data.permute(0, 3, 1, 2) 102 | if cfg.TEST_ONE_AT_A_TIME: 103 | target_data_dict[target_name] = target_data 104 | else: 105 | target_features_dict[target_name] = net.features(target_data) 106 | 107 | for i,batch in enumerate(dataloader): 108 | im_data= batch[0] 109 | org_img = im_data 110 | im_info = im_data.shape[:] 111 | if cfg.TEST_RESIZE_IMG_FACTOR > 0: 112 | im_data = cv2.resize(im_data,(0,0),fx=cfg.TEST_RESIZE_IMG_FACTOR, fy=cfg.TEST_RESIZE_IMG_FACTOR) 113 | im_data = normalize_image(im_data,cfg) 114 | im_data = np_to_variable(im_data, is_cuda=True) 115 | im_data = im_data.unsqueeze(0) 116 | im_data = im_data.permute(0, 3, 1, 2) 117 | 118 | #get image name and index 119 | img_name = batch[1][1] 120 | img_id = int(img_name[:-4]) 121 | 122 | #get image features 123 | if not cfg.TEST_ONE_AT_A_TIME: 124 | img_features = net.features(im_data) 125 | 126 | for id_ind,t_id in enumerate(chosen_ids): 127 | target_name = id_to_name[t_id] 128 | if target_name == 'background': 129 | continue 130 | 131 | if cfg.TEST_ONE_AT_A_TIME: 132 | target_data = target_data_dict[target_name] 133 
| _t['im_detect'].tic() 134 | scores, boxes = im_detect(net, target_data, im_data, im_info, 135 | features_given=False) 136 | detect_time = _t['im_detect'].toc(average=False) 137 | else: 138 | target_features = target_features_dict[target_name] 139 | _t['im_detect'].tic() 140 | scores, boxes = im_detect(net, target_features, img_features, 141 | im_info, features_given=True) 142 | detect_time = _t['im_detect'].toc(average=False) 143 | _t['misc'].tic() 144 | 145 | if cfg.TEST_RESIZE_IMG_FACTOR > 0: 146 | boxes *= (1.0/cfg.TEST_RESIZE_IMG_FACTOR) 147 | if cfg.TEST_RESIZE_BOXES_FACTOR > 0: 148 | boxes *= cfg.TEST_RESIZE_BOXES_FACTOR 149 | 150 | #get scores for foreground, apply non-maximum suppression 151 | inds = np.where(scores[:, 1] > score_thresh)[0] 152 | fg_scores = scores[inds, 1] 153 | fg_boxes = boxes[inds,:] 154 | fg_dets = np.hstack((fg_boxes, fg_scores[:, np.newaxis])) \ 155 | .astype(np.float32, copy=False) 156 | keep = nms(fg_dets, cfg.TEST_NMS_OVERLAP_THRESH) 157 | fg_dets = fg_dets[keep, :] 158 | 159 | # Limit to max_dets_per_target detections *over all classes* 160 | if max_dets_per_target > 0: 161 | image_scores = np.hstack([fg_dets[:, -1]]) 162 | if len(image_scores) > max_dets_per_target: 163 | image_thresh = np.sort(image_scores)[-max_dets_per_target] 164 | keep = np.where(fg_dets[:, -1] >= image_thresh)[0] 165 | fg_dets = fg_dets[keep, :] 166 | nms_time = _t['misc'].toc(average=False) 167 | 168 | print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \ 169 | .format(i + 1, num_images, detect_time, nms_time)) 170 | 171 | #put class id in the box 172 | fg_dets = np.insert(fg_dets,4,t_id,axis=1) 173 | 174 | for box in fg_dets: 175 | cid = int(box[4]) 176 | xmin = int(box[0]) 177 | ymin = int(box[1]) 178 | width = int(box[2]-box[0] + 1) 179 | height = int(box[3]-box[1] + 1) 180 | score = float(box[5]) 181 | results.append({'image_id':img_id, 'category_id':cid, 182 | 'bbox':[xmin,ymin,width,height],'score':score}) 183 | #cv2.rectangle expects integer pixel coordinates 184 | org_img = cv2.rectangle(org_img, (int(box[0]), int(box[1])), (int(box[2]),int(box[3])), (255,0,0), 2) 185 | 186 | cv2.imwrite('./out_img.jpg', org_img) 187 | if output_dir is not None: 188 | with open(det_file, 'w') as f: 189 | json.dump(results,f) 190 | return results 191 | 192 | 193 | 194 | if __name__ == '__main__': 195 | 196 | #load config file 197 | cfg_file = 'configAVD2' #NO EXTENSION! 
198 | cfg = importlib.import_module('configs.'+cfg_file) 199 | cfg = cfg.get_config() 200 | 201 | ##prepare target images (gather paths to the images) 202 | target_images ={} 203 | if cfg.PYTORCH_FEATURE_NET: 204 | target_images = get_target_images(cfg.TARGET_IMAGE_DIR, 205 | cfg.NAME_TO_ID.keys()) 206 | else: 207 | raise NotImplementedError 208 | #would need to add new normalization to get_target_images, and elsewhere 209 | 210 | #make sure only targets that have ids, and have target images are chosen 211 | test_ids = check_object_ids(cfg.TEST_OBJ_IDS, cfg.ID_TO_NAME,target_images) 212 | if test_ids==-1: 213 | print('Invalid IDS!') 214 | sys.exit() 215 | 216 | testset = get_AVD_dataset(cfg.AVD_ROOT_DIR, 217 | cfg.TEST_LIST, 218 | test_ids, 219 | max_difficulty=cfg.MAX_OBJ_DIFFICULTY, 220 | fraction_of_no_box=cfg.TEST_FRACTION_OF_NO_BOX_IMAGES) 221 | 222 | #create train/test loaders, with CUSTOM COLLATE function 223 | testloader = torch.utils.data.DataLoader(testset, 224 | batch_size=1, 225 | shuffle=True, 226 | num_workers=cfg.NUM_WORKERS, 227 | collate_fn=AVD.collate) 228 | 229 | # load net 230 | print('Loading ' + cfg.FULL_MODEL_LOAD_NAME + ' ...') 231 | net = TDID(cfg) 232 | load_net(cfg.FULL_MODEL_LOAD_DIR + cfg.FULL_MODEL_LOAD_NAME, net) 233 | net.features.eval()#freeze batch norm layers? 234 | print('load model successfully!') 235 | 236 | net.cuda() 237 | net.eval() 238 | 239 | # evaluation 240 | test_net(cfg.MODEL_BASE_SAVE_NAME, net, testloader, 241 | target_images,test_ids,cfg, 242 | max_dets_per_target=cfg.MAX_DETS_PER_TARGET, 243 | score_thresh=cfg.SCORE_THRESH, 244 | output_dir=cfg.TEST_OUTPUT_DIR) 245 | 246 | 247 | 248 | 249 | -------------------------------------------------------------------------------- /test_tdid_det4class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchvision.models as models 4 | import cv2 5 | import cPickle 6 | import numpy as np 7 | import importlib 8 | import json 9 | import sys 10 | from model_defs.TDID import TDID 11 | from model_defs.nms.nms_wrapper import nms 12 | from utils import * 13 | from model_defs.anchors.bbox_transform import bbox_transform_inv, clip_boxes 14 | 15 | import active_vision_dataset_processing.data_loading.active_vision_dataset as AVD 16 | 17 | 18 | 19 | 20 | 21 | def im_classify(net, target_data,im_data, im_info, features_given=True): 22 | """ 23 | Gives the classification score for an image/target pair 24 | 25 | """ 26 | 27 | cls_prob = net(target_data, im_data, 28 | features_given=features_given, im_info=im_info) 29 | scores = cls_prob.data.cpu().numpy()[0,:,:] 30 | return scores.max() 31 | 32 | 33 | def test_net(model_name, net, dataloader, id_to_name, target_images, chosen_ids, cfg, 34 | max_dets_per_target=5, score_thresh=0.1, 35 | output_dir=None): 36 | """Test a TDID network, as a classifier, on an image dataset.""" 37 | #num images in test set 38 | num_images = len(dataloader) 39 | 40 | # timers 41 | _t = {'im_detect': Timer(), 'misc': Timer()} 42 | 43 | #pre compute features for all targets 44 | target_features_dict = {} 45 | target_data_dict = {} 46 | for id_ind,t_id in enumerate(chosen_ids): 47 | target_name = id_to_name[t_id] 48 | if target_name == 'background': 49 | continue 50 | target_data = [] 51 | for t_type,_ in enumerate(target_images[target_name]): 52 | img_ind = np.random.choice(np.arange( 53 | len(target_images[target_name][t_type]))) 54 | target_img = cv2.imread(target_images[target_name][t_type][img_ind]) 55 | target_img = normalize_image(target_img,cfg) 
56 | target_data.append(target_img) 57 | 58 | target_data = match_and_concat_images_list(target_data) 59 | target_data = np_to_variable(target_data, is_cuda=True) 60 | target_data = target_data.permute(0, 3, 1, 2) 61 | if cfg.TEST_ONE_AT_A_TIME: 62 | target_data_dict[target_name] = target_data 63 | else: 64 | target_features_dict[target_name] = net.features(target_data) 65 | 66 | 67 | 68 | num_correct = 0 69 | num_total = 0 70 | total_score = 0 71 | total_run = 0 72 | for i,batch in enumerate(dataloader): 73 | im_data= batch[0] 74 | im_info = im_data.shape[:] 75 | im_data=normalize_image(im_data,cfg) 76 | im_data = np_to_variable(im_data, is_cuda=True) 77 | im_data = im_data.unsqueeze(0) 78 | im_data = im_data.permute(0, 3, 1, 2) 79 | 80 | #get image name and index 81 | img_name = batch[1][1] 82 | img_ind = int(img_name[:-4]) 83 | 84 | gt_id = batch[1][0][0][4] 85 | 86 | max_score = 0 87 | max_id = 0 88 | tos = 0 89 | #get image features 90 | if not cfg.TEST_ONE_AT_A_TIME: 91 | img_features = net.features(im_data) 92 | 93 | for id_ind,t_id in enumerate(chosen_ids): 94 | target_name = id_to_name[t_id] 95 | if target_name == 'background': 96 | continue 97 | 98 | if cfg.TEST_ONE_AT_A_TIME: 99 | target_data = target_data_dict[target_name] 100 | _t['im_detect'].tic() 101 | score = im_classify(net, target_data, im_data, im_info, 102 | features_given=False) 103 | detect_time = _t['im_detect'].toc(average=False) 104 | else: 105 | target_features = target_features_dict[target_name] 106 | _t['im_detect'].tic() 107 | score = im_classify(net, target_features, img_features, im_info) 108 | detect_time = _t['im_detect'].toc(average=False) 109 | 110 | _t['misc'].tic() 111 | 112 | total_score += score 113 | total_run += 1 114 | if score>max_score: 115 | max_score = score 116 | max_id = t_id 117 | if t_id == gt_id: 118 | tos = score 119 | if max_id == gt_id: 120 | num_correct += 1 121 | num_total += 1 122 | 123 | 124 | print(num_correct) 125 | print(num_total) 126 | print(float(total_score)/float(total_run)) 127 | return float(num_correct)/float(num_total) 128 | 129 | 130 | 131 | 132 | 133 | 134 | if __name__ == '__main__': 135 | 136 | #load config file 137 | cfg_file = 'configGEN4UWC' #NO EXTENSION! 138 | cfg = importlib.import_module('configs.'+cfg_file) 139 | cfg = cfg.get_config() 140 | 141 | ##prepare target images (gather paths to the images) 142 | target_images ={} 143 | if cfg.PYTORCH_FEATURE_NET: 144 | target_images = get_target_images(cfg.TARGET_IMAGE_DIR,cfg.NAME_TO_ID.keys()) 145 | else: 146 | raise NotImplementedError('Must use pytorch pretrained model, others not supported') 147 | #would need to add new normalization to get_target_images, and elsewhere 148 | 149 | #make sure only targets that have ids, and have target images are chosen 150 | test_ids = check_object_ids(cfg.TEST_OBJ_IDS, cfg.ID_TO_NAME,target_images) 151 | #print test_ids 152 | if test_ids==-1: 153 | print('Invalid IDS!') 
154 | sys.exit() 155 | 156 | testset = get_AVD_dataset(cfg.DATA_BASE_DIR, 157 | cfg.TEST_LIST, 158 | test_ids, 159 | max_difficulty=6,#cfg.MAX_OBJ_DIFFICULTY, 160 | fraction_of_no_box=1)#cfg.TEST_FRACTION_OF_NO_BOX_IMAGES) 161 | 162 | #create train/test loaders, with CUSTOM COLLATE function 163 | testloader = torch.utils.data.DataLoader(testset, 164 | batch_size=1, 165 | shuffle=False, 166 | num_workers=cfg.NUM_WORKERS, 167 | collate_fn=AVD.collate) 168 | 169 | load_names = [ 170 | 'TDID_GEN4UWC_20_16_20000_230.62730_0.52458.h5', 171 | 'TDID_GEN4UWC_20_8_10000_291.84813_0.48088.h5', 172 | 'TDID_GEN4UWC_20_16_20000_230.62730_0.52458.h5', 173 | # 'TDID_GEN4UWC_17_1_2800_971.88680_0.45155.h5', 174 | # 'TDID_GEN4UWC_18_2_2300_693.64961_0.51149.h5', 175 | # 'TDID_GEN4UWC_18_3_3400_694.39169_0.44955.h5', 176 | # 'TDID_GEN4UWC_17_2_2900_863.00803_0.45954.h5', 177 | # 'TDID_GEN4UWC_15_1_1000_476.61535_0.45654.h5', 178 | # 'TDID_GEN4UWC_16_2_1700_345.35398_0.30070.h5', 179 | # 'TDID_GEN4UWC_15_2_1800_338.26621_0.38262.h5', 180 | ] 181 | for load_name in load_names: 182 | 183 | # load net 184 | #print('Loading ' + cfg.FULL_MODEL_LOAD_NAME + ' ...') 185 | net = TDID(cfg) 186 | load_net(cfg.FULL_MODEL_LOAD_DIR + load_name, net) 187 | net.features.eval()#freeze batch norm layers? 188 | print('load model successfully!') 189 | 190 | net.cuda() 191 | net.eval() 192 | 193 | # evaluation 194 | acc = test_net(cfg.MODEL_BASE_SAVE_NAME, net, testloader, cfg.ID_TO_NAME, 195 | target_images,test_ids,cfg, 196 | max_dets_per_target=cfg.MAX_DETS_PER_TARGET, 197 | score_thresh=cfg.SCORE_THRESH, 198 | output_dir=cfg.TEST_OUTPUT_DIR) 199 | 200 | print('{} {}'.format(acc, load_name)) 201 | 202 | 203 | -------------------------------------------------------------------------------- /train_tdid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | import torchvision.models as models 4 | import os 5 | import sys 6 | import importlib 7 | import numpy as np 8 | from datetime import datetime 9 | import cv2 10 | import time 11 | 12 | from model_defs.TDID import TDID 13 | from utils import * 14 | from evaluation.coco_det_eval import coco_det_eval 15 | 16 | import active_vision_dataset_processing.data_loading.active_vision_dataset as AVD 17 | 18 | # load config 19 | cfg_file = 'configAVD2' #NO FILE EXTENSION! 20 | cfg = importlib.import_module('configs.'+cfg_file) 21 | cfg = cfg.get_config() 22 | 23 | if cfg.DET4CLASS: 24 | test_net = importlib.import_module('test_tdid_det4class').test_net 25 | else: 26 | test_net = importlib.import_module('test_tdid').test_net 27 | 28 | 29 | def validate_and_save(cfg,net,valset,target_images, epoch, total_iterations): 30 | ''' 31 | Test on validation data, and save a snapshot of the model 32 | ''' 33 | valloader = torch.utils.data.DataLoader(valset, 34 | batch_size=1, 35 | shuffle=True, 36 | collate_fn=AVD.collate) 37 | model_name = cfg.MODEL_BASE_SAVE_NAME + '_{}'.format(epoch) 38 | net.eval() 39 | all_results = test_net(model_name, net, valloader, 40 | target_images, cfg.VAL_OBJ_IDS, cfg, 41 | max_dets_per_target=cfg.MAX_DETS_PER_TARGET, 42 | output_dir=cfg.TEST_OUTPUT_DIR, 43 | score_thresh=cfg.SCORE_THRESH) 44 | 45 | if len(all_results) == 0: 46 | #coco code can't handle no detections? 
47 | m_ap = 0 48 | else: 49 | m_ap = coco_det_eval(cfg.VAL_GROUND_TRUTH_BOXES, 50 | cfg.TEST_OUTPUT_DIR+model_name+'.json', 51 | catIds=cfg.VAL_OBJ_IDS) 52 | #snapshot name encodes epoch, iteration, average epoch loss, and val mAP; epoch_loss and epoch_step_cnt are module-level globals set in the training loop below 53 | save_name = os.path.join(cfg.SNAPSHOT_SAVE_DIR, 54 | (cfg.MODEL_BASE_SAVE_NAME+ 55 | '_{}_{}_{:1.5f}_{:1.5f}.h5').format(epoch, 56 | total_iterations, epoch_loss/epoch_step_cnt,m_ap)) 57 | save_net(save_name, net) 58 | print('save model: {}'.format(save_name)) 59 | net.train() 60 | net.features.eval() #freeze batch norm layers? 61 | 62 | 63 | #prepare target images (gather paths to the images) 64 | target_images ={} 65 | if cfg.PYTORCH_FEATURE_NET: 66 | target_images = get_target_images(cfg.TARGET_IMAGE_DIR,cfg.NAME_TO_ID.keys()) 67 | else: 68 | raise NotImplementedError 69 | #would need to add new normalization to get_target_images, and utils, etc 70 | 71 | #make sure only targets that have ids, and have target images are chosen 72 | train_ids = check_object_ids(cfg.TRAIN_OBJ_IDS, cfg.ID_TO_NAME,target_images) 73 | cfg.TRAIN_OBJ_IDS = train_ids 74 | val_ids = check_object_ids(cfg.VAL_OBJ_IDS, cfg.ID_TO_NAME,target_images) 75 | cfg.VAL_OBJ_IDS = val_ids 76 | if train_ids==-1 or val_ids==-1: 77 | print('Invalid IDS!') 78 | sys.exit() 79 | 80 | 81 | print('Setting up training data...') 82 | train_set = get_AVD_dataset(cfg.AVD_ROOT_DIR, 83 | cfg.TRAIN_LIST, 84 | train_ids, 85 | max_difficulty=cfg.MAX_OBJ_DIFFICULTY, 86 | fraction_of_no_box=cfg.FRACTION_OF_NO_BOX_IMAGES) 87 | valset = get_AVD_dataset(cfg.AVD_ROOT_DIR, 88 | cfg.VAL_LIST, 89 | val_ids, 90 | max_difficulty=cfg.MAX_OBJ_DIFFICULTY, 91 | fraction_of_no_box=cfg.VAL_FRACTION_OF_NO_BOX_IMAGES) 92 | 93 | trainloader = torch.utils.data.DataLoader(train_set, 94 | batch_size=cfg.BATCH_SIZE, 95 | shuffle=True, 96 | num_workers=cfg.NUM_WORKERS, 97 | collate_fn=AVD.collate) 98 | 99 | print('Loading network...') 100 | net = TDID(cfg) 101 | if cfg.LOAD_FULL_MODEL: 102 | load_net(cfg.FULL_MODEL_LOAD_DIR + cfg.FULL_MODEL_LOAD_NAME, net) 103 | else: 104 | weights_normal_init(net, dev=0.01) 105 | if cfg.USE_PRETRAINED_WEIGHTS: 106 | net.features = load_pretrained_weights(cfg.FEATURE_NET_NAME) 107 | net.features.eval()#freeze batch norm layers? 
108 | 109 | if not os.path.exists(cfg.SNAPSHOT_SAVE_DIR): 110 | os.makedirs(cfg.SNAPSHOT_SAVE_DIR) 111 | if not os.path.exists(cfg.META_SAVE_DIR): 112 | os.makedirs(cfg.META_SAVE_DIR) 113 | 114 | #put net on gpu 115 | net.cuda() 116 | net.train() 117 | 118 | #setup optimizer 119 | params = list(net.parameters()) 120 | optimizer = torch.optim.SGD(params, lr=cfg.LEARNING_RATE, 121 | momentum=cfg.MOMENTUM, 122 | weight_decay=cfg.WEIGHT_DECAY) 123 | # things to keep track of during training 124 | train_loss = 0 125 | t = Timer() 126 | t.tic() 127 | total_iterations = 1 128 | 129 | save_training_meta_data(cfg,net) 130 | 131 | print('Begin Training...') 132 | for epoch in range(1,cfg.MAX_NUM_EPOCHS+1): 133 | target_use_cnt = {} 134 | for cid in train_ids: 135 | target_use_cnt[cid] = [0,0] 136 | epoch_loss = 0 137 | epoch_step_cnt = 0 138 | for step,batch in enumerate(trainloader): 139 | total_iterations += 1 140 | if cfg.BATCH_SIZE == 1: 141 | batch[0] = [batch[0]] 142 | batch[1] = [batch[1]] 143 | if type(batch[0]) is not list or len(batch[0]) < cfg.BATCH_SIZE: 144 | continue 145 | 146 | batch_im_data = [] 147 | batch_target_data = [] 148 | batch_gt_boxes = [] 149 | for batch_ind in range(cfg.BATCH_SIZE): 150 | im_data=batch[0][batch_ind] 151 | im_data=normalize_image(im_data,cfg) 152 | gt_boxes = np.asarray(batch[1][batch_ind][0],dtype=np.float32) 153 | 154 | if np.random.rand() < cfg.RESIZE_IMG: 155 | im_data = cv2.resize(im_data,(0,0),fx=cfg.RESIZE_IMG_FACTOR, 156 | fy=cfg.RESIZE_IMG_FACTOR) 157 | if gt_boxes.shape[0] >0: 158 | gt_boxes[:,:4] *= cfg.RESIZE_IMG_FACTOR 159 | 160 | #if there are no boxes for this image, add a dummy background box 161 | if gt_boxes.shape[0] == 0: 162 | gt_boxes = np.asarray([[0,0,1,1,0]]) 163 | 164 | objects_present = gt_boxes[:,4] 165 | objects_present = objects_present[np.where(objects_present!=0)[0]] 166 | not_present = np.asarray([ind for ind in train_ids 167 | if ind not in objects_present and 168 | ind != 0]) 169 | 170 | #pick a target 171 | if ((np.random.rand() < cfg.CHOOSE_PRESENT_TARGET or 172 | not_present.shape[0]==0) and 173 | objects_present.shape[0]!=0): 174 | target_ind = int(np.random.choice(objects_present)) 175 | gt_boxes = gt_boxes[np.where(gt_boxes[:,4]==target_ind)[0],:-1] 176 | gt_boxes[0,4] = 1 177 | target_use_cnt[target_ind][0] += 1 178 | else:#the target is not in the image, give a dummy background box 179 | target_ind = int(np.random.choice(not_present)) 180 | gt_boxes = np.asarray([[0,0,1,1,0]]) 181 | target_use_cnt[target_ind][1] += 1 182 | 183 | #get target images 184 | target_name = cfg.ID_TO_NAME[target_ind] 185 | target_data = [] 186 | for t_type,_ in enumerate(target_images[target_name]): 187 | img_ind = np.random.choice(np.arange( 188 | len(target_images[target_name][t_type]))) 189 | target_img = cv2.imread(target_images[target_name][t_type][img_ind]) 190 | if np.random.rand() < cfg.AUGMENT_TARGET_IMAGES: 191 | target_img = augment_image(target_img, 192 | do_illum=cfg.AUGMENT_TARGET_ILLUMINATION) 193 | target_img = normalize_image(target_img,cfg) 194 | batch_target_data.append(target_img) 195 | 196 | batch_im_data.append(im_data) 197 | batch_gt_boxes.extend(gt_boxes) 198 | 199 | #prep data for input to network 200 | target_data = match_and_concat_images_list(batch_target_data, 201 | min_size=cfg.MIN_TARGET_SIZE) 202 | im_data = match_and_concat_images_list(batch_im_data) 203 | gt_boxes = np.asarray(batch_gt_boxes) 204 | im_info = im_data.shape[1:] 205 | im_data = np_to_variable(im_data, is_cuda=True) 206 | im_data = 
im_data.permute(0, 3, 1, 2) 207 | target_data = np_to_variable(target_data, is_cuda=True) 208 | target_data = target_data.permute(0, 3, 1, 2) 209 | 210 | # forward 211 | net(target_data, im_data, im_info, gt_boxes=gt_boxes) 212 | # if cfg.USE_ROI_LOSS_ONLY: 213 | # loss = net.roi_cross_entropy_loss 214 | # else: 215 | # loss = net.loss 216 | loss = net.loss 217 | 218 | train_loss += loss.data[0] 219 | epoch_step_cnt += 1 220 | epoch_loss += loss.data[0] 221 | 222 | # backprop and parameter update 223 | optimizer.zero_grad() 224 | loss.backward() 225 | clip_gradient(net, 10.) 226 | optimizer.step() 227 | 228 | #print out training info 229 | if step % cfg.DISPLAY_INTERVAL == 0: 230 | duration = t.toc(average=False) 231 | fps = (step+1.0) / duration 232 | 233 | log_text = 'step %d, epoch_avg_loss: %.4f, fps: %.2f (%.2fs per batch) ' \ 234 | 'epoch:%d loss: %.4f tot_avg_loss: %.4f %s' % ( 235 | step, epoch_loss/epoch_step_cnt, fps, 1./fps, 236 | epoch, loss.data[0],train_loss/(step+1), cfg.MODEL_BASE_SAVE_NAME) 237 | print(log_text) 238 | print(target_use_cnt) 239 | 240 | if (not cfg.SAVE_BY_EPOCH) and total_iterations % cfg.SAVE_FREQ==0: 241 | validate_and_save(cfg,net,valset,target_images,epoch,total_iterations) 242 | 243 | ###################################################### 244 | #epoch over 245 | if cfg.SAVE_BY_EPOCH and epoch % cfg.SAVE_FREQ == 0: 246 | validate_and_save(cfg,net,valset,target_images, epoch, total_iterations) 247 | 248 | --------------------------------------------------------------------------------
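For reference, a minimal sketch of how the detection JSON written by `test_net` can be scored against the converted COCO-format ground truth using the bundled pycocotools (built in step 4 of the README). The two file paths are placeholders for your own `VAL_GROUND_TRUTH_BOXES` and the JSON saved under `TEST_OUTPUT_DIR`; this is only an illustration under those assumptions, not necessarily the exact logic of `evaluation/coco_det_eval.py`.

```python
# Sketch: score a TDID detection JSON with the standard COCO bbox metrics.
# Assumes the cocoapi built in evaluation/cocoapi/PythonAPI is on your PYTHONPATH.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

gt_file = 'Data/val_ground_truth_boxes.json'   # placeholder for VAL_GROUND_TRUTH_BOXES
det_file = 'Data/results/TDID_AVD2_1.json'     # placeholder for the JSON written by test_net

coco_gt = COCO(gt_file)                # ground truth annotations in COCO format
coco_dt = coco_gt.loadRes(det_file)    # detections in COCO results format

coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.params.catIds = sorted(coco_gt.getCatIds())  # restrict to the evaluated object ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()                  # prints the standard AP/AR table
m_ap = coco_eval.stats[0]              # AP averaged over IoU 0.5:0.95
print('mAP: {:.5f}'.format(m_ap))
```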