├── .gitignore
├── .ipynb_checkpoints
│   └── train_net-checkpoint.py
├── LICENSE
├── README.md
├── config_files
│   ├── .ipynb_checkpoints
│   │   ├── oicr_alexnet-checkpoint.yaml
│   │   ├── oicr_alexnet_dilated-checkpoint.yaml
│   │   ├── oicr_vgg-checkpoint.yaml
│   │   ├── oicr_vgg_dilated-checkpoint.yaml
│   │   ├── oicr_vggm_dilated-checkpoint.yaml
│   │   ├── pcl_vgg_dilated-checkpoint.yaml
│   │   ├── wsddn_alexnet-checkpoint.yaml
│   │   ├── wsddn_vgg-checkpoint.yaml
│   │   └── wsddn_vggm-checkpoint.yaml
│   ├── oicr_alexnet.yaml
│   ├── oicr_alexnet_dilated.yaml
│   ├── oicr_vgg.yaml
│   ├── oicr_vgg_dilated.yaml
│   ├── oicr_vggm_dilated.yaml
│   ├── pcl_vgg_dilated.yaml
│   ├── wsddn_alexnet.yaml
│   ├── wsddn_vgg.yaml
│   └── wsddn_vggm.yaml
├── train_net.py
└── wsod
    ├── __init__.py
    ├── __pycache__
    │   └── __init__.cpython-37.pyc
    ├── config
    │   ├── .ipynb_checkpoints
    │   │   ├── __init__-checkpoint.py
    │   │   └── config-checkpoint.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── config.cpython-37.pyc
    │   └── config.py
    ├── data
    │   ├── .ipynb_checkpoints
    │   │   └── voc-checkpoint.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── voc.cpython-37.pyc
    │   └── voc.py
    ├── models
    │   ├── .ipynb_checkpoints
    │   │   ├── __init__-checkpoint.py
    │   │   ├── cmil-checkpoint.py
    │   │   ├── heads-checkpoint.py
    │   │   ├── losses-checkpoint.py
    │   │   ├── melm-checkpoint.py
    │   │   ├── models-checkpoint.py
    │   │   └── utils-checkpoint.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── heads.cpython-37.pyc
    │   │   ├── losses.cpython-37.pyc
    │   │   ├── models.cpython-37.pyc
    │   │   └── utils.cpython-37.pyc
    │   ├── backbones
    │   │   ├── .ipynb_checkpoints
    │   │   │   ├── __init__-checkpoint.py
    │   │   │   └── vggm-checkpoint.py
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-37.pyc
    │   │   │   └── vggm.cpython-37.pyc
    │   │   └── vggm.py
    │   ├── cmil.py
    │   ├── heads.py
    │   ├── losses.py
    │   ├── melm.py
    │   ├── models.py
    │   └── utils.py
    └── optim
        ├── .ipynb_checkpoints
        │   ├── __init__-checkpoint.py
        │   └── caffesgd-checkpoint.py
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-37.pyc
        │   └── caffesgd.cpython-37.pyc
        └── caffesgd.py

/.gitignore:
--------------------------------------------------------------------------------
1 | datasets
2 | anno
3 | outputs
4 | wsod/models/backbones/vggm1024-caffe.pt
5 |

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Brad Ezard
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weakly Supervised Object Detection
2 | My implementations of a variety of algorithms and models for the weakly supervised object detection problem, based on facebookresearch/detectron2.
3 |
4 | ## Requirements
5 | - Python 3
6 | - Latest PyTorch (1.3.1 as of the last README update)
7 | - Latest torchvision (0.4.2 as of the last README update)
8 | - Detectron2
9 |
10 | ## Models implemented in this repository
11 | Below are the models currently implemented in this repository, as well as their performance (mAP on the VOC 2007 test set) using VGG16 as a backbone.
12 | - [Weakly Supervised Deep Detection Networks (Bilen et al., 2015)](https://arxiv.org/abs/1511.02853) (26.17, no spatial regularisation)
13 | - [Multiple Instance Detection Network with Online Instance Classifier Refinement (Tang et al., 2017)](https://arxiv.org/abs/1704.00138) (40.83)
14 | - [PCL: Proposal Cluster Learning for Weakly Supervised Object Detection (Tang et al., 2018)](https://arxiv.org/abs/1807.03342) (44.21)
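15 |
16 | ## Training and evaluation
17 | `train_net.py` wraps detectron2's standard argument parser, so runs follow the usual detectron2 pattern. A minimal sketch of a training run (assuming the precomputed proposal files referenced by the configs, e.g. `anno/proposals_trainval.pkl`, are already in place):
18 |
19 | ```
20 | python train_net.py --config-file config_files/oicr_vgg_dilated.yaml --num-gpus 1
21 | ```
22 |
23 | Evaluation reuses the same script; the weights path below assumes detectron2's default checkpoint naming under the config's `OUTPUT_DIR`:
24 |
25 | ```
26 | python train_net.py --config-file config_files/oicr_vgg_dilated.yaml --eval-only MODEL.WEIGHTS outputs/oicr_vgg_dilated/model_final.pth
27 | ```
28 |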
--------------------------------------------------------------------------------
/config_files/oicr_alexnet.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/oicr_alexnet"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "alexnet"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 0
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 6
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |
33 |   REFINEMENT_HEAD:
34 |     K: 3
35 |
36 |   LOSS_FN: "oicr_loss"
37 |
38 | DATALOADER:
39 |   SAMPLER_TRAIN: "TrainingSampler"
40 |   NUM_WORKERS: 4
41 |   ASPECT_RATIO_GROUPING: False
42 |
43 | DATASETS:
44 |   TRAIN: ("voc_2007_trainval",)
45 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
46 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
47 |
48 |   TEST: ("voc_2007_test",)
49 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
50 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
51 |
52 | INPUT:
53 |   FORMAT: "RGB"
54 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
55 |   MAX_SIZE_TRAIN: 2000
56 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
57 |   MIN_SIZE_TEST: 0
58 |   MAX_SIZE_TEST: 0
59 |
60 | TEST:
61 |   EVAL_PERIOD: 0
62 |   DETECTIONS_PER_IMAGE: 100
63 |   AUG:
64 |     ENABLED: True
65 |     MIN_SIZES: (480, 576, 688, 864, 1200)
66 |     MAX_SIZE: 2000
67 |     FLIP: True
68 |
69 | SOLVER:
70 |   IMS_PER_BATCH: 1
71 |   ITER_SIZE: 2
72 |   BASE_LR: 1e-3
73 |   WEIGHT_DECAY: 5e-4
74 |   BIAS_LR_FACTOR: 2.0
75 |   WEIGHT_DECAY_BIAS: 0.0
76 |   REFINEMENT_LR_FACTOR: 10.0
77 |   MOMENTUM: 0.9
78 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
79 |   STEPS: (40000,)
80 |   GAMMA: 0.1
81 |   WARMUP_FACTOR: 1.
82 |   WARMUP_ITERS: 0
83 |   MAX_ITER: 50000
84 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/oicr_alexnet_dilated.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/oicr_alexnet_dilated"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "alexnet_dilated"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 4
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 6
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |
33 |   REFINEMENT_HEAD:
34 |     K: 3
35 |
36 |   LOSS_FN: "oicr_loss"
37 |
38 | DATALOADER:
39 |   SAMPLER_TRAIN: "TrainingSampler"
40 |   NUM_WORKERS: 4
41 |   ASPECT_RATIO_GROUPING: False
42 |
43 | DATASETS:
44 |   TRAIN: ("voc_2007_trainval",)
45 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
46 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
47 |
48 |   TEST: ("voc_2007_test",)
49 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
50 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
51 |
52 | INPUT:
53 |   FORMAT: "RGB"
54 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
55 |   MAX_SIZE_TRAIN: 2000
56 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
57 |   MIN_SIZE_TEST: 0
58 |   MAX_SIZE_TEST: 0
59 |
60 | TEST:
61 |   EVAL_PERIOD: 0
62 |   DETECTIONS_PER_IMAGE: 100
63 |   AUG:
64 |     ENABLED: True
65 |     MIN_SIZES: (480, 576, 688, 864, 1200)
66 |     MAX_SIZE: 2000
67 |     FLIP: True
68 |
69 | SOLVER:
70 |   MIXED_PRECISION: True
71 |   TYPE: "CaffeSGD"
72 |   IMS_PER_BATCH: 1
73 |   ITER_SIZE: 2
74 |   BASE_LR: 1e-3
75 |   WEIGHT_DECAY: 5e-4
76 |   BIAS_LR_FACTOR: 2.0
77 |   WEIGHT_DECAY_BIAS: 0.0
78 |   REFINEMENT_LR_FACTOR: 10.0
79 |   MOMENTUM: 0.9
80 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
81 |   STEPS: (40000,)
82 |   GAMMA: 0.1
83 |   WARMUP_FACTOR: 1.
84 |   WARMUP_ITERS: 0
85 |   MAX_ITER: 50000
86 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/oicr_vgg.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/oicr_vgg"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "vgg16"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 0
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 7
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 2.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 3
36 |
37 |   PREDICTION_LAYERS: (0,)
38 |
39 |   LOSS_FN: "oicr_loss"
40 |
41 | DATALOADER:
42 |   SAMPLER_TRAIN: "TrainingSampler"
43 |   NUM_WORKERS: 4
44 |   ASPECT_RATIO_GROUPING: False
45 |
46 | DATASETS:
47 |   TRAIN: ("voc_2007_trainval",)
48 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
49 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
50 |
51 |   TEST: ("voc_2007_test",)
52 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
53 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
54 |
55 | INPUT:
56 |   FORMAT: "RGB"
57 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
58 |   MAX_SIZE_TRAIN: 2000
59 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
60 |   MIN_SIZE_TEST: 0
61 |   MAX_SIZE_TEST: 0
62 |
63 | TEST:
64 |   EVAL_PERIOD: 0
65 |   DETECTIONS_PER_IMAGE: 100
66 |   AUG:
67 |     ENABLED: True
68 |     MIN_SIZES: (480, 576, 688, 864, 1200)
69 |     MAX_SIZE: 2000
70 |     FLIP: True
71 |
72 | SOLVER:
73 |   IMS_PER_BATCH: 1
74 |   ITER_SIZE: 2
75 |   BASE_LR: 1e-3
76 |   WEIGHT_DECAY: 5e-4
77 |   BIAS_LR_FACTOR: 2.0
78 |   WEIGHT_DECAY_BIAS: 0.0
79 |   REFINEMENT_LR_FACTOR: 10.0
80 |   MOMENTUM: 0.9
81 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
82 |   STEPS: (40000,)
83 |   GAMMA: 0.1
84 |   WARMUP_FACTOR: 1.
85 |   WARMUP_ITERS: 0
86 |   MAX_ITER: 50000
87 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/oicr_vgg_dilated.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/oicr_vgg_dilated"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "vgg16_dilated"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 4
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 7
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 1.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 3
36 |
37 |   LOSS_FN: "oicr_loss"
38 |
39 | DATALOADER:
40 |   SAMPLER_TRAIN: "TrainingSampler"
41 |   NUM_WORKERS: 4
42 |   ASPECT_RATIO_GROUPING: False
43 |
44 | DATASETS:
45 |   TRAIN: ("voc_2007_trainval",)
46 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
47 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
48 |
49 |   TEST: ("voc_2007_test",)
50 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
51 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
52 |
53 | INPUT:
54 |   FORMAT: "RGB"
55 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
56 |   MAX_SIZE_TRAIN: 2000
57 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
58 |   MIN_SIZE_TEST: 0
59 |   MAX_SIZE_TEST: 0
60 |
61 | TEST:
62 |   EVAL_PERIOD: 0
63 |   DETECTIONS_PER_IMAGE: 100
64 |   AUG:
65 |     ENABLED: True
66 |     MIN_SIZES: (480, 576, 688, 864, 1200)
67 |     MAX_SIZE: 2000
68 |     FLIP: True
69 |
70 | SOLVER:
71 |   #MIXED_PRECISION: True
72 |   IMS_PER_BATCH: 1
73 |   ITER_SIZE: 2
74 |   BASE_LR: 1e-3
75 |   WEIGHT_DECAY: 5e-4
76 |   BIAS_LR_FACTOR: 2.0
77 |   WEIGHT_DECAY_BIAS: 0.0
78 |   REFINEMENT_LR_FACTOR: 10.0
79 |   MOMENTUM: 0.9
80 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
81 |   STEPS: (40000,)
82 |   GAMMA: 0.1
83 |   WARMUP_FACTOR: 1.
84 |   WARMUP_ITERS: 0
85 |   MAX_ITER: 50000
86 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/oicr_vggm_dilated.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/oicr_vggm_dilated"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (103.530, 116.280, 123.675) #BGR
13 |   PIXEL_STD: (1.0, 1.0, 1.0) # Caffe doesn't normalize std
14 |
15 |   BACKBONE:
16 |     NAME: "vggm_dilated"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 4
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 6
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 2.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 3
36 |
37 |   PREDICTION_LAYERS: (0,)
38 |
39 |   LOSS_FN: "oicr_loss"
40 |
41 | DATALOADER:
42 |   SAMPLER_TRAIN: "TrainingSampler"
43 |   NUM_WORKERS: 4
44 |   ASPECT_RATIO_GROUPING: False
45 |
46 | DATASETS:
47 |   TRAIN: ("voc_2007_trainval",)
48 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
49 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
50 |
51 |   TEST: ("voc_2007_test",)
52 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
53 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
54 |
55 | INPUT:
56 |   FORMAT: "BGR"
57 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
58 |   MAX_SIZE_TRAIN: 2000
59 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
60 |   MIN_SIZE_TEST: 0
61 |   MAX_SIZE_TEST: 0
62 |
63 | TEST:
64 |   EVAL_PERIOD: 0
65 |   DETECTIONS_PER_IMAGE: 100
66 |   AUG:
67 |     ENABLED: True
68 |     MIN_SIZES: (480, 576, 688, 864, 1200)
69 |     MAX_SIZE: 2000
70 |     FLIP: True
71 |
72 | SOLVER:
73 |   IMS_PER_BATCH: 1
74 |   ITER_SIZE: 2
75 |   BASE_LR: 1e-3
76 |   WEIGHT_DECAY: 5e-4
77 |   BIAS_LR_FACTOR: 2.0
78 |   WEIGHT_DECAY_BIAS: 0.0
79 |   REFINEMENT_LR_FACTOR: 10.0
80 |   MOMENTUM: 0.9
81 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
82 |   STEPS: (40000,)
83 |   GAMMA: 0.1
84 |   WARMUP_FACTOR: 1.
85 |   WARMUP_ITERS: 0
86 |   MAX_ITER: 50000
87 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/pcl_vgg_dilated.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/pcl_vgg_dilated"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "vgg16_dilated"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 4
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 7
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 1.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 3
36 |
37 |   PREDICTION_LAYERS: (0,)
38 |
39 |   LOSS_FN: "pcl_loss"
40 |
41 | DATALOADER:
42 |   SAMPLER_TRAIN: "TrainingSampler"
43 |   NUM_WORKERS: 4
44 |   ASPECT_RATIO_GROUPING: False
45 |
46 | DATASETS:
47 |   TRAIN: ("voc_2007_trainval",)
48 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
49 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
50 |
51 |   TEST: ("voc_2007_test",)
52 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
53 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
54 |
55 | INPUT:
56 |   FORMAT: "RGB"
57 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
58 |   MAX_SIZE_TRAIN: 2000
59 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
60 |   MIN_SIZE_TEST: 0
61 |   MAX_SIZE_TEST: 0
62 |
63 | TEST:
64 |   EVAL_PERIOD: 0
65 |   DETECTIONS_PER_IMAGE: 1000
66 |   AUG:
67 |     ENABLED: True
68 |     MIN_SIZES: (480, 576, 688, 864, 1200)
69 |     MAX_SIZE: 2000
70 |     FLIP: True
71 |
72 | SOLVER:
73 |   IMS_PER_BATCH: 1
74 |   ITER_SIZE: 2
75 |   BASE_LR: 5e-4
76 |   WEIGHT_DECAY: 5e-4
77 |   BIAS_LR_FACTOR: 2.0
78 |   WEIGHT_DECAY_BIAS: 0.0
79 |   REFINEMENT_LR_FACTOR: 1.0
80 |   MOMENTUM: 0.9
81 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
82 |   STEPS: (35000,)
83 |   GAMMA: 0.1
84 |   WARMUP_FACTOR: 1.
85 |   WARMUP_ITERS: 0
86 |   MAX_ITER: 50000
87 |   CHECKPOINT_PERIOD: 5000
88 |

--------------------------------------------------------------------------------
/config_files/wsddn_alexnet.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/wsddn_alexnet"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "alexnet"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 0
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 6
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 2.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 0
36 |
37 |   PREDICTION_LAYERS: (0,)
38 |
39 |   LOSS_FN: "wsddn_loss"
40 |
41 | DATALOADER:
42 |   SAMPLER_TRAIN: "TrainingSampler"
43 |   NUM_WORKERS: 4
44 |   ASPECT_RATIO_GROUPING: False
45 |
46 | DATASETS:
47 |   TRAIN: ("voc_2007_trainval",)
48 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
49 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
50 |
51 |   TEST: ("voc_2007_test",)
52 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
53 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
54 |
55 | INPUT:
56 |   FORMAT: "RGB"
57 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
58 |   MAX_SIZE_TRAIN: 2000
59 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
60 |   MIN_SIZE_TEST: 0
61 |   MAX_SIZE_TEST: 0
62 |
63 | TEST:
64 |   EVAL_PERIOD: 0
65 |   DETECTIONS_PER_IMAGE: 100
66 |   AUG:
67 |     ENABLED: True
68 |     MIN_SIZES: (480, 576, 688, 864, 1200)
69 |     MAX_SIZE: 2000
70 |     FLIP: True
71 |
72 | SOLVER:
73 |   IMS_PER_BATCH: 1
74 |   ITER_SIZE: 1
75 |   BASE_LR: 1e-5
76 |   WEIGHT_DECAY: 5e-4
77 |   BIAS_LR_FACTOR: 2.0
78 |   WEIGHT_DECAY_BIAS: 0.0
79 |   MOMENTUM: 0.9
80 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
81 |   STEPS: (50110,)
82 |   GAMMA: 0.1
83 |   WARMUP_FACTOR: 1.
84 |   WARMUP_ITERS: 0
85 |   MAX_ITER: 100220
86 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/wsddn_vgg.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/wsddn_vgg"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (123.675, 116.280, 103.530) #RGB
13 |   PIXEL_STD: (58.395, 57.120, 57.375) #RGB
14 |
15 |   BACKBONE:
16 |     NAME: "vgg16"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 0
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 7
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 2.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 0
36 |
37 |   PREDICTION_LAYERS: (0,)
38 |
39 |   LOSS_FN: "wsddn_loss"
40 |
41 | DATALOADER:
42 |   SAMPLER_TRAIN: "TrainingSampler"
43 |   NUM_WORKERS: 4
44 |   ASPECT_RATIO_GROUPING: False
45 |
46 | DATASETS:
47 |   TRAIN: ("voc_2007_trainval",)
48 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
49 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
50 |
51 |   TEST: ("voc_2007_test",)
52 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
53 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
54 |
55 | INPUT:
56 |   FORMAT: "RGB"
57 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
58 |   MAX_SIZE_TRAIN: 2000
59 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
60 |   MIN_SIZE_TEST: 0
61 |   MAX_SIZE_TEST: 0
62 |
63 | TEST:
64 |   EVAL_PERIOD: 0
65 |   DETECTIONS_PER_IMAGE: 100
66 |   AUG:
67 |     ENABLED: True
68 |     MIN_SIZES: (480, 576, 688, 864, 1200)
69 |     MAX_SIZE: 2000
70 |     FLIP: True
71 |
72 | SOLVER:
73 |   IMS_PER_BATCH: 1
74 |   ITER_SIZE: 1
75 |   BASE_LR: 1e-5
76 |   WEIGHT_DECAY: 5e-4
77 |   BIAS_LR_FACTOR: 2.0
78 |   WEIGHT_DECAY_BIAS: 0.0
79 |   MOMENTUM: 0.9
80 |   LR_SCHEDULER_NAME: "WarmupMultiStepLR"
81 |   STEPS: (50110,)
82 |   GAMMA: 0.1
83 |   WARMUP_FACTOR: 1.
84 |   WARMUP_ITERS: 0
85 |   MAX_ITER: 100220
86 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/config_files/wsddn_vggm.yaml:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR: "outputs/wsddn_vggm"
2 |
3 | MODEL:
4 |   DEVICE: "cuda"
5 |
6 |   META_ARCHITECTURE: "GeneralisedMIL"
7 |
8 |   LOAD_PROPOSALS: True
9 |   PROPOSAL_GENERATOR:
10 |     MIN_SIZE: 20
11 |
12 |   PIXEL_MEAN: (103.530, 116.280, 123.675) #BGR
13 |   PIXEL_STD: (1.0, 1.0, 1.0) # Caffe doesn't normalize std
14 |
15 |   BACKBONE:
16 |     NAME: "vggm"
17 |     WEIGHTS: "imagenet"
18 |     FREEZE_CONVS: 0
19 |
20 |   ROI_HEADS:
21 |     NUM_CLASSES: 20
22 |     NMS_THRESH_TEST: 0.3
23 |     SCORE_THRESH_TEST: 1e-3
24 |
25 |   ROI_BOX_HEAD:
26 |     POOLER_TYPE: "ROIPool"
27 |     POOLER_RESOLUTION: 6
28 |
29 |   MIDN_HEAD:
30 |     NUM_CLASSIFIER: 1
31 |     NUM_DETECTOR: 1
32 |     DETECTOR_TEMP: 2.0
33 |
34 |   REFINEMENT_HEAD:
35 |     K: 0
36 |
37 |   LOSS_FN: "wsddn_loss"
38 |
39 | DATALOADER:
40 |   SAMPLER_TRAIN: "TrainingSampler"
41 |   NUM_WORKERS: 4
42 |
43 | DATASETS:
44 |   TRAIN: ("voc_2007_trainval",)
45 |   PROPOSAL_FILES_TRAIN: ("anno/proposals_trainval.pkl",)
46 |   PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 5000
47 |
48 |   TEST: ("voc_2007_test",)
49 |   PROPOSAL_FILES_TEST: ('anno/proposals_test.pkl',)
50 |   PRECOMPUTED_PROPOSAL_TOPK_TEST: 5000
51 |
52 | INPUT:
53 |   FORMAT: "BGR"
54 |   MIN_SIZE_TRAIN: (480, 576, 688, 864, 1200)
55 |   MAX_SIZE_TRAIN: 2000
56 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
57 |   MIN_SIZE_TEST: 0
58 |   MAX_SIZE_TEST: 0
59 |
60 | TEST:
61 |   EVAL_PERIOD: 0
62 |   DETECTIONS_PER_IMAGE: 100
63 |   AUG:
64 |     ENABLED: True
65 |     MIN_SIZES: (480, 576, 688, 864, 1200)
66 |     MAX_SIZE: 2000
67 |     FLIP: True
68 |
69 | SOLVER:
70 |   # Mixed Precision
71 |   MIXED_PRECISION: True
72 |   LOSS_SCALE: 0.0
73 |
74 |   # Minibatch settings
75 |   IMS_PER_BATCH: 1
76 |   ITER_SIZE: 1
77 |
78 |   # Optimizer settings
79 |   TYPE: 'caffesgd'
80 |   BASE_LR: 1e-5
81 |   WEIGHT_DECAY: 5e-4
82 |   BIAS_LR_FACTOR: 2.0
83 |   WEIGHT_DECAY_BIAS: 0.0
84 |   MOMENTUM: 0.9
85 |
86 |   # Scheduler
87 |   LR_SCHEDULER_NAME: "CaffeLRScheduler"
88 |   STEPS: (50110,)
89 |   GAMMA: 0.1
90 |   WARMUP_FACTOR: 1.
91 |   WARMUP_ITERS: 0
92 |
93 |   # Other
94 |   MAX_ITER: 100220
95 |   CHECKPOINT_PERIOD: 5000

--------------------------------------------------------------------------------
/train_net.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Detectron2 training script with a plain training loop.
4 |
5 | This script reads a given config file and runs the training or evaluation.
6 | It is an entry point that is able to train standard models in detectron2.
7 |
8 | In order to let one script support training of many models,
9 | this script contains logic that is specific to these built-in models and therefore
10 | may not be suitable for your own project.
11 | For example, your research project perhaps only needs a single "evaluator".
12 |
13 | Therefore, we recommend using detectron2 as a library and taking
14 | this file as an example of how to use the library.
15 | You may want to write your own script with your datasets and other customizations.
16 |
17 | Compared to "train_net.py", this script supports fewer features, and also
18 | includes fewer abstractions.
19 | """
20 | from typing import Any, Dict, List
21 | import logging
22 | import os
23 | import time
24 | from collections import OrderedDict
25 | import torch
26 | from torch.nn.parallel import DistributedDataParallel
27 |
28 | import detectron2.utils.comm as comm
29 | from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer
30 | from detectron2.data import (
31 |     MetadataCatalog,
32 |     build_detection_test_loader,
33 |     build_detection_train_loader,
34 | )
35 | from detectron2.engine import default_argument_parser, default_setup, launch
36 | from detectron2.evaluation import (
37 |     CityscapesEvaluator,
38 |     COCOEvaluator,
39 |     COCOPanopticEvaluator,
40 |     DatasetEvaluators,
41 |     LVISEvaluator,
42 |     #PascalVOCDetectionEvaluator,
43 |     SemSegEvaluator,
44 |     inference_on_dataset,
45 |     print_csv_format,
46 | )
47 | from detectron2.modeling import build_model
48 | from detectron2.solver.lr_scheduler import WarmupMultiStepLR, WarmupCosineLR
49 | from detectron2.utils.events import (
50 |     CommonMetricPrinter,
51 |     EventStorage,
52 |     JSONWriter,
53 |     TensorboardXWriter,
54 | )
55 |
56 | logger = logging.getLogger("detectron2")
57 |
58 | from wsod.config import get_cfg
59 | from wsod.data.voc import PascalVOCDetectionEvaluator
60 | from wsod import models
61 |
62 |
63 | class TrainTimer():
64 |     def __init__(self):
65 |         self.reset()
66 |
67 |     def reset(self):
68 |         self.start = time.perf_counter()
69 |
70 |     def check(self):
71 |         return time.perf_counter() - self.start
72 |
73 |     def tic(self):
74 |         tm = self.check()
75 |         self.reset()
76 |         return tm
77 |
78 | def build_optimizer(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
79 |     """
80 |     Build an optimizer from config.
81 |     """
82 |     params: List[Dict[str, Any]] = []
83 |     for key, value in model.named_parameters():
84 |         if not value.requires_grad:
85 |             print(f'{key} requires no grad')
86 |             continue
87 |         lr = cfg.SOLVER.BASE_LR
88 |         weight_decay = cfg.SOLVER.WEIGHT_DECAY
89 |         if key.endswith("norm.weight") or key.endswith("norm.bias"):
90 |             weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
91 |         elif key.endswith(".bias"):
92 |             # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
93 |             # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
94 |             # hyperparameters are by default exactly the same as for regular
95 |             # weights.
96 |             lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
97 |             weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
98 |         if 'refinement' in key:
99 |             lr = lr * cfg.SOLVER.REFINEMENT_LR_FACTOR
100 |         params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
101 |         print(f'{key} | lr: {lr:6.04f}, weight_decay: {weight_decay:6.04f}')
102 |
103 |     solver_type = cfg.SOLVER.TYPE.lower()
104 |     if solver_type == 'sgd':
105 |         optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM)
106 |     elif solver_type == 'caffesgd':
107 |         from wsod.optim.caffesgd import CaffeSGD
108 |         optimizer = CaffeSGD(params, lr, momentum=cfg.SOLVER.MOMENTUM)
109 |     return optimizer
110 |
111 |
112 | def build_lr_scheduler(cfg, optimizer):
113 |     """
114 |     Build an LR scheduler from config.
115 | """ 116 | name = cfg.SOLVER.LR_SCHEDULER_NAME 117 | if name == "WarmupMultiStepLR": 118 | return WarmupMultiStepLR( 119 | optimizer, 120 | cfg.SOLVER.STEPS, 121 | cfg.SOLVER.GAMMA, 122 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 123 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 124 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 125 | ) 126 | elif name == "WarmupCosineLR": 127 | return WarmupCosineLR( 128 | optimizer, 129 | cfg.SOLVER.MAX_ITER, 130 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 131 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 132 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 133 | ) 134 | elif name == "CaffeLRScheduler": 135 | from optim.caffesgd import CaffeLRScheduler 136 | return CaffeLRScheduler( 137 | optimizer, 138 | cfg.SOLVER.STEPS, 139 | cfg.SOLVER.GAMMA, 140 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 141 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 142 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 143 | ) 144 | else: 145 | raise ValueError("Unknown LR scheduler: {}".format(name)) 146 | 147 | 148 | 149 | def get_evaluator(cfg, dataset_name, output_folder=None): 150 | """ 151 | Create evaluator(s) for a given dataset. 152 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 153 | For your own dataset, you can simply create an evaluator manually in your 154 | script and do not have to worry about the hacky if-else logic here. 155 | """ 156 | if output_folder is None: 157 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 158 | evaluator_list = [] 159 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 160 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 161 | evaluator_list.append( 162 | SemSegEvaluator( 163 | dataset_name, 164 | distributed=True, 165 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 166 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 167 | output_dir=output_folder, 168 | ) 169 | ) 170 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 171 | evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) 172 | if evaluator_type == "coco_panoptic_seg": 173 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 174 | if evaluator_type == "cityscapes": 175 | assert ( 176 | torch.cuda.device_count() >= comm.get_rank() 177 | ), "CityscapesEvaluator currently do not work with multiple machines." 
178 | return CityscapesEvaluator(dataset_name) 179 | if evaluator_type == "pascal_voc": 180 | return PascalVOCDetectionEvaluator(dataset_name) 181 | if evaluator_type == "lvis": 182 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 183 | if len(evaluator_list) == 0: 184 | raise NotImplementedError( 185 | "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type) 186 | ) 187 | if len(evaluator_list) == 1: 188 | return evaluator_list[0] 189 | return DatasetEvaluators(evaluator_list) 190 | 191 | 192 | def do_test(cfg, model): 193 | results = OrderedDict() 194 | for dataset_name in cfg.DATASETS.TEST: 195 | data_loader = build_detection_test_loader(cfg, dataset_name) 196 | evaluator = get_evaluator( 197 | cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 198 | ) 199 | results_i = inference_on_dataset(model, data_loader, evaluator) 200 | results[dataset_name] = results_i 201 | if comm.is_main_process(): 202 | logger.info("Evaluation results for {} in csv format:".format(dataset_name)) 203 | print_csv_format(results_i) 204 | if len(results) == 1: 205 | results = list(results.values())[0] 206 | return results 207 | 208 | 209 | def do_train(cfg, model, resume=False): 210 | model.train() 211 | optimizer = build_optimizer(cfg, model) 212 | 213 | if cfg.SOLVER.MIXED_PRECISION: 214 | from apex import amp 215 | scale = None if cfg.SOLVER.LOSS_SCALE < 1e-12 else cfg.SOLVER.LOSS_SCALE 216 | model, optimizer = amp.initialize(model, optimizer, loss_scale=scale) 217 | 218 | def backward_with_scale(losses): 219 | with amp.scale_loss(losses, optimizer) as scaled_loss: 220 | scaled_loss.backward(retain_graph=True) 221 | else: 222 | def backward_with_scale(losses): 223 | losses.backward(retain_graph=True) 224 | 225 | scheduler = build_lr_scheduler(cfg, optimizer) 226 | 227 | checkpointer = DetectionCheckpointer( 228 | model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler 229 | ) 230 | start_iter = ( 231 | checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 232 | ) 233 | max_iter = cfg.SOLVER.MAX_ITER 234 | 235 | periodic_checkpointer = PeriodicCheckpointer( 236 | checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter 237 | ) 238 | 239 | writers = ( 240 | [ 241 | CommonMetricPrinter(max_iter), 242 | JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), 243 | TensorboardXWriter(cfg.OUTPUT_DIR), 244 | ] 245 | if comm.is_main_process() 246 | else [] 247 | ) 248 | 249 | # compared to "train_net.py", we do not support accurate timing and 250 | # precise BN here, because they are not trivial to implement 251 | data_loader = build_detection_train_loader(cfg) 252 | logger.info("Starting training from iteration {}".format(start_iter)) 253 | timer = TrainTimer() 254 | with EventStorage(start_iter) as storage: 255 | for data, iteration in zip(data_loader, range(start_iter, max_iter)): 256 | iteration = iteration + 1 257 | storage.step() 258 | storage.put_scalar('data_time', timer.tic()) 259 | 260 | loss_dict = model(data) 261 | losses = sum(loss for loss in loss_dict.values()) 262 | assert torch.isfinite(losses).all(), loss_dict 263 | 264 | loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} 265 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 266 | if comm.is_main_process(): 267 | storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) 268 | 269 | backward_with_scale(losses) 270 | 271 | if iteration % cfg.SOLVER.ITER_SIZE == 0: 272 | 
optimizer.step() 273 | optimizer.zero_grad() 274 | scheduler.step() 275 | 276 | storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) 277 | storage.put_scalar('time', timer.tic()) 278 | 279 | 280 | if ( 281 | cfg.TEST.EVAL_PERIOD > 0 282 | and iteration % cfg.TEST.EVAL_PERIOD == 0 283 | and iteration != max_iter 284 | ): 285 | do_test(cfg, model) 286 | # Compared to "train_net.py", the test results are not dumped to EventStorage 287 | comm.synchronize() 288 | 289 | if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): 290 | for writer in writers: 291 | writer.write() 292 | periodic_checkpointer.step(iteration) 293 | 294 | 295 | def setup(args): 296 | """ 297 | Create configs and perform basic setups. 298 | """ 299 | cfg = get_cfg() 300 | cfg.merge_from_file(args.config_file) 301 | cfg.merge_from_list(args.opts) 302 | 303 | cfg.freeze() 304 | default_setup( 305 | cfg, args 306 | ) # if you don't like any of the default setup, write your own setup code 307 | return cfg 308 | 309 | 310 | def main(args): 311 | cfg = setup(args) 312 | 313 | model = build_model(cfg).to('cuda') 314 | logger.info("Model:\n{}".format(model)) 315 | if args.eval_only: 316 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 317 | cfg.MODEL.WEIGHTS, resume=args.resume 318 | ) 319 | return do_test(cfg, model) 320 | 321 | distributed = comm.get_world_size() > 1 322 | if distributed: 323 | model = DistributedDataParallel( 324 | model, device_ids=[comm.get_local_rank()], broadcast_buffers=False 325 | ) 326 | 327 | do_train(cfg, model, resume=args.resume) 328 | return do_test(cfg, model) 329 | 330 | 331 | if __name__ == "__main__": 332 | args = default_argument_parser().parse_args() 333 | print("Command Line Args:", args) 334 | launch( 335 | main, 336 | args.num_gpus, 337 | num_machines=args.num_machines, 338 | machine_rank=args.machine_rank, 339 | dist_url=args.dist_url, 340 | args=(args,), 341 | ) 342 | -------------------------------------------------------------------------------- /wsod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/__init__.py -------------------------------------------------------------------------------- /wsod/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/config/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg -------------------------------------------------------------------------------- /wsod/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg -------------------------------------------------------------------------------- /wsod/config/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/config/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- 
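A minimal sketch of how the pieces above fit together (not repository code: it assumes the repo root is on sys.path, that detectron2 and the VOC datasets/proposal files named in the YAML are available, and the config path is illustrative):

# Sketch only. Importing train_net also runs `from wsod import models`,
# which registers the "GeneralisedMIL" meta-architecture used by the configs.
from detectron2.modeling import build_model
from train_net import build_optimizer, build_lr_scheduler
from wsod.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("config_files/wsddn_vggm.yaml")  # illustrative path
cfg.freeze()

model = build_model(cfg)                         # META_ARCHITECTURE: "GeneralisedMIL"
optimizer = build_optimizer(cfg, model)          # one param group per parameter; with this YAML, biases get 2x LR and no weight decay
scheduler = build_lr_scheduler(cfg, optimizer)   # "CaffeLRScheduler" in that config

From here, do_train(cfg, model) from train_net.py runs the plain training loop shown above, and do_test(cfg, model) runs evaluation.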
/wsod/config/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/config/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/config/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from detectron2.config import CfgNode as CN 3 | 4 | # ----------------------------------------------------------------------------- 5 | # Convention about Training / Test specific parameters 6 | # ----------------------------------------------------------------------------- 7 | # Whenever an argument can be either used for training or for testing, the 8 | # corresponding name will be post-fixed by a _TRAIN for a training parameter, 9 | # or _TEST for a test-specific parameter. 10 | # For example, the number of images during training will be 11 | # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be 12 | # IMAGES_PER_BATCH_TEST 13 | 14 | # ----------------------------------------------------------------------------- 15 | # Config definition 16 | # ----------------------------------------------------------------------------- 17 | 18 | _C = CN() 19 | 20 | _C.VERSION = 2 21 | 22 | _C.MODEL = CN() 23 | _C.MODEL.LOAD_PROPOSALS = False 24 | _C.MODEL.MASK_ON = False 25 | _C.MODEL.KEYPOINT_ON = False 26 | _C.MODEL.DEVICE = "cuda" 27 | _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" 28 | 29 | # Path (possibly with schema like catalog:// or detectron2://) to a checkpoint file 30 | # to be loaded to the model. You can find available models in the model zoo. 31 | _C.MODEL.WEIGHTS = "" 32 | 33 | # Values to be used for image normalization (BGR order). 34 | # To train on images of different number of channels, just set different mean & std. 35 | # Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] 36 | _C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675] 37 | # When using pre-trained models in Detectron1 or any MSRA models, 38 | # std has been absorbed into its conv1 weights, so the std needs to be set 1. 39 | # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) 40 | _C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0] 41 | 42 | 43 | # ----------------------------------------------------------------------------- 44 | # INPUT 45 | # ----------------------------------------------------------------------------- 46 | _C.INPUT = CN() 47 | # Size of the smallest side of the image during training 48 | _C.INPUT.MIN_SIZE_TRAIN = (800,) 49 | # Sample the size of the smallest side by choice or random selection from the range given by 50 | # INPUT.MIN_SIZE_TRAIN 51 | _C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice" 52 | # Maximum size of the side of the image during training 53 | _C.INPUT.MAX_SIZE_TRAIN = 1333 54 | # Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
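# (The WSDDN/OICR configs in config_files/ set this to 0 to disable the test-time resize and enable TEST.AUG multi-scale evaluation instead.)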
55 | _C.INPUT.MIN_SIZE_TEST = 800 56 | # Maximum size of the side of the image during testing 57 | _C.INPUT.MAX_SIZE_TEST = 1333 58 | 59 | # `True` if cropping is used for data augmentation during training 60 | _C.INPUT.CROP = CN({"ENABLED": False}) 61 | # Cropping type: 62 | # - "relative" crop (H * CROP.SIZE[0], W * CROP.SIZE[1]) part of an input of size (H, W) 63 | # - "relative_range" uniformly sample a relative crop size from between [CROP.SIZE[0], CROP.SIZE[1]] 64 | # and [1, 1], and use it as in the "relative" scenario. 65 | # - "absolute" crop part of an input with absolute size: (CROP.SIZE[0], CROP.SIZE[1]). 66 | _C.INPUT.CROP.TYPE = "relative_range" 67 | # Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of 68 | # pixels if CROP.TYPE is "absolute" 69 | _C.INPUT.CROP.SIZE = [0.9, 0.9] 70 | 71 | 72 | # Whether the model needs RGB, YUV, HSV etc. 73 | # Should be one of the modes defined here, as we use PIL to read the image: 74 | # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes 75 | # with BGR being the one exception. One can set the image format to BGR; we will 76 | # internally use RGB for conversion and flip the channels over 77 | _C.INPUT.FORMAT = "BGR" 78 | # The ground truth mask format that the model will use. 79 | # Mask R-CNN supports either "polygon" or "bitmask" as ground truth. 80 | _C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask" 81 | 82 | 83 | # ----------------------------------------------------------------------------- 84 | # Dataset 85 | # ----------------------------------------------------------------------------- 86 | _C.DATASETS = CN() 87 | # List of the dataset names for training. Must be registered in DatasetCatalog 88 | _C.DATASETS.TRAIN = () 89 | # List of the pre-computed proposal files for training, which must be consistent 90 | # with datasets listed in DATASETS.TRAIN. 91 | _C.DATASETS.PROPOSAL_FILES_TRAIN = () 92 | # Number of top scoring precomputed proposals to keep for training 93 | _C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000 94 | # List of the dataset names for testing. Must be registered in DatasetCatalog 95 | _C.DATASETS.TEST = () 96 | # List of the pre-computed proposal files for test, which must be consistent 97 | # with datasets listed in DATASETS.TEST. 98 | _C.DATASETS.PROPOSAL_FILES_TEST = () 99 | # Number of top scoring precomputed proposals to keep for test 100 | _C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000 101 | 102 | # ----------------------------------------------------------------------------- 103 | # DataLoader 104 | # ----------------------------------------------------------------------------- 105 | _C.DATALOADER = CN() 106 | # Number of data loading threads 107 | _C.DATALOADER.NUM_WORKERS = 4 108 | # If True, each batch should contain only images for which the aspect ratio 109 | # is compatible. This groups portrait images together, and landscape images 110 | # are not batched with portrait images. 111 | _C.DATALOADER.ASPECT_RATIO_GROUPING = True 112 | # Options: TrainingSampler, RepeatFactorTrainingSampler 113 | _C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler" 114 | # Repeat threshold for RepeatFactorTrainingSampler 115 | _C.DATALOADER.REPEAT_THRESHOLD = 0.0 116 | # if True, the dataloader will filter out images that have no associated 117 | # annotations at train time.
118 | _C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True 119 | 120 | # ---------------------------------------------------------------------------- # 121 | # Backbone options 122 | # ---------------------------------------------------------------------------- # 123 | _C.MODEL.BACKBONE = CN() 124 | 125 | _C.MODEL.BACKBONE.NAME = "build_resnet_backbone" 126 | # Add StopGrad at a specified stage so the bottom layers are frozen 127 | _C.MODEL.BACKBONE.FREEZE_AT = 2 128 | 129 | 130 | # ---------------------------------------------------------------------------- # 131 | # FPN options 132 | # ---------------------------------------------------------------------------- # 133 | _C.MODEL.FPN = CN() 134 | # Names of the input feature maps to be used by FPN 135 | # They must have contiguous power of 2 strides 136 | # e.g., ["res2", "res3", "res4", "res5"] 137 | _C.MODEL.FPN.IN_FEATURES = [] 138 | _C.MODEL.FPN.OUT_CHANNELS = 256 139 | 140 | # Options: "" (no norm), "GN" 141 | _C.MODEL.FPN.NORM = "" 142 | 143 | # Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg" 144 | _C.MODEL.FPN.FUSE_TYPE = "sum" 145 | 146 | 147 | # ---------------------------------------------------------------------------- # 148 | # Proposal generator options 149 | # ---------------------------------------------------------------------------- # 150 | _C.MODEL.PROPOSAL_GENERATOR = CN() 151 | # Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals" 152 | _C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" 153 | # Proposal height and width both need to be greater than MIN_SIZE 154 | # (at the scale used during training or inference) 155 | _C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0 156 | 157 | 158 | # ---------------------------------------------------------------------------- # 159 | # Anchor generator options 160 | # ---------------------------------------------------------------------------- # 161 | _C.MODEL.ANCHOR_GENERATOR = CN() 162 | # The generator can be any name in the ANCHOR_GENERATOR registry 163 | _C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" 164 | # anchor sizes given in absolute pixels w.r.t. the scaled network input. 165 | # Format: list of lists of sizes. SIZES[i] specifies the list of sizes 166 | # to use for IN_FEATURES[i]; len(SIZES) == len(IN_FEATURES) must be true, 167 | # or len(SIZES) == 1 is true and size list SIZES[0] is used for all 168 | # IN_FEATURES. 169 | _C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]] 170 | # Anchor aspect ratios. 171 | # Format is list of lists of sizes. ASPECT_RATIOS[i] specifies the list of aspect ratios 172 | # to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true, 173 | # or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used 174 | # for all IN_FEATURES. 175 | _C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]] 176 | # Anchor angles. 177 | # list[float], the angle in degrees, for each input feature map. 178 | # ANGLES[i] specifies the list of angles for IN_FEATURES[i]. 179 | _C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]] 180 | # Relative offset between the center of the first anchor and the top-left corner of the image 181 | # Units: fraction of feature map stride (e.g., 0.5 means half stride) 182 | # Allowed values are floats in the [0, 1) range. 183 | # Recommended value is 0.5, although it is not expected to affect model accuracy.
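# For example, with a feature stride of 16, OFFSET = 0.5 would place the first anchor centre at pixel (8, 8) rather than (0, 0).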
184 | _C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0 185 | 186 | # ---------------------------------------------------------------------------- # 187 | # RPN options 188 | # ---------------------------------------------------------------------------- # 189 | _C.MODEL.RPN = CN() 190 | _C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY 191 | 192 | # Names of the input feature maps to be used by RPN 193 | # e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN 194 | _C.MODEL.RPN.IN_FEATURES = ["res4"] 195 | # Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels 196 | # Set to -1 or a large value, e.g. 100000, to disable pruning anchors 197 | _C.MODEL.RPN.BOUNDARY_THRESH = -1 198 | # IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD] 199 | # Minimum overlap required between an anchor and ground-truth box for the 200 | # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD 201 | # ==> positive RPN example: 1) 202 | # Maximum overlap allowed between an anchor and ground-truth box for the 203 | # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD 204 | # ==> negative RPN example: 0) 205 | # Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD) 206 | # are ignored (-1) 207 | _C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7] 208 | _C.MODEL.RPN.IOU_LABELS = [0, -1, 1] 209 | # Total number of RPN examples per image 210 | _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 211 | # Target fraction of foreground (positive) examples per RPN minibatch 212 | _C.MODEL.RPN.POSITIVE_FRACTION = 0.5 213 | # Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets 214 | _C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 215 | # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. 216 | _C.MODEL.RPN.SMOOTH_L1_BETA = 0.0 217 | _C.MODEL.RPN.LOSS_WEIGHT = 1.0 218 | # Number of top scoring RPN proposals to keep before applying NMS 219 | # When FPN is used, this is *per FPN level* (not total) 220 | _C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000 221 | _C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000 222 | # Number of top scoring RPN proposals to keep after applying NMS 223 | # When FPN is used, this limit is applied per level and then again to the union 224 | # of proposals from all levels 225 | # NOTE: When FPN is used, the meaning of this config is different from Detectron1. 226 | # It means per-batch topk in Detectron1, but per-image topk here. 227 | # See "modeling/rpn/rpn_outputs.py" for details. 228 | _C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000 229 | _C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000 230 | # NMS threshold used on RPN proposals 231 | _C.MODEL.RPN.NMS_THRESH = 0.7 232 | 233 | # ---------------------------------------------------------------------------- # 234 | # ROI HEADS options 235 | # ---------------------------------------------------------------------------- # 236 | _C.MODEL.ROI_HEADS = CN() 237 | _C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads" 238 | # Number of foreground classes 239 | _C.MODEL.ROI_HEADS.NUM_CLASSES = 80 240 | # Names of the input feature maps to be used by ROI heads 241 | # Currently all heads (box, mask, ...) 
use the same input feature map list 242 | # e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN 243 | _C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"] 244 | # IOU overlap ratios [IOU_THRESHOLD] 245 | # Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD) 246 | # Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD) 247 | _C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5] 248 | _C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1] 249 | # RoI minibatch size *per image* (number of regions of interest [ROIs]) 250 | # Total number of RoIs per training minibatch = 251 | # ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH 252 | # E.g., a common configuration is: 512 * 16 = 8192 253 | _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 254 | # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) 255 | _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 256 | 257 | # Only used on test mode 258 | 259 | # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to 260 | # balance obtaining high recall with not having too many low precision 261 | # detections that will slow down inference post processing steps (like NMS) 262 | # A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down 263 | # inference. 264 | _C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05 265 | # Overlap threshold used for non-maximum suppression (suppress boxes with 266 | # IoU >= this threshold) 267 | _C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5 268 | # If True, augment proposals with ground-truth boxes before sampling proposals to 269 | # train ROI heads. 270 | _C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True 271 | 272 | 273 | # ---------------------------------------------------------------------------- # 274 | # Box Head 275 | # ---------------------------------------------------------------------------- # 276 | _C.MODEL.ROI_BOX_HEAD = CN() 277 | # C4 don't use head name option 278 | # Options for non-C4 models: FastRCNNConvFCHead, 279 | _C.MODEL.ROI_BOX_HEAD.NAME = "" 280 | # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets 281 | # These are empirically chosen to approximately lead to unit variance targets 282 | _C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0) 283 | # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. 284 | _C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0 285 | _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 286 | _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 287 | # Type of pooling operation applied to the incoming feature map for each RoI 288 | _C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" 289 | 290 | _C.MODEL.ROI_BOX_HEAD.NUM_FC = 0 291 | # Hidden layer dimension for FC layers in the RoI box head 292 | _C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024 293 | _C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0 294 | # Channel dimension for Conv layers in the RoI box head 295 | _C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256 296 | # Normalization method for the convolution layers. 297 | # Options: "" (no norm), "GN", "SyncBN". 
298 | _C.MODEL.ROI_BOX_HEAD.NORM = "" 299 | # Whether to use class agnostic for bbox regression 300 | _C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False 301 | 302 | # ---------------------------------------------------------------------------- # 303 | # Cascaded Box Head 304 | # ---------------------------------------------------------------------------- # 305 | _C.MODEL.ROI_BOX_CASCADE_HEAD = CN() 306 | # The number of cascade stages is implicitly defined by the length of the following two configs. 307 | _C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = ( 308 | (10.0, 10.0, 5.0, 5.0), 309 | (20.0, 20.0, 10.0, 10.0), 310 | (30.0, 30.0, 15.0, 15.0), 311 | ) 312 | _C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7) 313 | 314 | 315 | # ---------------------------------------------------------------------------- # 316 | # Mask Head 317 | # ---------------------------------------------------------------------------- # 318 | _C.MODEL.ROI_MASK_HEAD = CN() 319 | _C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead" 320 | _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 321 | _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 322 | _C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head 323 | _C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256 324 | # Normalization method for the convolution layers. 325 | # Options: "" (no norm), "GN", "SyncBN". 326 | _C.MODEL.ROI_MASK_HEAD.NORM = "" 327 | # Whether to use class agnostic for mask prediction 328 | _C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False 329 | # Type of pooling operation applied to the incoming feature map for each RoI 330 | _C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2" 331 | 332 | 333 | # ---------------------------------------------------------------------------- # 334 | # Keypoint Head 335 | # ---------------------------------------------------------------------------- # 336 | _C.MODEL.ROI_KEYPOINT_HEAD = CN() 337 | _C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead" 338 | _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 339 | _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 340 | _C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8)) 341 | _C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO. 342 | 343 | # Images with too few (or no) keypoints are excluded from training. 344 | _C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1 345 | # Normalize by the total number of visible keypoints in the minibatch if True. 346 | # Otherwise, normalize by the total number of keypoints that could ever exist 347 | # in the minibatch. 348 | # The keypoint softmax loss is only calculated on visible keypoints. 349 | # Since the number of visible keypoints can vary significantly between 350 | # minibatches, this has the effect of up-weighting the importance of 351 | # minibatches with few visible keypoints. (Imagine the extreme case of 352 | # only one visible keypoint versus N: in the case of N, each one 353 | # contributes 1/N to the gradient compared to the single keypoint 354 | # determining the gradient direction). Instead, we can normalize the 355 | # loss by the total number of keypoints, if it were the case that all 356 | # keypoints were visible in a full minibatch. (Returning to the example, 357 | # this means that the one visible keypoint contributes as much as each 358 | # of the N keypoints.) 
359 | _C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True 360 | # Multi-task loss weight to use for keypoints 361 | # Recommended values: 362 | # - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True 363 | # - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False 364 | _C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0 365 | # Type of pooling operation applied to the incoming feature map for each RoI 366 | _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2" 367 | 368 | # ---------------------------------------------------------------------------- # 369 | # Semantic Segmentation Head 370 | # ---------------------------------------------------------------------------- # 371 | _C.MODEL.SEM_SEG_HEAD = CN() 372 | _C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead" 373 | _C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"] 374 | # Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for 375 | # the corresponding pixel. 376 | _C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255 377 | # Number of classes in the semantic segmentation head 378 | _C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54 379 | # Number of channels in the 3x3 convs inside semantic-FPN heads. 380 | _C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128 381 | # Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride. 382 | _C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4 383 | # Normalization method for the convolution layers. Options: "" (no norm), "GN". 384 | _C.MODEL.SEM_SEG_HEAD.NORM = "GN" 385 | _C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0 386 | 387 | _C.MODEL.PANOPTIC_FPN = CN() 388 | # Scaling of all losses from instance detection / segmentation head. 389 | _C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0 390 | 391 | # options when combining instance & semantic segmentation outputs 392 | _C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True}) 393 | _C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5 394 | _C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096 395 | _C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5 396 | 397 | 398 | # ---------------------------------------------------------------------------- # 399 | # RetinaNet Head 400 | # ---------------------------------------------------------------------------- # 401 | _C.MODEL.RETINANET = CN() 402 | 403 | # This is the number of foreground classes. 404 | _C.MODEL.RETINANET.NUM_CLASSES = 80 405 | 406 | _C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] 407 | 408 | # Convolutions to use in the cls and bbox tower 409 | # NOTE: this doesn't include the last conv for logits 410 | _C.MODEL.RETINANET.NUM_CONVS = 4 411 | 412 | # IoU overlap ratio [bg, fg] for labeling anchors. 413 | # Anchors with < bg are labeled negative (0) 414 | # Anchors with >= bg and < fg are ignored (-1) 415 | # Anchors with >= fg are labeled positive (1) 416 | _C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5] 417 | _C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1] 418 | 419 | # Prior prob for rare case (i.e. foreground) at the beginning of training. 420 | # This is used to set the bias for the logits layer of the classifier subnet. 421 | # This improves training stability in the case of heavy class imbalance.
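# (Following the RetinaNet paper, the classifier bias is initialised to -log((1 - p) / p), which is roughly -4.6 for p = 0.01.)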
422 | _C.MODEL.RETINANET.PRIOR_PROB = 0.01 423 | 424 | # Inference cls score threshold: only anchors with score > INFERENCE_TH are 425 | # considered for inference (to improve speed) 426 | _C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05 427 | _C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000 428 | _C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5 429 | 430 | # Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets 431 | _C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 432 | 433 | # Loss parameters 434 | _C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0 435 | _C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25 436 | _C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1 437 | 438 | 439 | # ---------------------------------------------------------------------------- # 440 | # ResNe[X]t options (ResNets = {ResNet, ResNeXt}) 441 | # Note that parts of a resnet may be used for both the backbone and the head 442 | # These options apply to both 443 | # ---------------------------------------------------------------------------- # 444 | _C.MODEL.RESNETS = CN() 445 | 446 | _C.MODEL.RESNETS.DEPTH = 50 447 | _C.MODEL.RESNETS.OUT_FEATURES = ["res4"] # res4 for C4 backbone, res2..5 for FPN backbone 448 | 449 | # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt 450 | _C.MODEL.RESNETS.NUM_GROUPS = 1 451 | 452 | # Options: FrozenBN, GN, "SyncBN", "BN" 453 | _C.MODEL.RESNETS.NORM = "FrozenBN" 454 | 455 | # Baseline width of each group. 456 | # Scaling this parameter will scale the width of all bottleneck layers. 457 | _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 458 | 459 | # Place the stride 2 conv on the 1x1 filter 460 | # Use True only for the original MSRA ResNet; use False for C2 and Torch models 461 | _C.MODEL.RESNETS.STRIDE_IN_1X1 = True 462 | 463 | # Apply dilation in stage "res5" 464 | _C.MODEL.RESNETS.RES5_DILATION = 1 465 | 466 | # Output width of res2. Scaling this parameter will scale the width of all 1x1 convs in ResNet 467 | _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 468 | _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 469 | 470 | # Apply Deformable Convolution in stages 471 | # Specify whether to apply deform_conv on Res2, Res3, Res4, Res5 472 | _C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False] 473 | # Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168); 474 | # Use False for DeformableV1. 475 | _C.MODEL.RESNETS.DEFORM_MODULATED = False 476 | # Number of groups in deformable conv. 477 | _C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1 478 | 479 | 480 | # ---------------------------------------------------------------------------- # 481 | # Solver 482 | # ---------------------------------------------------------------------------- # 483 | _C.SOLVER = CN() 484 | 485 | # See detectron2/solver/build.py for LR scheduler options 486 | _C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR" 487 | 488 | _C.SOLVER.MAX_ITER = 40000 489 | 490 | _C.SOLVER.BASE_LR = 0.001 491 | 492 | _C.SOLVER.MOMENTUM = 0.9 493 | 494 | _C.SOLVER.WEIGHT_DECAY = 0.0001 495 | # The weight decay that's applied to parameters of normalization layers 496 | # (typically the affine transformation) 497 | _C.SOLVER.WEIGHT_DECAY_NORM = 0.0 498 | 499 | _C.SOLVER.GAMMA = 0.1 500 | # The iteration number to decrease learning rate by GAMMA. 501 | _C.SOLVER.STEPS = (30000,) 502 | 503 | _C.SOLVER.WARMUP_FACTOR = 1.0 / 1000 504 | _C.SOLVER.WARMUP_ITERS = 1000 505 | _C.SOLVER.WARMUP_METHOD = "linear" 506 | 507 | _C.SOLVER.CHECKPOINT_PERIOD = 5000 508 | 509 | # Number of images per batch across all machines.
510 | # If we have 16 GPUs and IMS_PER_BATCH = 32, 511 | # each GPU will see 2 images per batch. 512 | _C.SOLVER.IMS_PER_BATCH = 16 513 | 514 | # Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for 515 | # biases. This is not useful (at least for recent models). You should avoid 516 | # changing these and they exist only to reproduce Detectron v1 training if 517 | # desired. 518 | _C.SOLVER.BIAS_LR_FACTOR = 1.0 519 | _C.SOLVER.WEIGHT_DECAY_BIAS = _C.SOLVER.WEIGHT_DECAY 520 | 521 | # ---------------------------------------------------------------------------- # 522 | # Specific test options 523 | # ---------------------------------------------------------------------------- # 524 | _C.TEST = CN() 525 | # For end-to-end tests to verify the expected accuracy. 526 | # Each item is [task, metric, value, tolerance] 527 | # e.g.: [['bbox', 'AP', 38.5, 0.2]] 528 | _C.TEST.EXPECTED_RESULTS = [] 529 | # The period (in terms of steps) to evaluate the model during training. 530 | # Set to 0 to disable. 531 | _C.TEST.EVAL_PERIOD = 0 532 | # The sigmas used to calculate keypoint OKS. 533 | # When empty it will use the defaults in COCO. 534 | # Otherwise it should have the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. 535 | _C.TEST.KEYPOINT_OKS_SIGMAS = [] 536 | # Maximum number of detections to return per image during inference (100 is 537 | # based on the limit established for the COCO dataset). 538 | _C.TEST.DETECTIONS_PER_IMAGE = 100 539 | 540 | _C.TEST.AUG = CN({"ENABLED": False}) 541 | _C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200) 542 | _C.TEST.AUG.MAX_SIZE = 4000 543 | _C.TEST.AUG.FLIP = True 544 | 545 | _C.TEST.PRECISE_BN = CN({"ENABLED": False}) 546 | _C.TEST.PRECISE_BN.NUM_ITER = 200 547 | 548 | # ---------------------------------------------------------------------------- # 549 | # Misc options 550 | # ---------------------------------------------------------------------------- # 551 | # Directory where output files are written 552 | _C.OUTPUT_DIR = "./output" 553 | # Set seed to negative to fully randomize everything. 554 | # Set seed to positive to use a fixed seed. Note that a fixed seed does not 555 | # guarantee fully deterministic behavior. 556 | _C.SEED = -1 557 | # Benchmark different cudnn algorithms. 558 | # If input images have very different sizes, this option will have large overhead 559 | # for about 10k iterations. It usually hurts total time, but can benefit for certain models. 560 | # If input images have the same or similar sizes, benchmark is often helpful. 561 | _C.CUDNN_BENCHMARK = False 562 | # The period (in terms of steps) for minibatch visualization at train time. 563 | # Set to 0 to disable. 564 | _C.VIS_PERIOD = 0 565 | 566 | # global config is for quick hack purposes. 567 | # You can set them in command line or config files, 568 | # and access it with: 569 | # 570 | # from detectron2.config import global_cfg 571 | # print(global_cfg.HACK) 572 | # 573 | # Do not commit any configs into it. 
574 | _C.GLOBAL = CN() 575 | _C.GLOBAL.HACK = 1.0 576 | 577 | # ---------------------------------------------------------------------------- # 578 | # WSOD-specific options 579 | # ---------------------------------------------------------------------------- # 580 | # Everything from here down is added by Bradley Ezard to customize the config for 581 | # training multiple instance learning models for weakly supervised object detection 582 | 583 | # Emulates the Caffe "iter_size" parameter: optimizer.step() is called every 584 | # iter_size iterations. This increases the effective batch size without 585 | # increasing the memory requirements, by accumulating gradients between forward 586 | # passes (i.e. forward, backward, forward, backward, update) 587 | _C.SOLVER.ITER_SIZE = 1 588 | _C.SOLVER.REFINEMENT_LR_FACTOR = 1.0 589 | _C.SOLVER.TYPE = 'sgd' 590 | _C.SOLVER.MIXED_PRECISION = False 591 | _C.SOLVER.LOSS_SCALE = 0.0 592 | 593 | # Pretrained weights for the backbone. Can be a path to a file from which 594 | # the weights can be loaded, "imagenet" to use the pretrained weights in 595 | # torchvision, or "" to use a random init. 596 | _C.MODEL.BACKBONE.WEIGHTS = "imagenet" 597 | _C.MODEL.BACKBONE.FREEZE_CONVS = 0 598 | 599 | _C.MODEL.MIDN_HEAD = CN() 600 | _C.MODEL.MIDN_HEAD.NUM_CLASSIFIER = 1 601 | _C.MODEL.MIDN_HEAD.NUM_DETECTOR = 1 602 | _C.MODEL.MIDN_HEAD.CLASSIFIER_TEMP = 1.0 603 | _C.MODEL.MIDN_HEAD.DETECTOR_TEMP = 1.0 604 | 605 | _C.MODEL.REFINEMENT_HEAD = CN() 606 | _C.MODEL.REFINEMENT_HEAD.K = 3 607 | 608 | _C.MODEL.LOSS_FN = "oicr_loss" 609 | _C.MODEL.PREDICTION_LAYERS = (0,) 610 | 611 | 612 | def get_cfg(): 613 | return _C.clone() -------------------------------------------------------------------------------- /wsod/data/.ipynb_checkpoints/voc-checkpoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import logging 5 | import multiprocessing as mp 6 | import numpy as np 7 | import os 8 | from pathlib import Path 9 | import tempfile 10 | import xml.etree.ElementTree as ET 11 | from collections import OrderedDict, defaultdict 12 | from functools import lru_cache 13 | import torch 14 | 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils import comm 17 | 18 | from detectron2.evaluation.evaluator import DatasetEvaluator 19 | 20 | 21 | class PascalVOCDetectionEvaluator(DatasetEvaluator): 22 | """ 23 | Evaluate Pascal VOC AP. 24 | It contains a synchronization, therefore has to be called from all ranks. 25 | 26 | Note that this is a rewrite of the official Matlab API. 27 | The results should be similar, but not identical to the one produced by 28 | the official API.
29 | """ 30 | 31 | def __init__(self, dataset_name): 32 | """ 33 | Args: 34 | dataset_name (str): name of the dataset, e.g., "voc_2007_test" 35 | """ 36 | self._dataset_name = dataset_name 37 | meta = MetadataCatalog.get(dataset_name) 38 | self._anno_file_template = os.path.join(meta.dirname, "Annotations", "{}.xml") 39 | self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") 40 | self._class_names = meta.thing_classes 41 | assert meta.year in [2007, 2012], meta.year 42 | self._is_2007 = meta.year == 2007 43 | self._cpu_device = torch.device("cpu") 44 | self._logger = logging.getLogger(__name__) 45 | 46 | def reset(self): 47 | self._predictions = defaultdict(list) # class name -> list of prediction strings 48 | 49 | def process(self, inputs, outputs): 50 | for input, output in zip(inputs, outputs): 51 | image_id = input["image_id"] 52 | instances = output["instances"].to(self._cpu_device) 53 | boxes = instances.pred_boxes.tensor.numpy() 54 | scores = instances.scores.tolist() 55 | classes = instances.pred_classes.tolist() 56 | for box, score, cls in zip(boxes, scores, classes): 57 | xmin, ymin, xmax, ymax = box 58 | # The inverse of data loading logic in `datasets/pascal_voc.py` 59 | xmin += 1 60 | ymin += 1 61 | self._predictions[cls].append( 62 | f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" 63 | ) 64 | 65 | def evaluate(self): 66 | """ 67 | Returns: 68 | dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75". 69 | """ 70 | all_predictions = comm.gather(self._predictions, dst=0) 71 | if not comm.is_main_process(): 72 | return 73 | predictions = defaultdict(list) 74 | for predictions_per_rank in all_predictions: 75 | for clsid, lines in predictions_per_rank.items(): 76 | predictions[clsid].extend(lines) 77 | del all_predictions 78 | 79 | self._logger.info( 80 | "Evaluating {} using {} metric. 
" 81 | "Note that results do not use the official Matlab API.".format( 82 | self._dataset_name, 2007 if self._is_2007 else 2012 83 | ) 84 | ) 85 | 86 | #with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 87 | dirname = Path("tmp/dets") 88 | dirname.mkdir(parents=True, exist_ok=True) 89 | res_file_template = os.path.join(dirname, "{}.txt") 90 | 91 | aps = defaultdict(list) # iou -> ap per class 92 | pool = mp.Pool(10) 93 | for cls_id, cls_name in enumerate(self._class_names): 94 | lines = predictions.get(cls_id, [""]) 95 | 96 | with open(res_file_template.format(cls_name), "w") as f: 97 | f.write("\n".join(lines)) 98 | 99 | args = [] 100 | for thresh in range(50, 100, 5): 101 | args.append([ 102 | res_file_template, 103 | self._anno_file_template, 104 | self._image_set_path, 105 | cls_name, 106 | thresh / 100.0, 107 | self._is_2007 108 | ]) 109 | results = pool.starmap(voc_eval, args) 110 | for thresh, result in zip(range(50, 100, 5), results): 111 | rec, prec, ap = result 112 | aps[thresh].append(ap * 100) 113 | pool.close() 114 | pool.join() 115 | 116 | ret = OrderedDict() 117 | mAP = {iou: np.mean(x) for iou, x in aps.items()} 118 | ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} 119 | return ret 120 | 121 | 122 | ############################################################################## 123 | # 124 | # Below code is modified from 125 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py 126 | # -------------------------------------------------------- 127 | # Fast/er R-CNN 128 | # Licensed under The MIT License [see LICENSE for details] 129 | # Written by Bharath Hariharan 130 | # -------------------------------------------------------- 131 | 132 | """Python implementation of the PASCAL VOC devkit's AP evaluation code.""" 133 | 134 | 135 | @lru_cache(maxsize=None) 136 | def parse_rec(filename): 137 | """Parse a PASCAL VOC xml file.""" 138 | tree = ET.parse(filename) 139 | objects = [] 140 | for obj in tree.findall("object"): 141 | obj_struct = {} 142 | obj_struct["name"] = obj.find("name").text 143 | obj_struct["pose"] = obj.find("pose").text 144 | obj_struct["truncated"] = int(obj.find("truncated").text) 145 | obj_struct["difficult"] = int(obj.find("difficult").text) 146 | bbox = obj.find("bndbox") 147 | obj_struct["bbox"] = [ 148 | int(bbox.find("xmin").text), 149 | int(bbox.find("ymin").text), 150 | int(bbox.find("xmax").text), 151 | int(bbox.find("ymax").text), 152 | ] 153 | objects.append(obj_struct) 154 | 155 | return objects 156 | 157 | 158 | def voc_ap(rec, prec, use_07_metric=False): 159 | """Compute VOC AP given precision and recall. If use_07_metric is true, uses 160 | the VOC 07 11-point method (default:False). 
161 | """ 162 | if use_07_metric: 163 | # 11 point metric 164 | ap = 0.0 165 | for t in np.arange(0.0, 1.1, 0.1): 166 | if np.sum(rec >= t) == 0: 167 | p = 0 168 | else: 169 | p = np.max(prec[rec >= t]) 170 | ap = ap + p / 11.0 171 | else: 172 | # correct AP calculation 173 | # first append sentinel values at the end 174 | mrec = np.concatenate(([0.0], rec, [1.0])) 175 | mpre = np.concatenate(([0.0], prec, [0.0])) 176 | 177 | # compute the precision envelope 178 | for i in range(mpre.size - 1, 0, -1): 179 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 180 | 181 | # to calculate area under PR curve, look for points 182 | # where X axis (recall) changes value 183 | i = np.where(mrec[1:] != mrec[:-1])[0] 184 | 185 | # and sum (\Delta recall) * prec 186 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 187 | return ap 188 | 189 | 190 | def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): 191 | """rec, prec, ap = voc_eval(detpath, 192 | annopath, 193 | imagesetfile, 194 | classname, 195 | [ovthresh], 196 | [use_07_metric]) 197 | 198 | Top level function that does the PASCAL VOC evaluation. 199 | 200 | detpath: Path to detections 201 | detpath.format(classname) should produce the detection results file. 202 | annopath: Path to annotations 203 | annopath.format(imagename) should be the xml annotations file. 204 | imagesetfile: Text file containing the list of images, one image per line. 205 | classname: Category name (duh) 206 | [ovthresh]: Overlap threshold (default = 0.5) 207 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 208 | (default False) 209 | """ 210 | # assumes detections are in detpath.format(classname) 211 | # assumes annotations are in annopath.format(imagename) 212 | # assumes imagesetfile is a text file with each line an image name 213 | 214 | # first load gt 215 | # read list of images 216 | with open(imagesetfile, "r") as f: 217 | lines = f.readlines() 218 | imagenames = [x.strip() for x in lines] 219 | 220 | # load annots 221 | recs = {} 222 | for imagename in imagenames: 223 | recs[imagename] = parse_rec(annopath.format(imagename)) 224 | 225 | # extract gt objects for this class 226 | class_recs = {} 227 | npos = 0 228 | for imagename in imagenames: 229 | R = [obj for obj in recs[imagename] if obj["name"] == classname] 230 | bbox = np.array([x["bbox"] for x in R]) 231 | difficult = np.array([x["difficult"] for x in R]).astype(np.bool) 232 | # difficult = np.array([False for x in R]).astype(np.bool) # treat all "difficult" as GT 233 | det = [False] * len(R) 234 | npos = npos + sum(~difficult) 235 | class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} 236 | 237 | # read dets 238 | detfile = detpath.format(classname) 239 | with open(detfile, "r") as f: 240 | lines = f.readlines() 241 | 242 | splitlines = [x.strip().split(" ") for x in lines] 243 | image_ids = [x[0] for x in splitlines] 244 | confidence = np.array([float(x[1]) for x in splitlines]) 245 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) 246 | 247 | # sort by confidence 248 | sorted_ind = np.argsort(-confidence) 249 | BB = BB[sorted_ind, :] 250 | image_ids = [image_ids[x] for x in sorted_ind] 251 | 252 | # go down dets and mark TPs and FPs 253 | nd = len(image_ids) 254 | tp = np.zeros(nd) 255 | fp = np.zeros(nd) 256 | for d in range(nd): 257 | R = class_recs[image_ids[d]] 258 | bb = BB[d, :].astype(float) 259 | ovmax = -np.inf 260 | BBGT = R["bbox"].astype(float) 261 | 262 | if BBGT.size > 0: 263 | # 
compute overlaps 264 | # intersection 265 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 266 | iymin = np.maximum(BBGT[:, 1], bb[1]) 267 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 268 | iymax = np.minimum(BBGT[:, 3], bb[3]) 269 | iw = np.maximum(ixmax - ixmin + 1.0, 0.0) 270 | ih = np.maximum(iymax - iymin + 1.0, 0.0) 271 | inters = iw * ih 272 | 273 | # union 274 | uni = ( 275 | (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) 276 | + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) 277 | - inters 278 | ) 279 | 280 | overlaps = inters / uni 281 | ovmax = np.max(overlaps) 282 | jmax = np.argmax(overlaps) 283 | 284 | if ovmax > ovthresh: 285 | if not R["difficult"][jmax]: 286 | if not R["det"][jmax]: 287 | tp[d] = 1.0 288 | R["det"][jmax] = 1 289 | else: 290 | fp[d] = 1.0 291 | else: 292 | fp[d] = 1.0 293 | 294 | # compute precision recall 295 | fp = np.cumsum(fp) 296 | tp = np.cumsum(tp) 297 | rec = tp / float(npos) 298 | # avoid divide by zero in case the first detection matches a difficult 299 | # ground truth 300 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 301 | ap = voc_ap(rec, prec, use_07_metric) 302 | 303 | return rec, prec, ap 304 | 305 | 306 | if __name__ == '__main__': 307 | res_file_template = os.path.join('tmp/dets/', '{}.txt') 308 | meta = MetadataCatalog.get('voc_2007_test') 309 | print('Reval VOC07 Test') 310 | avg = 0 311 | for cls_id, cls_name in enumerate(meta.thing_classes): 312 | print(f'{cls_name:20}: ', end='') 313 | rec, prec, ap = voc_eval( 314 | res_file_template, 315 | os.path.join(meta.dirname, 'Annotations', '{}.xml'), 316 | os.path.join(meta.dirname, 'ImageSets', 'Main', meta.split + '.txt'), 317 | cls_name, 318 | 0.50, 319 | True 320 | ) 321 | ap *= 100 322 | print(f'{ap:7.04f}') 323 | avg += ap 324 | print(f'{"total":20}: {avg/20.:7.04f}') -------------------------------------------------------------------------------- /wsod/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/data/__init__.py -------------------------------------------------------------------------------- /wsod/data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/data/__pycache__/voc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/data/__pycache__/voc.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/data/voc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import logging 5 | import multiprocessing as mp 6 | import numpy as np 7 | import os 8 | from pathlib import Path 9 | import tempfile 10 | import xml.etree.ElementTree as ET 11 | from collections import OrderedDict, defaultdict 12 | from functools import lru_cache 13 | import torch 14 | 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils import comm 17 | 18 | from detectron2.evaluation.evaluator import DatasetEvaluator 19 | 20 | 21 | class PascalVOCDetectionEvaluator(DatasetEvaluator): 22 | """ 23 | Evaluate Pascal VOC AP. 24 | It contains a synchronization, therefore has to be called from all ranks. 25 | 26 | Note that this is a rewrite of the official Matlab API. 27 | The results should be similar, but not identical to the one produced by 28 | the official API. 29 | """ 30 | 31 | def __init__(self, dataset_name): 32 | """ 33 | Args: 34 | dataset_name (str): name of the dataset, e.g., "voc_2007_test" 35 | """ 36 | self._dataset_name = dataset_name 37 | meta = MetadataCatalog.get(dataset_name) 38 | self._anno_file_template = os.path.join(meta.dirname, "Annotations", "{}.xml") 39 | self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") 40 | self._class_names = meta.thing_classes 41 | assert meta.year in [2007, 2012], meta.year 42 | self._is_2007 = meta.year == 2007 43 | self._cpu_device = torch.device("cpu") 44 | self._logger = logging.getLogger(__name__) 45 | 46 | def reset(self): 47 | self._predictions = defaultdict(list) # class index -> list of prediction strings 48 | 49 | def process(self, inputs, outputs): 50 | for input, output in zip(inputs, outputs): 51 | image_id = input["image_id"] 52 | instances = output["instances"].to(self._cpu_device) 53 | boxes = instances.pred_boxes.tensor.numpy() 54 | scores = instances.scores.tolist() 55 | classes = instances.pred_classes.tolist() 56 | for box, score, cls in zip(boxes, scores, classes): 57 | xmin, ymin, xmax, ymax = box 58 | # The inverse of data loading logic in `datasets/pascal_voc.py` 59 | xmin += 1 60 | ymin += 1 61 | self._predictions[cls].append( 62 | f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" 63 | ) 64 | 65 | def evaluate(self): 66 | """ 67 | Returns: 68 | dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75". 69 | """ 70 | all_predictions = comm.gather(self._predictions, dst=0) 71 | if not comm.is_main_process(): 72 | return 73 | predictions = defaultdict(list) 74 | for predictions_per_rank in all_predictions: 75 | for clsid, lines in predictions_per_rank.items(): 76 | predictions[clsid].extend(lines) 77 | del all_predictions 78 | 79 | self._logger.info( 80 | "Evaluating {} using {} metric.
" 81 | "Note that results do not use the official Matlab API.".format( 82 | self._dataset_name, 2007 if self._is_2007 else 2012 83 | ) 84 | ) 85 | 86 | #with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 87 | dirname = Path("tmp/dets") 88 | dirname.mkdir(parents=True, exist_ok=True) 89 | res_file_template = os.path.join(dirname, "{}.txt") 90 | 91 | aps = defaultdict(list) # iou -> ap per class 92 | pool = mp.Pool(10) 93 | for cls_id, cls_name in enumerate(self._class_names): 94 | lines = predictions.get(cls_id, [""]) 95 | 96 | with open(res_file_template.format(cls_name), "w") as f: 97 | f.write("\n".join(lines)) 98 | 99 | args = [] 100 | for thresh in range(50, 100, 5): 101 | args.append([ 102 | res_file_template, 103 | self._anno_file_template, 104 | self._image_set_path, 105 | cls_name, 106 | thresh / 100.0, 107 | self._is_2007 108 | ]) 109 | results = pool.starmap(voc_eval, args) 110 | for thresh, result in zip(range(50, 100, 5), results): 111 | rec, prec, ap = result 112 | aps[thresh].append(ap * 100) 113 | pool.close() 114 | pool.join() 115 | 116 | ret = OrderedDict() 117 | mAP = {iou: np.mean(x) for iou, x in aps.items()} 118 | ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} 119 | return ret 120 | 121 | 122 | ############################################################################## 123 | # 124 | # Below code is modified from 125 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py 126 | # -------------------------------------------------------- 127 | # Fast/er R-CNN 128 | # Licensed under The MIT License [see LICENSE for details] 129 | # Written by Bharath Hariharan 130 | # -------------------------------------------------------- 131 | 132 | """Python implementation of the PASCAL VOC devkit's AP evaluation code.""" 133 | 134 | 135 | @lru_cache(maxsize=None) 136 | def parse_rec(filename): 137 | """Parse a PASCAL VOC xml file.""" 138 | tree = ET.parse(filename) 139 | objects = [] 140 | for obj in tree.findall("object"): 141 | obj_struct = {} 142 | obj_struct["name"] = obj.find("name").text 143 | obj_struct["pose"] = obj.find("pose").text 144 | obj_struct["truncated"] = int(obj.find("truncated").text) 145 | obj_struct["difficult"] = int(obj.find("difficult").text) 146 | bbox = obj.find("bndbox") 147 | obj_struct["bbox"] = [ 148 | int(bbox.find("xmin").text), 149 | int(bbox.find("ymin").text), 150 | int(bbox.find("xmax").text), 151 | int(bbox.find("ymax").text), 152 | ] 153 | objects.append(obj_struct) 154 | 155 | return objects 156 | 157 | 158 | def voc_ap(rec, prec, use_07_metric=False): 159 | """Compute VOC AP given precision and recall. If use_07_metric is true, uses 160 | the VOC 07 11-point method (default:False). 
161 | """ 162 | if use_07_metric: 163 | # 11 point metric 164 | ap = 0.0 165 | for t in np.arange(0.0, 1.1, 0.1): 166 | if np.sum(rec >= t) == 0: 167 | p = 0 168 | else: 169 | p = np.max(prec[rec >= t]) 170 | ap = ap + p / 11.0 171 | else: 172 | # correct AP calculation 173 | # first append sentinel values at the end 174 | mrec = np.concatenate(([0.0], rec, [1.0])) 175 | mpre = np.concatenate(([0.0], prec, [0.0])) 176 | 177 | # compute the precision envelope 178 | for i in range(mpre.size - 1, 0, -1): 179 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 180 | 181 | # to calculate area under PR curve, look for points 182 | # where X axis (recall) changes value 183 | i = np.where(mrec[1:] != mrec[:-1])[0] 184 | 185 | # and sum (\Delta recall) * prec 186 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 187 | return ap 188 | 189 | 190 | def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): 191 | """rec, prec, ap = voc_eval(detpath, 192 | annopath, 193 | imagesetfile, 194 | classname, 195 | [ovthresh], 196 | [use_07_metric]) 197 | 198 | Top level function that does the PASCAL VOC evaluation. 199 | 200 | detpath: Path to detections 201 | detpath.format(classname) should produce the detection results file. 202 | annopath: Path to annotations 203 | annopath.format(imagename) should be the xml annotations file. 204 | imagesetfile: Text file containing the list of images, one image per line. 205 | classname: Category name (duh) 206 | [ovthresh]: Overlap threshold (default = 0.5) 207 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 208 | (default False) 209 | """ 210 | # assumes detections are in detpath.format(classname) 211 | # assumes annotations are in annopath.format(imagename) 212 | # assumes imagesetfile is a text file with each line an image name 213 | 214 | # first load gt 215 | # read list of images 216 | with open(imagesetfile, "r") as f: 217 | lines = f.readlines() 218 | imagenames = [x.strip() for x in lines] 219 | 220 | # load annots 221 | recs = {} 222 | for imagename in imagenames: 223 | recs[imagename] = parse_rec(annopath.format(imagename)) 224 | 225 | # extract gt objects for this class 226 | class_recs = {} 227 | npos = 0 228 | for imagename in imagenames: 229 | R = [obj for obj in recs[imagename] if obj["name"] == classname] 230 | bbox = np.array([x["bbox"] for x in R]) 231 | difficult = np.array([x["difficult"] for x in R]).astype(np.bool) 232 | # difficult = np.array([False for x in R]).astype(np.bool) # treat all "difficult" as GT 233 | det = [False] * len(R) 234 | npos = npos + sum(~difficult) 235 | class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} 236 | 237 | # read dets 238 | detfile = detpath.format(classname) 239 | with open(detfile, "r") as f: 240 | lines = f.readlines() 241 | 242 | splitlines = [x.strip().split(" ") for x in lines] 243 | image_ids = [x[0] for x in splitlines] 244 | confidence = np.array([float(x[1]) for x in splitlines]) 245 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) 246 | 247 | # sort by confidence 248 | sorted_ind = np.argsort(-confidence) 249 | BB = BB[sorted_ind, :] 250 | image_ids = [image_ids[x] for x in sorted_ind] 251 | 252 | # go down dets and mark TPs and FPs 253 | nd = len(image_ids) 254 | tp = np.zeros(nd) 255 | fp = np.zeros(nd) 256 | for d in range(nd): 257 | R = class_recs[image_ids[d]] 258 | bb = BB[d, :].astype(float) 259 | ovmax = -np.inf 260 | BBGT = R["bbox"].astype(float) 261 | 262 | if BBGT.size > 0: 263 | # 
263 |             # compute overlaps
264 |             # intersection
265 |             ixmin = np.maximum(BBGT[:, 0], bb[0])
266 |             iymin = np.maximum(BBGT[:, 1], bb[1])
267 |             ixmax = np.minimum(BBGT[:, 2], bb[2])
268 |             iymax = np.minimum(BBGT[:, 3], bb[3])
269 |             iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
270 |             ih = np.maximum(iymax - iymin + 1.0, 0.0)
271 |             inters = iw * ih
272 | 
273 |             # union
274 |             uni = (
275 |                 (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
276 |                 + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
277 |                 - inters
278 |             )
279 | 
280 |             overlaps = inters / uni
281 |             ovmax = np.max(overlaps)
282 |             jmax = np.argmax(overlaps)
283 | 
284 |         if ovmax > ovthresh:
285 |             if not R["difficult"][jmax]:
286 |                 if not R["det"][jmax]:
287 |                     tp[d] = 1.0
288 |                     R["det"][jmax] = 1
289 |                 else:
290 |                     fp[d] = 1.0
291 |         else:
292 |             fp[d] = 1.0
293 | 
294 |     # compute precision recall
295 |     fp = np.cumsum(fp)
296 |     tp = np.cumsum(tp)
297 |     rec = tp / float(npos)
298 |     # avoid divide by zero in case the first detection matches a difficult
299 |     # ground truth
300 |     prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
301 |     ap = voc_ap(rec, prec, use_07_metric)
302 | 
303 |     return rec, prec, ap
304 | 
305 | 
306 | if __name__ == '__main__':
307 |     res_file_template = os.path.join('tmp/dets/', '{}.txt')
308 |     meta = MetadataCatalog.get('voc_2007_test')
309 |     print('Reval VOC07 Test')
310 |     avg = 0
311 |     for cls_id, cls_name in enumerate(meta.thing_classes):
312 |         print(f'{cls_name:20}: ', end='')
313 |         rec, prec, ap = voc_eval(
314 |             res_file_template,
315 |             os.path.join(meta.dirname, 'Annotations', '{}.xml'),
316 |             os.path.join(meta.dirname, 'ImageSets', 'Main', meta.split + '.txt'),
317 |             cls_name,
318 |             0.50,
319 |             True
320 |         )
321 |         ap *= 100
322 |         print(f'{ap:7.04f}')
323 |         avg += ap
324 |     print(f'{"total":20}: {avg/20.:7.04f}')
--------------------------------------------------------------------------------
/wsod/models/.ipynb_checkpoints/losses-checkpoint.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torchvision import ops 6 | from sklearn.cluster import KMeans 7 | 8 | from typing import Tuple, Dict, List 9 | 10 | 11 | EPS = 1e-12 12 | 13 | 14 | def weighted_softmax_with_loss(score:torch.Tensor, labels:torch.Tensor, weights:torch.Tensor) -> torch.Tensor: 15 | loss = -weights * F.log_softmax(score, dim=-1).gather(-1, labels.long().unsqueeze(-1)).squeeze(-1) 16 | valid_sum = weights.gt(EPS).float().sum() 17 | if valid_sum < EPS: 18 | return loss.sum() / loss.numel() 19 | else: 20 | return loss.sum() / valid_sum 21 | 22 | 23 | @torch.no_grad() 24 | def oicr_label(boxes:torch.Tensor, cls_prob:torch.Tensor, image_labels:torch.Tensor, fg_thresh:float=0.5, bg_thresh:float=-1) -> Tuple[torch.Tensor, torch.Tensor]: 25 | boxes = boxes.to(cls_prob.device) 26 | cls_prob = (cls_prob if cls_prob.size(-1) == image_labels.size(-1) else cls_prob[..., 1:]).clone() 27 | gt_boxes = [] 28 | gt_classes = torch.jit.annotate(List[int], []) 29 | gt_scores = torch.jit.annotate(List[float], []) 30 | for i in image_labels.nonzero()[:,1]: 31 | max_index = cls_prob[:,i].argmax(dim=0) 32 | gt_boxes.append(boxes[max_index]) 33 | gt_classes.append(int(i)+1) 34 | gt_scores.append(float(cls_prob[max_index, i])) 35 | cls_prob[max_index] = 0 36 | max_overlaps, gt_assignment = ops.box_iou(boxes, torch.stack(gt_boxes)).max(dim=1) 37 | 38 | pseudo_labels = torch.gather(torch.tensor(gt_classes, dtype=torch.long, device=cls_prob.device), 0, gt_assignment) 39 | pseudo_labels[max_overlaps <= fg_thresh] = 0 40 | weights = torch.gather(torch.tensor(gt_scores, device=cls_prob.device), 0, gt_assignment) 41 | weights[max_overlaps < bg_thresh] = 0 42 | 43 | return pseudo_labels.detach(), weights.detach() 44 | 45 | 46 | def _get_top_ranking_propoals(probs): 47 | """Get top ranking proposals by k-means""" 48 | dev = probs.device 49 | kmeans = KMeans(n_clusters=5).fit(probs.cpu().numpy()) 50 | high_score_label = np.argmax(kmeans.cluster_centers_) 51 | 52 | index = np.where(kmeans.labels_ == high_score_label)[0] 53 | 54 | if len(index) == 0: 55 | index = np.array([np.argmax(probs)]) 56 | 57 | return torch.from_numpy(index).to(dev) 58 | 59 | 60 | def _get_graph_centers(boxes, cls_prob, im_labels): 61 | """Get graph centers.""" 62 | 63 | num_images, num_classes = im_labels.shape 64 | assert num_images == 1, 'batch size shoud be equal to 1' 65 | dev = cls_prob.device 66 | gt_boxes = torch.zeros((0, 4), dtype=boxes.dtype, device=dev) 67 | gt_classes = torch.zeros((0, 1), dtype=torch.long, device=dev) 68 | gt_scores = torch.zeros((0, 1), dtype=cls_prob.dtype, device=dev) 69 | for i in im_labels.nonzero()[:,1]: 70 | cls_prob_tmp = cls_prob[:, i] 71 | idxs = (cls_prob_tmp >= 0).nonzero()[:,0] 72 | idxs_tmp = _get_top_ranking_propoals(cls_prob_tmp[idxs].reshape(-1, 1)) 73 | idxs = idxs[idxs_tmp] 74 | boxes_tmp = boxes[idxs, :] 75 | cls_prob_tmp = cls_prob_tmp[idxs] 76 | 77 | graph = (ops.box_iou(boxes_tmp, boxes_tmp) > 0.4).float() 78 | 79 | keep_idxs = [] 80 | gt_scores_tmp = [] 81 | count = cls_prob_tmp.size(0) 82 | while True: 83 | order = graph.sum(dim=1).argsort(descending=True) 84 | tmp = order[0] 85 | keep_idxs.append(tmp) 86 | inds = (graph[tmp, :] > 0).nonzero()[:,0] 87 | gt_scores_tmp.append(cls_prob_tmp[inds].max()) 88 | 89 | graph[:, inds] = 0 90 | graph[inds, :] = 0 91 | count = count - len(inds) 92 | if count <= 5: 93 | break 94 | 95 | 
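# At this point keep_idxs holds the greedily selected cluster centres: each
# pass of the loop above takes the proposal with the highest degree in the
# IoU > 0.4 graph, records the best score within its neighbourhood, and
# removes that whole neighbourhood, stopping once five or fewer proposals
# remain. The lines below then keep at most the five highest-scoring centres
# as pseudo ground truth for class i.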
gt_boxes_tmp = boxes_tmp[keep_idxs, :].view(-1, 4).to(dev) 96 | gt_scores_tmp = torch.tensor(gt_scores_tmp, device=dev) 97 | 98 | keep_idxs_new = torch.from_numpy((gt_scores_tmp.argsort().to('cpu').numpy()[-1:(-1 - min(len(gt_scores_tmp), 5)):-1]).copy()).to(dev) 99 | 100 | gt_boxes = torch.cat((gt_boxes, gt_boxes_tmp[keep_idxs_new, :])) 101 | gt_scores = torch.cat((gt_scores, gt_scores_tmp[keep_idxs_new].reshape(-1, 1))) 102 | gt_classes = torch.cat((gt_classes, (i + 1) * torch.ones((len(keep_idxs_new), 1), dtype=torch.long, device=dev))) 103 | 104 | # If a proposal is chosen as a cluster center, 105 | # we simply delete a proposal from the candidata proposal pool, 106 | # because we found that the results of different strategies are similar and this strategy is more efficient 107 | another_tmp = idxs.to('cpu')[torch.tensor(keep_idxs)][keep_idxs_new.to('cpu')].numpy() 108 | cls_prob = torch.from_numpy(np.delete(cls_prob.to('cpu').numpy(), another_tmp, axis=0)).to(dev) 109 | boxes = torch.from_numpy(np.delete(boxes.to('cpu').numpy(), another_tmp, axis=0)).to(dev) 110 | 111 | proposals = {'gt_boxes' : gt_boxes.to(dev), 112 | 'gt_classes': gt_classes.to(dev), 113 | 'gt_scores': gt_scores.to(dev)} 114 | 115 | return proposals 116 | 117 | 118 | def _get_proposal_clusters(all_rois, proposals, im_labels, cls_prob): 119 | """Generate a random sample of RoIs comprising foreground and background 120 | examples. 121 | """ 122 | num_images, num_classes = im_labels.shape 123 | assert num_images == 1, 'batch size shoud be equal to 1' 124 | # overlaps: (rois x gt_boxes) 125 | gt_boxes = proposals['gt_boxes'] 126 | gt_labels = proposals['gt_classes'] 127 | gt_scores = proposals['gt_scores'] 128 | overlaps = ops.box_iou(all_rois.to(gt_boxes.device), gt_boxes) 129 | max_overlaps, gt_assignment = overlaps.max(dim=1) 130 | labels = gt_labels[gt_assignment, 0] 131 | cls_loss_weights = gt_scores[gt_assignment, 0] 132 | 133 | # Select foreground RoIs as those with >= FG_THRESH overlap 134 | fg_inds = (max_overlaps >= 0.5).nonzero()[:,0] 135 | 136 | # Select background RoIs as those with < FG_THRESH overlap 137 | bg_inds = (max_overlaps < 0.5).nonzero()[:,0] 138 | 139 | ig_inds = (max_overlaps < 0.1).nonzero()[:,0] 140 | cls_loss_weights[ig_inds] = 0.0 141 | 142 | labels[bg_inds] = 0 143 | gt_assignment[bg_inds] = -1 144 | 145 | img_cls_loss_weights = torch.zeros(gt_boxes.shape[0], dtype=cls_prob.dtype, device=cls_prob.device) 146 | pc_probs = torch.zeros(gt_boxes.shape[0], dtype=cls_prob.dtype, device=cls_prob.device) 147 | pc_labels = torch.zeros(gt_boxes.shape[0], dtype=torch.long, device=cls_prob.device) 148 | pc_count = torch.zeros(gt_boxes.shape[0], dtype=torch.long, device=cls_prob.device) 149 | 150 | for i in range(gt_boxes.shape[0]): 151 | po_index = (gt_assignment == i).nonzero()[:,0] 152 | img_cls_loss_weights[i] = torch.sum(cls_loss_weights[po_index]) 153 | pc_labels[i] = gt_labels[i, 0] 154 | pc_count[i] = len(po_index) 155 | pc_probs[i] = (cls_prob[po_index, pc_labels[i]]).mean() 156 | 157 | return labels, cls_loss_weights, gt_assignment, pc_labels, pc_probs, pc_count, img_cls_loss_weights 158 | 159 | 160 | @torch.no_grad() 161 | def pcl_label(boxes:torch.Tensor, cls_prob:torch.Tensor, im_labels:torch.Tensor, cls_prob_new:torch.Tensor): 162 | if cls_prob.shape[1] != im_labels.shape[1]: 163 | cls_prob = cls_prob[:, 1:] 164 | cls_prob = cls_prob.clamp(EPS, 1-EPS) 165 | cls_prob_new = cls_prob_new.clamp(EPS, 1-EPS) 166 | 167 | proposals = _get_graph_centers(boxes, cls_prob, im_labels) 168 | 169 | 
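# PCL labelling happens in two stages: _get_graph_centers (called above)
# mines per-class cluster centres from the supervising branch's scores, and
# _get_proposal_clusters (called below) assigns every RoI to its highest-IoU
# centre: foreground at IoU >= 0.5, background below that, with loss weights
# zeroed under IoU 0.1. The resulting per-cluster labels, counts, and
# weights are exactly what PCLFunction consumes.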
labels, cls_loss_weights, gt_assignment, pc_labels, pc_probs, \ 170 | pc_count, img_cls_loss_weights = _get_proposal_clusters(boxes, 171 | proposals, im_labels, cls_prob_new) 172 | 173 | return {'labels' : labels.reshape(1, -1), 174 | 'cls_loss_weights' : cls_loss_weights.reshape(1, -1), 175 | 'gt_assignment' : gt_assignment.reshape(1, -1), 176 | 'pc_labels' : pc_labels.reshape(1, -1), 177 | 'pc_probs' : pc_probs.reshape(1, -1), 178 | 'pc_count' : pc_count.reshape(1, -1), 179 | 'img_cls_loss_weights' : img_cls_loss_weights.reshape(1, -1), 180 | 'im_labels_real' : torch.cat((torch.tensor([[1.]], dtype=im_labels.dtype, device=im_labels.device), im_labels), dim=1)} 181 | 182 | 183 | class PCLFunction(torch.autograd.Function): 184 | @staticmethod 185 | def forward(ctx, pcl_probs, labels, cls_loss_weights, 186 | gt_assignment, pc_labels, pc_probs, pc_count, 187 | img_cls_loss_weights, im_labels): 188 | ctx.pcl_probs = pcl_probs 189 | ctx.labels = labels 190 | ctx.cls_loss_weights = cls_loss_weights 191 | ctx.gt_assignment = gt_assignment 192 | ctx.pc_labels = pc_labels 193 | ctx.pc_probs = pc_probs 194 | ctx.pc_count = pc_count 195 | ctx.img_cls_loss_weights = img_cls_loss_weights 196 | ctx.im_labels = im_labels 197 | 198 | batch_size, channels = pcl_probs.size() 199 | loss = 0 200 | ctx.mark_non_differentiable(labels, cls_loss_weights, 201 | gt_assignment, pc_labels, pc_probs, 202 | pc_count, img_cls_loss_weights, im_labels) 203 | 204 | for c in im_labels.nonzero()[:,1]: 205 | if c == 0: 206 | i = (labels[0,:] == 0).nonzero()[:,0] 207 | loss -= (cls_loss_weights[0, i] * pcl_probs[i,c].log()).sum() 208 | else: 209 | i = (pc_labels[0,:] == c).nonzero()[:,0] 210 | loss -= (img_cls_loss_weights[0, i] * pc_probs[0,i].log()).sum() 211 | 212 | return loss / batch_size 213 | 214 | @staticmethod 215 | def backward(ctx, grad_output): 216 | pcl_probs = ctx.pcl_probs 217 | labels = ctx.labels 218 | cls_loss_weights = ctx.cls_loss_weights 219 | gt_assignment = ctx.gt_assignment 220 | pc_labels = ctx.pc_labels 221 | pc_probs = ctx.pc_probs 222 | pc_count = ctx.pc_count 223 | img_cls_loss_weights = ctx.img_cls_loss_weights 224 | im_labels = ctx.im_labels 225 | 226 | grad_input = grad_output.new(pcl_probs.size()).zero_() 227 | 228 | batch_size, channels = pcl_probs.size() 229 | 230 | for c in im_labels.nonzero()[:,1]: 231 | i = (labels[0] == c) 232 | if c == 0: 233 | grad_input[i, c] = -cls_loss_weights[0, i] / pcl_probs[i, c] 234 | else: 235 | pc_index = gt_assignment[0, i] 236 | if (c != pc_labels[0, pc_index]).all(): 237 | print('labels mismatch.') 238 | grad_input[i, c] = -img_cls_loss_weights[0, pc_index] / (pc_count[0, pc_index] * pc_probs[0, pc_index]) 239 | 240 | grad_input /= batch_size 241 | return grad_input, None, None, None, None, None, None, None, None 242 | 243 | 244 | def scs(boxes): 245 | left_inside = boxes[:,0].view(-1,1) > boxes[:,0].view(1,-1) 246 | top_inside = boxes[:,1].view(-1,1) > boxes[:,1].view(1,-1) 247 | right_inside = boxes[:,2].view(-1,1) < boxes[:,2].view(1,-1) 248 | bottom_inside = boxes[:,3].view(-1,1) < boxes[:,3].view(1,-1) 249 | 250 | surrounded = left_inside & right_inside & top_inside & bottom_inside 251 | surrounded = surrounded.any(0) 252 | 253 | return (~surrounded).nonzero()[:,0] 254 | 255 | 256 | def scs_label(rois, predictions, image_labels): 257 | rois = rois.to(predictions[0].device) 258 | predictions = [p[:,1:] if p.size(-1) > image_labels.size(-1) else p for p in predictions] 259 | predictions = torch.stack(predictions) 260 | gt_classes = torch.zeros((0,), 
dtype=torch.long, device=predictions.device) 261 | gt_scores = torch.zeros((0,), dtype=predictions.dtype, device=predictions.device) 262 | gt_boxes = torch.zeros((0,4), dtype=rois.dtype, device=rois.device) 263 | for c in image_labels.nonzero()[:,1]: 264 | top_scores, top_idxs = predictions[:,:,c].max(dim=1) 265 | top_boxes = rois[top_idxs.flatten(0)] 266 | keep = scs(top_boxes) 267 | 268 | gt_scores = torch.cat([gt_scores, top_scores[keep]]) 269 | gt_classes = torch.cat([gt_classes, torch.full_like(keep, c+1, device=gt_classes.device)]) 270 | gt_boxes = torch.cat([gt_boxes, top_boxes[keep]]) 271 | 272 | predictions[:,top_idxs[keep],:] = 0 273 | 274 | keep = ops.boxes.batched_nms(gt_boxes, gt_scores, gt_classes, 0.1) 275 | gt_classes, gt_scores, gt_boxes = gt_classes[keep], gt_scores[keep], gt_boxes[keep] 276 | 277 | max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1) 278 | pseudo_labels = gt_classes.gather(-1, gt_assignment) 279 | pseudo_labels[max_overlap < 0.5] = 0 280 | weights = gt_scores.gather(-1, gt_assignment) 281 | 282 | return pseudo_labels, weights 283 | 284 | 285 | def midn_loss(prediction, image_labels, reduction): 286 | image_prediction = prediction.sum(dim=0, keepdim=True).clamp(EPS, 1-EPS) 287 | return F.binary_cross_entropy(image_prediction, image_labels, reduction=reduction) 288 | 289 | 290 | def wsddn_loss(predictions, rois, image_labels, **kwargs): 291 | losses = {} 292 | image_labels = image_labels.to(predictions[0].device) 293 | 294 | for i, prediction in enumerate(predictions): 295 | losses['midn' + str(i) + '_loss'] = midn_loss(prediction, image_labels, reduction='sum') 296 | 297 | return losses 298 | 299 | 300 | def oicr_loss(predictions, rois, image_labels, **kwargs): 301 | losses = {} 302 | dev = predictions[0].device 303 | image_labels = image_labels.to(dev) 304 | 305 | losses['midn_loss'] = midn_loss(predictions[0], image_labels, reduction='mean') 306 | 307 | pseudo_labels, weights = oicr_label(rois, predictions[0], image_labels) 308 | i = 0 309 | for prediction in predictions[1:-1]: 310 | losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(prediction, pseudo_labels, weights) 311 | pseudo_labels, weights = oicr_label(rois, prediction.softmax(-1), image_labels) 312 | i += 1 313 | losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(predictions[-1], pseudo_labels, weights) 314 | 315 | return losses 316 | 317 | 318 | def pcl_loss(predictions, rois, image_labels, **kwargs): 319 | losses = {} 320 | dev = predictions[0].device 321 | image_labels = image_labels.to(dev) 322 | 323 | losses['midn_loss'] = midn_loss(predictions[0], image_labels, reduction='mean') 324 | 325 | prev = predictions[0] 326 | pcl = PCLFunction.apply 327 | for i, pred in enumerate(predictions[1:]): 328 | pred = pred.softmax(-1) 329 | dct = pcl_label(rois, prev, image_labels, pred) 330 | args = [pred] + list(dct.values()) 331 | losses['ref' + str(i) + '_loss'] = pcl(*args) 332 | prev = pred 333 | 334 | return losses 335 | 336 | 337 | def instability_loss(predictions, rois, image_labels, **kwargs): 338 | losses = {} 339 | 340 | for i, prediction in enumerate(predictions[:3]): 341 | losses['midn' + str(i) + '_loss'] = midn_loss(prediction, image_labels, reduction='mean') 342 | 343 | pseudo_labels, weights = scs_label(rois, predictions[:3], image_labels) 344 | i = 0 345 | for prediction in predictions[3:-1]: 346 | losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(prediction, pseudo_labels, weights) 347 | pseudo_labels, weights = oicr_label(rois, 
prediction.softmax(-1), image_labels)
348 |         i += 1
349 |     losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(predictions[-1], pseudo_labels, weights)
350 | 
351 |     return losses
352 | 
353 | 
354 | def gt_injected_midn_loss(predictions, rois, gt_boxes, gt_classes):
355 |     max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1)
356 |     fg = max_overlap >= EPS
357 |     bg = ~fg
358 |     loss = -(1-predictions[bg]).log().mean()
359 | 
360 |     for c in gt_classes.unique():
361 |         targets = (gt_classes[gt_assignment] == c) & fg  # proposals whose best-matching GT box has class c
362 |         weight = max_overlap[targets]
363 |         weight = (weight - weight.min()) / (weight.max() - weight.min()).clamp(min=EPS)  # min-max normalise the overlaps
364 |         loss -= (weight * predictions[targets, c].log()).mean()
365 | 
366 |     return loss
367 | 
368 | 
369 | def gt_injected_oicr_loss(predictions, rois, gt_boxes, gt_classes):
370 |     losses = {'midn_loss': gt_injected_midn_loss(predictions[0], rois, gt_boxes, gt_classes)}
371 | 
372 |     for i, prediction in enumerate(predictions[1:]):
373 |         max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1)
374 |         class_assignment = gt_classes.gather(-1, gt_assignment) + 1
375 |         class_assignment[max_overlap < 0.5] = 0
376 |         max_overlap[max_overlap < 0.5] = 1 - max_overlap[max_overlap < 0.5]
377 |         losses['ref' + str(i) + '_loss'] = -(max_overlap * prediction.log_softmax(-1).gather(-1, class_assignment.unsqueeze(-1)).squeeze(-1)).mean()
378 | 
379 |     return losses
380 | 
381 | 
382 | def semi_supervised_oicr_loss(predictions, rois, image_labels, **kwargs):
383 |     if 'gt_boxes' in kwargs and 'gt_classes' in kwargs:
384 |         losses = gt_injected_oicr_loss(predictions, rois, kwargs['gt_boxes'], kwargs['gt_classes'])
385 |     else:
386 |         losses = oicr_loss(predictions, rois, image_labels, **kwargs)
387 | 
388 |     return losses
389 | 
390 | 
391 | LOSS_FUNCTIONS = {
392 |     'wsddn_loss': wsddn_loss,
393 |     'oicr_loss': oicr_loss,
394 |     'pcl_loss': pcl_loss,
395 |     'instability_loss': instability_loss,
396 |     'semi_supervised_oicr_loss': semi_supervised_oicr_loss
397 | }
--------------------------------------------------------------------------------
/wsod/models/.ipynb_checkpoints/melm-checkpoint.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torchvision import models, ops
4 | 
5 | 
6 | _backbones = {  # single leading underscore: double-underscore names get mangled inside class bodies
7 |     'vgg16': models.vgg16
8 | }
9 | 
10 | 
11 | class DiscoveryModule(nn.Module):
12 |     def __init__(self, in_features, out_features):
13 |         super().__init__()
14 |         self.lin = nn.Linear(in_features, out_features)
15 | 
16 |     def forward(self, x, rois, targets=None):
17 |         x = self.lin(x)
18 |         n, c = x.shape
19 |         x = x.view(-1).softmax(0).view(n,c)
20 |         boxes = rois
21 |         cliques = []
22 |         while x.size(0) > 0:
23 |             scores, classes = x.max(1)
24 |             top_score, top_idx = scores.max(0)
25 |             box = boxes[top_idx]
26 |             overlaps = ops.box_iou(box.view(1,4), boxes)
27 |             clique_ids = (overlaps > 0.7).nonzero()[:,1]
28 |             cliques.append(clique_ids)
29 |             keep = (overlaps <= 0.7).nonzero()[:,1]
30 |             boxes = boxes[keep]
31 |             x = x[keep]
32 | 
33 |         if self.training:
34 |             assert targets is not None, "Must have targets for training"
35 |             loss = 0
36 |             for clique in cliques:
37 |                 scores = x[clique]
38 |                 weights = 1. / len(cliques) * (scores / scores.sum(1, keepdim=True)).sum(0)
39 |                 clique_scores = scores.sum(0)
40 |                 loss -= ((weights * clique_scores) * targets).log().sum()
41 |                 loss -= ((1 - targets) * (1 - x).log()).sum()
42 |             return cliques, x, loss
43 |         else:
44 |             return cliques, x, 0
45 | 
46 | 
47 | class LocalisationModule(nn.Module):
48 |     # Not yet implemented in this checkpoint; a stub keeps the file importable.
49 |     def forward(self, x, g):
50 |         raise NotImplementedError
51 | 
52 | class MinEntropyLatentModel(nn.Module):
53 |     def __init__(self):
54 |         super().__init__()
55 |         base = _backbones['vgg16'](pretrained=True)
56 | 
57 |         self.convs = base.features[:-1]
58 |         self.pooler = ops.RoIPool((7,7), 1./16.)
59 |         self.fc = base.classifier[:-1]
60 | 
61 |         self.loc = LocalisationModule()
62 |         self.dis = DiscoveryModule(4096, 20)
63 | 
64 |     def predict_on_batch(self, images, rois):
65 |         x = self.convs(images)
66 |         x = self.pooler(x, rois)
67 |         x = x.flatten(1)
68 |         x = self.fc(x)
69 | 
70 |         for rois_per_image in rois:
71 |             n = rois_per_image.size(0)
72 |             x_i = x[:n]
73 |             x = x[n:]
74 |             cliques, g, loss = self.dis(x_i, rois_per_image)
75 |             l = self.loc(x_i, g)
--------------------------------------------------------------------------------
/wsod/models/.ipynb_checkpoints/models-checkpoint.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from torchvision import models as M, ops
5 | 
6 | from detectron2.data import transforms as T
7 | from detectron2.modeling import META_ARCH_REGISTRY
8 | from detectron2.structures import Instances, Boxes
9 | 
10 | from . import heads, utils
11 | from .losses import LOSS_FUNCTIONS
12 | from .backbones.vggm import vggm_1024
13 | 
14 | from typing import List
15 | 
16 | 
17 | _backbones = {
18 |     'alexnet': lambda p: extract_components(M.alexnet, p),
19 |     'vgg16': lambda p: extract_components(M.vgg16, p),
20 |     'vggm': lambda p: extract_components(vggm_1024, p),
21 | }
22 | 
23 | 
24 | def extract_components(model_fn, pretrained=False):
25 |     model = model_fn(pretrained)
26 |     convs = model.features[:-1]
27 |     fc = model.classifier[:-1]
28 |     return convs, fc
29 | 
30 | 
31 | def dilate_convs(convs):
32 |     i = -1
33 |     while not isinstance(convs[i], nn.MaxPool2d):
34 |         if isinstance(convs[i], nn.Conv2d):
35 |             convs[i].dilation = (2, 2)
36 |             convs[i].padding = (2, 2)
37 |         i -= 1
38 |     del convs[i]
39 |     return convs
40 | 
41 | 
42 | @META_ARCH_REGISTRY.register()
43 | class GeneralisedMIL(nn.Module):
44 |     def __init__(self, cfg):
45 |         super().__init__()
46 | 
47 |         self.device = cfg.MODEL.DEVICE
48 |         self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
49 | 
50 |         # Test mode details
51 |         self.test_nms_threshold = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
52 |         self.test_score_threshold = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
53 |         self.test_max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
54 |         self.test_out_layers = cfg.MODEL.PREDICTION_LAYERS
55 | 
56 |         # Normalization details
57 |         self.pixel_mean = torch.tensor(cfg.MODEL.PIXEL_MEAN).view(1,3,1,1).to(self.device)
58 |         self.pixel_std = torch.tensor(cfg.MODEL.PIXEL_STD).view(1,3,1,1).to(self.device)
59 | 
60 |         # Set up the model base
61 |         backbone_name = cfg.MODEL.BACKBONE.NAME
62 |         dilated = backbone_name.endswith('_dilated')
63 |         backbone_name = backbone_name[:-len('_dilated')] if dilated else backbone_name
64 | 
65 |         pretrained = cfg.MODEL.BACKBONE.WEIGHTS
66 |         convs, fc = _backbones[backbone_name](pretrained=='imagenet')
67 |         if pretrained not in ['imagenet', '']:
68 |             utils.load_weights(convs, fc, pretrained)
69 | 
70 |         if dilated:
71 |             convs = dilate_convs(convs)
72 | 
73 |         utils.freeze_convs(convs, cfg.MODEL.BACKBONE.FREEZE_CONVS)
74 |         self.convs = convs
75 |         self.fc = fc
76 | 
77 |         # Set up the pooling layer
78 |         scale = utils.get_conv_scale(convs)
79 |         res = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
80 |         pool_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
81 |         if pool_type.lower() == 'roipool':
82 |             self.pooler = ops.RoIPool((res, res), scale)
83 |         else:
84 |             raise NotImplementedError(f'Pooler type {pool_type} not implemented')
85 | 
86 |         # Set up the heads
87 |         fc_features = utils.get_out_features(fc)
88 |         nc, nd = cfg.MODEL.MIDN_HEAD.NUM_CLASSIFIER, cfg.MODEL.MIDN_HEAD.NUM_DETECTOR
89 |         if nc > 0 and nd > 0:
90 |             self.midn = heads.MultipleMidnHead(
91 |                 in_features=fc_features,
92 |                 out_features=self.num_classes,
93 |                 t_cls=cfg.MODEL.MIDN_HEAD.CLASSIFIER_TEMP,
94 |                 t_det=cfg.MODEL.MIDN_HEAD.DETECTOR_TEMP,
95 |                 k_cls=nc,
96 |                 k_det=nd
97 |             )
98 | 
99 |         nr = cfg.MODEL.REFINEMENT_HEAD.K
100 |         if nr > 0:
101 |             self.refinement = heads.RefinementHeads(
102 |                 in_features=fc_features,
103 |                 out_features=self.num_classes+1, #BG Class
104 |                 k=nr  # honour cfg.MODEL.REFINEMENT_HEAD.K
105 |             )
106 | 
107 |         if cfg.TEST.AUG.ENABLED:
108 |             self.tta = self._init_tta_fn(cfg)
109 |         else:
110 |             self.tta = lambda image, rois: ([image.unsqueeze(0)], [[rois]])  # no TTA: a single batch at the original scale
111 | 
112 |         self.build_loss = LOSS_FUNCTIONS[cfg.MODEL.LOSS_FN]
113 |         self.init_layers()
114 | 
115 |     @torch.no_grad()
116 |     def init_layers(self):
117 |         params = list(self.midn.classifiers.named_parameters())
118 |         if hasattr(self, 'refinement'):
119 |             params += list(self.refinement.named_parameters())
120 | 
121 |         if len(self.midn.detectors) > 1:
122 |             utils.orthogonal_init(self.midn.detectors)
123 |         else:
124 |             params += list(self.midn.detectors.named_parameters())
125 | 
126 |         for k, v in params:
127 |             if 'bias' in k:
128 |                 nn.init.zeros_(v)
129 |             else:
130 |                 nn.init.normal_(v, mean=0.0, std=0.01)
131 | 
132 |     def hack(self):
133 |         for p_source, p_dest in zip(torch.load('/home/Deep_Learner/work/cleaned/outputs/oicr_vgg_dilated/model_final.pth')['model'].values(),
134 |                                     self.parameters()):
135 |             p_dest.copy_(p_source)
136 | 
137 |     def to(self, device):
138 |         self.device = device
139 |         self.pixel_mean = self.pixel_mean.to(device)
140 |         self.pixel_std = self.pixel_std.to(device)
141 |         return super().to(device)
142 | 
143 |     def normalize(self, x:torch.Tensor) -> torch.Tensor:
144 |         return (x - self.pixel_mean) / self.pixel_std
145 | 
146 |     def predict_on_example(self, image:torch.Tensor, rois:List[torch.Tensor]) -> List[List[torch.Tensor]]:
147 |         x = self.normalize(image)
148 |         x = self.convs(x)
149 |         x = self.pooler(x, [r.type(x.dtype) for r in rois])
150 |         x = x.flatten(1)
151 |         x = self.fc(x)
152 |         r = self.refinement(x) if hasattr(self, 'refinement') else []
153 | 
154 |         outputs = []
155 |         for rois_per_image in rois:
156 |             n = rois_per_image.size(0)
157 |             x_i = x[:n]
158 |             r_i = [tmp[:n] for tmp in r]
159 |             x = x[n:]
160 |             r = [tmp[n:] for tmp in r]
161 |             m = self.midn(x_i) if hasattr(self, 'midn') and (self.training or not hasattr(self, 'refinement')) else []
162 |             outputs.append(m + r_i)
163 |         return outputs
164 | 
165 |     def _init_tta_fn(self, cfg):
166 |         max_size = cfg.TEST.AUG.MAX_SIZE
167 |         size_gens = [T.ResizeShortestEdge(sz, max_size, 'choice') for sz in cfg.TEST.AUG.MIN_SIZES]
168 |         flip = T.RandomFlip(1.0)
169 | 
170 |         def tta_fn(image, rois):
171 |             image = image.permute(1, 2, 0).to('cpu').numpy()
172 |             dtype = image.dtype
173 |             image = image.astype(np.uint8)
174 | 
175 |             out_images, out_rois = [], []
176 |             for tfm_gen in size_gens:
177 |                 resized_image, tfm = T.apply_transform_gens([tfm_gen], image)
178 |                 resized_rois = 
tfm.transforms[0].apply_box(rois.to('cpu').numpy()) 179 | 180 | if cfg.TEST.AUG.FLIP: 181 | flipped_image, tfm = T.apply_transform_gens([flip], resized_image) 182 | flipped_rois = tfm.transforms[0].apply_box(resized_rois) 183 | 184 | img_batch = torch.stack([ 185 | torch.from_numpy(resized_image.astype(dtype)).permute(2,0,1), 186 | torch.from_numpy(flipped_image.astype(dtype)).permute(2,0,1) 187 | ]) 188 | roi_batch = [ 189 | torch.from_numpy(resized_rois), 190 | torch.from_numpy(flipped_rois) 191 | ] 192 | else: 193 | img_batch = torch.from_numpy(resized_image.astype(dtype)).permute(2,0,1).unsqueeze(0) 194 | roi_batch = [torch.from_numpy(resized_rois),] 195 | out_images.append(img_batch) 196 | out_rois.append(roi_batch) 197 | return out_images, out_rois 198 | 199 | return tta_fn 200 | 201 | def forward(self, batch, use_gt=False): 202 | losses = {} 203 | batch_predictions = [] 204 | bs = len(batch) 205 | for element in batch: 206 | image, rois, gt_classes, gt_boxes = utils.extract_data(element) 207 | 208 | if self.training: 209 | predictions = self.predict_on_example(image.unsqueeze(0).to(self.device), [rois.to(self.device)]) 210 | image_labels = torch.zeros((1, self.num_classes,), dtype=image.dtype, device=self.device) 211 | image_labels[0, gt_classes.unique()] = 1. 212 | for prediction in predictions: 213 | loss = self.build_loss(prediction, rois, image_labels, gt_boxes=gt_boxes, gt_classes=gt_classes) 214 | for k, v in loss.items(): 215 | v = v.float() 216 | running_total = losses.setdefault(k, torch.zeros_like(v)) 217 | losses[k] = running_total + (v / bs) 218 | else: 219 | aug_images, aug_rois = self.tta(image, rois) 220 | scores = None 221 | for batch_images, batch_rois in zip(aug_images, aug_rois): 222 | predictions = self.predict_on_example(batch_images.to(self.device), [r.to(self.device) for r in batch_rois]) 223 | for prediction in predictions: 224 | if hasattr(self, 'refinement'): 225 | p = sum([pred.softmax(-1)[:,1:] for pred in prediction]) 226 | else: 227 | p = sum(prediction) 228 | if scores is None: 229 | scores = p 230 | else: 231 | scores += p 232 | boxes, scores, classes = utils.filter_predictions(scores, rois, self.test_nms_threshold, self.test_score_threshold) 233 | instances = Instances((element['height'], element['width'])) 234 | instances.scores = scores[:self.test_max_detections_per_image] 235 | instances.pred_classes = classes[:self.test_max_detections_per_image] 236 | instances.pred_boxes = Boxes(boxes[:self.test_max_detections_per_image]) 237 | batch_predictions.append({ 238 | 'instances': instances 239 | }) 240 | return losses if self.training else batch_predictions -------------------------------------------------------------------------------- /wsod/models/.ipynb_checkpoints/utils-checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | #from detectron2.layers import batched_nms 4 | from torchvision.ops.boxes import batched_nms 5 | 6 | 7 | @torch.no_grad() 8 | def orthogonal_init(layers, mean=0.0, std=0.01): 9 | k = len(layers) 10 | ou_f = layers[0].out_features 11 | in_f = layers[0].in_features 12 | random = torch.randn((ou_f, in_f, k)) * std + mean 13 | q, r = torch.qr(random, some=True) 14 | 15 | for detector, init in zip(layers, q.permute(2, 0, 1)): 16 | detector.weight.data.copy_(init) 17 | nn.init.zeros_(detector.bias) 18 | 19 | 20 | @torch.no_grad() 21 | def filter_predictions(scores, rois, nms_threshold, score_threshold): 22 | rois = rois.to(scores.device) 23 | idxs, 
cls_ids = (scores > score_threshold).nonzero().T 24 | cls_scores = scores[idxs, cls_ids] 25 | boxes = rois[idxs] 26 | keep = batched_nms(boxes, cls_scores, cls_ids, nms_threshold) 27 | return boxes[keep], cls_scores[keep], cls_ids[keep] 28 | 29 | 30 | @torch.no_grad() 31 | def load_weights(convs, fc, pretrained): 32 | m = torch.load(pretrained) 33 | for model_param, pretrained_param in zip(list(convs.parameters()) + list(fc.parameters()), 34 | m.parameters()): 35 | model_param.weight.copy_(pretrained_param.weight) 36 | model_param.bias.copy_(pretrained_param.bias) 37 | 38 | 39 | def get_conv_scale(convs): 40 | """ 41 | Determines the downscaling performed by a sequence of convolutional and pooling layers 42 | """ 43 | scale = 1. 44 | for c in convs: 45 | stride = getattr(c, 'stride', 1.) 46 | scale /= stride if isinstance(stride, (int, float)) else stride[0] 47 | return scale 48 | 49 | 50 | def get_out_features(fc): 51 | """ 52 | Determines the size of the output from a sequence of fully connected layers 53 | """ 54 | i = -1 55 | while i < 0: # will be set to out features to exit 56 | i = getattr(fc[i], 'out_features', i-1) 57 | return i 58 | 59 | 60 | def freeze_convs(convs, k): 61 | """ 62 | Freezes `k` conv layers 63 | """ 64 | i = 0 65 | while k > 0: 66 | if isinstance(convs[i], nn.Conv2d): 67 | k -= 1 68 | for p in convs[i].parameters(): 69 | p.requires_grad = False 70 | i += 1 71 | 72 | 73 | def extract_data(element): 74 | image = element['image'] 75 | 76 | instances = element.get('instances') 77 | if instances: 78 | gt_boxes = instances.gt_boxes 79 | gt_classes = instances.gt_classes 80 | else: 81 | gt_boxes = torch.zeros((0, 4), dtype=torch.float) 82 | gt_classes = torch.zeros((0,), dtype=torch.long) 83 | 84 | rois = element['proposals'].proposal_boxes.tensor 85 | 86 | return image, rois, gt_classes, gt_boxes -------------------------------------------------------------------------------- /wsod/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import GeneralisedMIL -------------------------------------------------------------------------------- /wsod/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/models/__pycache__/heads.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/__pycache__/heads.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/models/__pycache__/losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/__pycache__/losses.cpython-37.pyc -------------------------------------------------------------------------------- /wsod/models/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- 
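The spatial_scale handed to the RoI pooler has to match the cumulative stride of the backbone, and get_conv_scale above derives it by walking the conv/pool layers. A minimal sketch of how these utilities compose, assuming the checkpoint mirrors the real wsod/models/utils.py and that detectron2 and the repo are importable (the torchvision VGG16 with its final max-pool dropped has stride 16):

import torch
from torchvision import models, ops
from wsod.models import utils

convs = models.vgg16(pretrained=False).features[:-1]  # drop the final max-pool
scale = utils.get_conv_scale(convs)                   # 4 remaining pools -> 1/16 = 0.0625
pooler = ops.RoIPool((7, 7), scale)

feats = convs(torch.randn(1, 3, 224, 224))            # (1, 512, 14, 14)
rois = [torch.tensor([[0., 0., 64., 64.]])]           # one xyxy box for image 0
pooled = pooler(feats, rois)                          # (1, 512, 7, 7)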
/wsod/models/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/wsod/models/backbones/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/backbones/__init__.py
--------------------------------------------------------------------------------
/wsod/models/backbones/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/backbones/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/wsod/models/backbones/__pycache__/vggm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/models/backbones/__pycache__/vggm.cpython-37.pyc
--------------------------------------------------------------------------------
/wsod/models/backbones/vggm.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division, absolute_import
2 | import torch
3 | import torch.nn as nn
4 | from torch.autograd import Variable
5 | #from torch.legacy import nn as nnl
6 | import torch.utils.model_zoo as model_zoo
7 | 
8 | __all__ = ['vggm']
9 | 
10 | pretrained_settings = {
11 |     'vggm': {
12 |         'imagenet': {
13 |             'url': 'http://data.lip6.fr/cadene/pretrainedmodels/vggm-786f2434.pth',
14 |             'input_space': 'BGR',
15 |             'input_size': [3, 221, 221],
16 |             'input_range': [0, 255],
17 |             'mean': [123.68, 116.779, 103.939],
18 |             'std': [1, 1, 1],
19 |             'num_classes': 1000
20 |         }
21 |     }
22 | }
23 | 
24 | class SpatialCrossMapLRN(nn.Module):
25 |     def __init__(self, local_size=1, alpha=1.0, beta=0.75, k=1, ACROSS_CHANNELS=True):
26 |         super(SpatialCrossMapLRN, self).__init__()
27 |         self.ACROSS_CHANNELS = ACROSS_CHANNELS
28 |         if ACROSS_CHANNELS:
29 |             self.average=nn.AvgPool3d(kernel_size=(local_size, 1, 1),
30 |                                       stride=1,
31 |                                       padding=(int((local_size-1.0)/2), 0, 0))
32 |         else:
33 |             self.average=nn.AvgPool2d(kernel_size=local_size,
34 |                                       stride=1,
35 |                                       padding=int((local_size-1.0)/2))
36 |         self.alpha = alpha
37 |         self.beta = beta
38 |         self.k = k
39 | 
40 |     def forward(self, x):
41 |         if self.ACROSS_CHANNELS:
42 |             div = x.pow(2).unsqueeze(1)
43 |             div = self.average(div).squeeze(1)
44 |             div = div.mul(self.alpha).add(self.k).pow(self.beta)
45 |         else:
46 |             div = x.pow(2)
47 |             div = self.average(div)
48 |             div = div.mul(self.alpha).add(self.k).pow(self.beta)
49 |         x = x.div(div)
50 |         return x
51 | 
52 | class LambdaBase(nn.Sequential):
53 |     def __init__(self, fn, *args):
54 |         super(LambdaBase, self).__init__(*args)
55 |         self.lambda_func = fn
56 | 
57 |     def forward_prepare(self, input):
58 |         output = []
59 |         for module in self._modules.values():
60 |             output.append(module(input))
61 |         return output if output else input
62 | 
63 | class Lambda(LambdaBase):
64 |     def forward(self, input):
65 |         return self.lambda_func(self.forward_prepare(input))
66 | 
67 | class VGGM(nn.Module):
68 | 
69 |     def __init__(self, num_classes=1000, hdim=4096):
70 |         super(VGGM, self).__init__()
71 |         self.num_classes = num_classes
72 |         self.features = nn.Sequential(
73 |             nn.Conv2d(3,96,(7, 7),(2, 2)),
74 |             nn.ReLU(),
75 |             SpatialCrossMapLRN(5, 0.0005, 0.75, 2),
76 |             nn.MaxPool2d((3, 3),(2, 2),(0, 0),ceil_mode=True),
77 |             nn.Conv2d(96,256,(5, 5),(2, 2),(1, 1)),
78 |             nn.ReLU(),
79 |             SpatialCrossMapLRN(5, 0.0005, 0.75, 2),
80 |             nn.MaxPool2d((3, 3),(2, 2),(0, 0),ceil_mode=True),
81 |             nn.Conv2d(256,512,(3, 3),(1, 1),(1, 1)),
82 |             nn.ReLU(),
83 |             nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
84 |             nn.ReLU(),
85 |             nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
86 |             nn.ReLU(),
87 |             nn.MaxPool2d((3, 3),(2, 2),(0, 0),ceil_mode=True)
88 |         )
89 |         self.classifier = nn.Sequential(
90 |             nn.Linear(18432,4096),
91 |             nn.ReLU(),
92 |             nn.Dropout(0.5),
93 |             nn.Linear(4096,hdim),
94 |             nn.ReLU(),
95 |             nn.Dropout(0.5),
96 |             nn.Linear(int(hdim) ,num_classes)
97 |         )
98 | 
99 |     def forward(self, x):
100 |         x = self.features(x)
101 |         x = x.view(x.size(0), -1)
102 |         x = self.classifier(x)
103 |         return x
104 | 
105 | def vggm(num_classes=1000, pretrained='imagenet'):
106 |     if pretrained:
107 |         settings = pretrained_settings['vggm'][pretrained]
108 |         assert num_classes == settings['num_classes'], \
109 |             "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
110 | 
111 |         model = VGGM(num_classes=1000)
112 |         model.load_state_dict(model_zoo.load_url(settings['url']))
113 | 
114 |         model.input_space = settings['input_space']
115 |         model.input_size = settings['input_size']
116 |         model.input_range = settings['input_range']
117 |         model.mean = settings['mean']
118 |         model.std = settings['std']
119 |     else:
120 |         model = VGGM(num_classes=num_classes)
121 |     return model
122 | 
123 | def vggm_1024(pretrained=True, num_classes=1000):
124 |     m = VGGM(num_classes=num_classes, hdim=1024)
125 |     if pretrained:
126 |         d = torch.load('/home/Deep_Learner/work/cleaned/models/backbones/vggm1024-caffe.pt')
127 |         m.load_state_dict(d)
128 |     return m
--------------------------------------------------------------------------------
/wsod/models/cmil.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torchvision import models, ops
4 | 
5 | 
6 | class InstanceSelector(nn.Module):
7 |     def __init__(self, in_features, out_features, epochs=20, schedule='log'):
8 |         super().__init__()  # required before registering submodules on an nn.Module
9 |         self.fc = nn.Linear(in_features, out_features)
10 | 
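# partition() implements the continuation partitioning of C-MIL: proposals
# are sorted by predicted confidence, and in each round the most confident
# remaining proposal absorbs everything with IoU >= lamb into one subset.
# With lamb near 0 all proposals collapse into a single bag (standard MIL
# over the whole image); as lamb anneals towards 1, every proposal becomes
# its own subset. A tiny hypothetical illustration: for boxes
# [0,0,10,10], [1,1,10,10] and [20,20,30,30] with lamb = 0.5, the first two
# (IoU roughly 0.8) form one subset and the third forms its own.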
11 |     def partition(self, x, rois, lamb):
12 |         pred_scores, pred_classes = x.max(dim=1)
13 |         sort_idx = pred_scores.argsort(descending=True)
14 |         subsets = []
15 |         while sort_idx.size(0) > 0:
16 |             idx = sort_idx[0]
17 | 
18 |             overlaps = ops.box_iou(rois[idx].view(1,4), rois)
19 |             subset = (overlaps >= lamb).nonzero()[:,1]
20 |             keep = (overlaps < lamb).nonzero()[:,1]
21 | 
22 |             subsets.append(sort_idx[subset])
23 |             sort_idx = sort_idx[keep]
24 |             rois = rois[keep]
25 |         return subsets
26 | 
27 |     def forward(self, x, rois, lamb=1, targets=None):
28 |         x = self.fc(x)
29 |         partitions = self.partition(x, rois, lamb)
30 | 
31 |         if self.training:
32 |             # Convert 0,1 targets to -1,1
33 |             targets[targets == 0] = -1
34 |             losses = torch.zeros(x.size(-1), dtype=x.dtype, device=x.device)
35 |             for partition in partitions:
36 |                 scores = x[partition].mean(dim=0)
37 |                 partition_losses = 1 - targets * scores
38 |                 losses = torch.max(losses, partition_losses)
39 |             loss = losses.mean()
40 |         else:
41 |             loss = 0
42 |         return partitions, loss
43 | 
44 | 
45 | class DetectorEstimator(nn.Module):
46 |     def __init__(self, in_features, out_features):
47 |         super().__init__()
48 |         self.fc = nn.Linear(in_features, out_features)
49 | 
50 |     def forward(self, x, rois, lamb=1, target=None):
51 |         x = self.fc(x).softmax(-1)
52 | 
53 |         if self.training:
54 |             score = x[:,target]
55 |             top = score.argmax()
56 |             overlaps = ops.box_iou(rois[top].view(1,4), rois)
57 |             fg = overlaps >= 1 - lamb / 2
58 |             bg = overlaps < lamb / 2
59 |             # ignore lamb / 2 <= x < 1-lamb/2
60 |             loss = 0
61 |             loss -= score[fg].log().sum()
62 |             loss -= (1-score[bg]).log().sum()
63 |             return loss
64 |         else:
65 |             return x
66 | 
67 | 
68 | class ContinuationMIL(nn.Module):
69 |     def __init__(self, epochs=20, schedule='log'):
70 |         super().__init__()
71 | 
72 |         self.base = models.vgg16(pretrained=True)
73 |         self.convs = self.base.features[:-1]
74 |         self.pooler = ops.RoIPool((7,7), 1./16.)
75 |         self.fc = self.base.classifier[:-1]
76 | 
77 |         in_features = 4096  # width of the VGG16 fc7 layer kept by classifier[:-1]
78 |         self.instance_selector = InstanceSelector(in_features, 20)
79 |         self.detector_estimator = DetectorEstimator(in_features, 21)
80 | 
81 |         self.lamb = 0.0  # `lambda` is a reserved word in Python, so the continuation parameter is `lamb`
82 |         self.schedule = self.build_schedule(epochs, schedule)
83 | 
84 |     def build_schedule(self, epochs, schedule):
85 |         if schedule == 'linear':
86 |             return torch.linspace(0, 1, epochs)
87 |         elif schedule == 'log':
88 |             return torch.logspace(0, 1, epochs) / 10 - torch.linspace(0.1, 0, epochs)
89 |         elif schedule == 'sigmoid':
90 |             return torch.linspace(-13, 13, epochs).sigmoid()
91 |         elif schedule == 'exp':
92 |             backward = 1 - self.build_schedule(epochs, 'log')
93 |             return torch.tensor([backward[i] for i in range(-1, -(len(backward)+1), -1)])
94 |         elif schedule == 'piecewise':
95 |             return torch.tensor(
96 |                 [0.2] * (epochs // 5) +
97 |                 [0.4] * (epochs // 5) +
98 |                 [0.6] * (epochs // 5) +
99 |                 [0.8] * (epochs // 5) +
100 |                 [1.0] * (epochs - 4 * (epochs // 5))
101 |             )
102 | 
103 |     def step(self):
104 |         self.lamb = self.schedule[0]
105 |         self.schedule = self.schedule[1:]
--------------------------------------------------------------------------------
/wsod/models/heads.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | 
4 | 
5 | class MultipleMidnHead(nn.Module):
6 |     def __init__(self, in_features, out_features, t_cls=1., t_det=1., k_cls=1, k_det=1):
7 |         super().__init__()
8 |         self.in_features, self.out_features = in_features, out_features
9 |         self.classifiers = nn.ModuleList([nn.Linear(in_features, out_features) for _ in range(k_cls)])
10 |         self.detectors = nn.ModuleList([nn.Linear(in_features, out_features) for _ in range(k_det)])
11 |         self.t_cls = t_cls
12 |         self.t_det = t_det
13 |         self.k = k_cls * k_det
14 | 
15 |     def forward(self, x):
16 |         result = []
17 |         for cls in self.classifiers:
18 |             c = (cls(x) / self.t_cls).softmax(1)
19 |             for det in self.detectors:
20 |                 d = (det(x) / self.t_det).softmax(0)
21 |                 result.append(c * d)
22 |         return result
23 | 
24 | 
25 | class RefinementHeads(nn.Module):
26 |     def __init__(self, in_features, out_features, k=3):
27 |         super().__init__()
28 |         self.in_features, self.out_features = in_features, out_features
29 |         self.refinements = nn.ModuleList([nn.Linear(in_features, out_features) for _ in range(k)])
30 |         self.k = k
31 | 
32 |     def forward(self, x):
33 |         result = []
34 |         for refinement in self.refinements:
35 |             result.append(refinement(x))
36 |         return result
--------------------------------------------------------------------------------
/wsod/models/losses.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 | from torchvision import ops
6 | from sklearn.cluster import KMeans
7 | 
8 | from typing import Tuple, Dict, List
9 | 
10 | 
11 | EPS = 1e-12
12 | 
13 | 
14 | def weighted_softmax_with_loss(score:torch.Tensor, labels:torch.Tensor, weights:torch.Tensor) -> torch.Tensor:
15 |     loss = -weights * F.log_softmax(score, dim=-1).gather(-1, labels.long().unsqueeze(-1)).squeeze(-1)
16 |     valid_sum = weights.gt(EPS).float().sum()
17 |     if valid_sum < EPS:
18 |         return loss.sum() / loss.numel()
19 |     else:
20 |         return loss.sum() / valid_sum
21 | 
22 | 
23 | @torch.no_grad()
24 | def oicr_label(boxes:torch.Tensor, cls_prob:torch.Tensor, image_labels:torch.Tensor, fg_thresh:float=0.5, bg_thresh:float=-1) -> Tuple[torch.Tensor, torch.Tensor]:
25 |     boxes = boxes.to(cls_prob.device)
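# oicr_label() builds OICR's instance-level supervision: for each class
# present in the image it promotes the highest-scoring proposal of the
# supervising branch to pseudo ground truth, then every RoI inherits the
# label of its highest-IoU pseudo box (background when that overlap is
# <= fg_thresh) together with a loss weight equal to the pseudo box's
# score. In oicr_loss the branches are chained, so branch k is supervised
# by the softmaxed output of branch k-1 and the labels sharpen as
# refinement progresses.
26 | 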
-------------------------------------------------------------------------------- /wsod/models/losses.py: --------------------------------------------------------------------------------
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import ops
from sklearn.cluster import KMeans

from typing import Tuple, Dict, List


EPS = 1e-12


def weighted_softmax_with_loss(score:torch.Tensor, labels:torch.Tensor, weights:torch.Tensor) -> torch.Tensor:
    loss = -weights * F.log_softmax(score, dim=-1).gather(-1, labels.long().unsqueeze(-1)).squeeze(-1)
    valid_sum = weights.gt(EPS).float().sum()
    if valid_sum < EPS:
        return loss.sum() / loss.numel()
    else:
        return loss.sum() / valid_sum


@torch.no_grad()
def oicr_label(boxes:torch.Tensor, cls_prob:torch.Tensor, image_labels:torch.Tensor, fg_thresh:float=0.5, bg_thresh:float=-1) -> Tuple[torch.Tensor, torch.Tensor]:
    boxes = boxes.to(cls_prob.device)
    # Drop the background column if present so class i lines up with image_labels[:, i]
    cls_prob = (cls_prob if cls_prob.size(-1) == image_labels.size(-1) else cls_prob[..., 1:]).clone()
    gt_boxes = []
    gt_classes = torch.jit.annotate(List[int], [])
    gt_scores = torch.jit.annotate(List[float], [])
    for i in image_labels.nonzero()[:,1]:
        max_index = cls_prob[:,i].argmax(dim=0)
        gt_boxes.append(boxes[max_index])
        gt_classes.append(int(i)+1)
        gt_scores.append(float(cls_prob[max_index, i]))
        cls_prob[max_index] = 0  # zero the chosen row so later classes pick other proposals
    max_overlaps, gt_assignment = ops.box_iou(boxes, torch.stack(gt_boxes)).max(dim=1)

    pseudo_labels = torch.gather(torch.tensor(gt_classes, dtype=torch.long, device=cls_prob.device), 0, gt_assignment)
    pseudo_labels[max_overlaps <= fg_thresh] = 0
    weights = torch.gather(torch.tensor(gt_scores, device=cls_prob.device), 0, gt_assignment)
    weights[max_overlaps < bg_thresh] = 0

    return pseudo_labels.detach(), weights.detach()


def _get_top_ranking_proposals(probs):
    """Get top ranking proposals by k-means"""
    dev = probs.device
    probs_np = probs.cpu().numpy()
    kmeans = KMeans(n_clusters=5).fit(probs_np)
    high_score_label = np.argmax(kmeans.cluster_centers_)

    index = np.where(kmeans.labels_ == high_score_label)[0]

    if len(index) == 0:
        index = np.array([np.argmax(probs_np)])

    return torch.from_numpy(index).to(dev)


def _get_graph_centers(boxes, cls_prob, im_labels):
    """Get graph centers."""

    num_images, num_classes = im_labels.shape
    assert num_images == 1, 'batch size should be equal to 1'
    dev = cls_prob.device
    gt_boxes = torch.zeros((0, 4), dtype=boxes.dtype, device=dev)
    gt_classes = torch.zeros((0, 1), dtype=torch.long, device=dev)
    gt_scores = torch.zeros((0, 1), dtype=cls_prob.dtype, device=dev)
    for i in im_labels.nonzero()[:,1]:
        cls_prob_tmp = cls_prob[:, i]
        idxs = (cls_prob_tmp >= 0).nonzero()[:,0]
        idxs_tmp = _get_top_ranking_proposals(cls_prob_tmp[idxs].reshape(-1, 1))
        idxs = idxs[idxs_tmp]
        boxes_tmp = boxes[idxs, :]
        cls_prob_tmp = cls_prob_tmp[idxs]

        graph = (ops.box_iou(boxes_tmp, boxes_tmp) > 0.4).float()

        keep_idxs = []
        gt_scores_tmp = []
        count = cls_prob_tmp.size(0)
        while True:
            order = graph.sum(dim=1).argsort(descending=True)
            tmp = order[0]
            keep_idxs.append(tmp)
            inds = (graph[tmp, :] > 0).nonzero()[:,0]
            gt_scores_tmp.append(cls_prob_tmp[inds].max())

            graph[:, inds] = 0
            graph[inds, :] = 0
            count = count - len(inds)
            if count <= 5:
                break

        gt_boxes_tmp = boxes_tmp[keep_idxs, :].view(-1, 4).to(dev)
        gt_scores_tmp = torch.tensor(gt_scores_tmp, device=dev)

        # Keep at most the five highest-scoring centers, in descending score order
        keep_idxs_new = torch.from_numpy((gt_scores_tmp.argsort().to('cpu').numpy()[-1:(-1 - min(len(gt_scores_tmp), 5)):-1]).copy()).to(dev)

        gt_boxes = torch.cat((gt_boxes, gt_boxes_tmp[keep_idxs_new, :]))
        gt_scores = torch.cat((gt_scores, gt_scores_tmp[keep_idxs_new].reshape(-1, 1)))
        gt_classes = torch.cat((gt_classes, (i + 1) * torch.ones((len(keep_idxs_new), 1), dtype=torch.long, device=dev)))

        # If a proposal is chosen as a cluster center, we simply delete it from the
        # candidate proposal pool, because we found that the results of different
        # strategies are similar and this strategy is more efficient
        another_tmp = idxs.to('cpu')[torch.tensor(keep_idxs)][keep_idxs_new.to('cpu')].numpy()
        cls_prob = torch.from_numpy(np.delete(cls_prob.to('cpu').numpy(), another_tmp, axis=0)).to(dev)
        boxes = torch.from_numpy(np.delete(boxes.to('cpu').numpy(), another_tmp, axis=0)).to(dev)

    proposals = {'gt_boxes' : gt_boxes.to(dev),
                 'gt_classes': gt_classes.to(dev),
                 'gt_scores': gt_scores.to(dev)}

    return proposals


def _get_proposal_clusters(all_rois, proposals, im_labels, cls_prob):
    """Assign each RoI to a pseudo-ground-truth cluster and compute the per-RoI
    labels and loss weights, plus per-cluster statistics.
    """
    num_images, num_classes = im_labels.shape
    assert num_images == 1, 'batch size should be equal to 1'
    # overlaps: (rois x gt_boxes)
    gt_boxes = proposals['gt_boxes']
    gt_labels = proposals['gt_classes']
    gt_scores = proposals['gt_scores']
    overlaps = ops.box_iou(all_rois.to(gt_boxes.device), gt_boxes)
    max_overlaps, gt_assignment = overlaps.max(dim=1)
    labels = gt_labels[gt_assignment, 0]
    cls_loss_weights = gt_scores[gt_assignment, 0]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = (max_overlaps >= 0.5).nonzero()[:,0]

    # Select background RoIs as those with < FG_THRESH overlap
    bg_inds = (max_overlaps < 0.5).nonzero()[:,0]

    # RoIs with very low overlap are ignored entirely (zero loss weight)
    ig_inds = (max_overlaps < 0.1).nonzero()[:,0]
    cls_loss_weights[ig_inds] = 0.0

    labels[bg_inds] = 0
    gt_assignment[bg_inds] = -1

    img_cls_loss_weights = torch.zeros(gt_boxes.shape[0], dtype=cls_prob.dtype, device=cls_prob.device)
    pc_probs = torch.zeros(gt_boxes.shape[0], dtype=cls_prob.dtype, device=cls_prob.device)
    pc_labels = torch.zeros(gt_boxes.shape[0], dtype=torch.long, device=cls_prob.device)
    pc_count = torch.zeros(gt_boxes.shape[0], dtype=torch.long, device=cls_prob.device)

    for i in range(gt_boxes.shape[0]):
        po_index = (gt_assignment == i).nonzero()[:,0]
        img_cls_loss_weights[i] = torch.sum(cls_loss_weights[po_index])
        pc_labels[i] = gt_labels[i, 0]
        pc_count[i] = len(po_index)
        pc_probs[i] = (cls_prob[po_index, pc_labels[i]]).mean()

    return labels, cls_loss_weights, gt_assignment, pc_labels, pc_probs, pc_count, img_cls_loss_weights


@torch.no_grad()
def pcl_label(boxes:torch.Tensor, cls_prob:torch.Tensor, im_labels:torch.Tensor, cls_prob_new:torch.Tensor):
    if cls_prob.shape[1] != im_labels.shape[1]:
        cls_prob = cls_prob[:, 1:]
    cls_prob = cls_prob.clamp(EPS, 1-EPS)
    cls_prob_new = cls_prob_new.clamp(EPS, 1-EPS)

    proposals = _get_graph_centers(boxes, cls_prob, im_labels)

    labels, cls_loss_weights, gt_assignment, pc_labels, pc_probs, \
        pc_count, img_cls_loss_weights = _get_proposal_clusters(boxes,
            proposals, im_labels, cls_prob_new)

    return {'labels' : labels.reshape(1, -1),
            'cls_loss_weights' : cls_loss_weights.reshape(1, -1),
            'gt_assignment' : gt_assignment.reshape(1, -1),
            'pc_labels' : pc_labels.reshape(1, -1),
            'pc_probs' : pc_probs.reshape(1, -1),
            'pc_count' : pc_count.reshape(1, -1),
            'img_cls_loss_weights' : img_cls_loss_weights.reshape(1, -1),
            'im_labels_real' : torch.cat((torch.tensor([[1.]], dtype=im_labels.dtype, device=im_labels.device), im_labels), dim=1)}
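
# Illustrative sketch (not in the original file): the shapes pcl_label expects
# when chaining refinement branches, with N proposals and C classes (the values
# are assumptions for the example).
#
#   boxes        : (N, 4) xyxy proposals
#   cls_prob     : (N, C) scores from the previous branch (a background
#                  column, if present, is stripped internally)
#   cls_prob_new : (N, C + 1) softmax of the current branch, background first
#   im_labels    : (1, C) binary image-level labels
#   dct = pcl_label(boxes, cls_prob, im_labels, cls_prob_new)
#   # every entry of dct is (1, N) or (1, num_centers), ready for PCLFunction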


class PCLFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, pcl_probs, labels, cls_loss_weights,
                gt_assignment, pc_labels, pc_probs, pc_count,
                img_cls_loss_weights, im_labels):
        ctx.pcl_probs = pcl_probs
        ctx.labels = labels
        ctx.cls_loss_weights = cls_loss_weights
        ctx.gt_assignment = gt_assignment
        ctx.pc_labels = pc_labels
        ctx.pc_probs = pc_probs
        ctx.pc_count = pc_count
        ctx.img_cls_loss_weights = img_cls_loss_weights
        ctx.im_labels = im_labels

        batch_size, channels = pcl_probs.size()
        loss = 0
        ctx.mark_non_differentiable(labels, cls_loss_weights,
                                    gt_assignment, pc_labels, pc_probs,
                                    pc_count, img_cls_loss_weights, im_labels)

        for c in im_labels.nonzero()[:,1]:
            if c == 0:
                # Background: per-RoI weighted log-likelihood
                i = (labels[0,:] == 0).nonzero()[:,0]
                loss -= (cls_loss_weights[0, i] * pcl_probs[i,c].log()).sum()
            else:
                # Foreground: per-cluster weighted log-likelihood
                i = (pc_labels[0,:] == c).nonzero()[:,0]
                loss -= (img_cls_loss_weights[0, i] * pc_probs[0,i].log()).sum()

        return loss / batch_size

    @staticmethod
    def backward(ctx, grad_output):
        pcl_probs = ctx.pcl_probs
        labels = ctx.labels
        cls_loss_weights = ctx.cls_loss_weights
        gt_assignment = ctx.gt_assignment
        pc_labels = ctx.pc_labels
        pc_probs = ctx.pc_probs
        pc_count = ctx.pc_count
        img_cls_loss_weights = ctx.img_cls_loss_weights
        im_labels = ctx.im_labels

        grad_input = grad_output.new(pcl_probs.size()).zero_()

        batch_size, channels = pcl_probs.size()

        for c in im_labels.nonzero()[:,1]:
            i = (labels[0] == c)
            if c == 0:
                grad_input[i, c] = -cls_loss_weights[0, i] / pcl_probs[i, c]
            else:
                pc_index = gt_assignment[0, i]
                if (c != pc_labels[0, pc_index]).any():  # warn if any cluster label disagrees with c
                    print('labels mismatch.')
                grad_input[i, c] = -img_cls_loss_weights[0, pc_index] / (pc_count[0, pc_index] * pc_probs[0, pc_index])

        grad_input /= batch_size
        return grad_input, None, None, None, None, None, None, None, None


def scs(boxes):
    """Surrounded-box suppression: drop boxes that strictly contain another box."""
    left_inside = boxes[:,0].view(-1,1) > boxes[:,0].view(1,-1)
    top_inside = boxes[:,1].view(-1,1) > boxes[:,1].view(1,-1)
    right_inside = boxes[:,2].view(-1,1) < boxes[:,2].view(1,-1)
    bottom_inside = boxes[:,3].view(-1,1) < boxes[:,3].view(1,-1)

    surrounded = left_inside & right_inside & top_inside & bottom_inside
    surrounded = surrounded.any(0)  # surrounded[j]: box j strictly contains some other box

    return (~surrounded).nonzero()[:,0]


def scs_label(rois, predictions, image_labels):
    rois = rois.to(predictions[0].device)
    predictions = [p[:,1:] if p.size(-1) > image_labels.size(-1) else p for p in predictions]
    predictions = torch.stack(predictions)
    gt_classes = torch.zeros((0,), dtype=torch.long, device=predictions.device)
    gt_scores = torch.zeros((0,), dtype=predictions.dtype, device=predictions.device)
    gt_boxes = torch.zeros((0,4), dtype=rois.dtype, device=rois.device)
    for c in image_labels.nonzero()[:,1]:
        top_scores, top_idxs = predictions[:,:,c].max(dim=1)
        top_boxes = rois[top_idxs.flatten(0)]
        keep = scs(top_boxes)

        gt_scores = torch.cat([gt_scores, top_scores[keep]])
        gt_classes = torch.cat([gt_classes, torch.full_like(keep, c+1, device=gt_classes.device)])
        gt_boxes = torch.cat([gt_boxes, top_boxes[keep]])

        predictions[:,top_idxs[keep],:] = 0

    keep = ops.boxes.batched_nms(gt_boxes, gt_scores, gt_classes, 0.1)
    gt_classes, gt_scores, gt_boxes = gt_classes[keep], gt_scores[keep], gt_boxes[keep]

    max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1)
    pseudo_labels = gt_classes.gather(-1, gt_assignment)
    pseudo_labels[max_overlap < 0.5] = 0
    weights = gt_scores.gather(-1, gt_assignment)

    return pseudo_labels, weights
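
# Illustrative sketch (not in the original file): scs on three boxes where the
# third strictly contains the first; the containing box is suppressed.
#
#   boxes = torch.tensor([[10., 10., 20., 20.],
#                         [30., 30., 40., 40.],
#                         [ 5.,  5., 25., 25.]])
#   scs(boxes)   # -> tensor([0, 1]); box 2 surrounds box 0, so it is dropped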


def midn_loss(prediction, image_labels, reduction):
    # Sum proposal scores into a single image-level prediction per class
    image_prediction = prediction.sum(dim=0, keepdim=True).clamp(EPS, 1-EPS)
    return F.binary_cross_entropy(image_prediction, image_labels, reduction=reduction)


def wsddn_loss(predictions, rois, image_labels, **kwargs):
    losses = {}
    image_labels = image_labels.to(predictions[0].device)

    for i, prediction in enumerate(predictions):
        losses['midn' + str(i) + '_loss'] = midn_loss(prediction, image_labels, reduction='sum')

    return losses


def oicr_loss(predictions, rois, image_labels, **kwargs):
    losses = {}
    dev = predictions[0].device
    image_labels = image_labels.to(dev)

    losses['midn_loss'] = midn_loss(predictions[0], image_labels, reduction='mean')

    # Each refinement branch is supervised by pseudo-labels mined from the
    # previous branch's scores
    pseudo_labels, weights = oicr_label(rois, predictions[0], image_labels)
    i = 0
    for prediction in predictions[1:-1]:
        losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(prediction, pseudo_labels, weights)
        pseudo_labels, weights = oicr_label(rois, prediction.softmax(-1), image_labels)
        i += 1
    losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(predictions[-1], pseudo_labels, weights)

    return losses


def pcl_loss(predictions, rois, image_labels, **kwargs):
    losses = {}
    dev = predictions[0].device
    image_labels = image_labels.to(dev)

    losses['midn_loss'] = midn_loss(predictions[0], image_labels, reduction='mean')

    prev = predictions[0]
    pcl = PCLFunction.apply
    for i, pred in enumerate(predictions[1:]):
        pred = pred.softmax(-1)
        dct = pcl_label(rois, prev, image_labels, pred)
        args = [pred] + list(dct.values())
        losses['ref' + str(i) + '_loss'] = pcl(*args)
        prev = pred

    return losses


def instability_loss(predictions, rois, image_labels, **kwargs):
    losses = {}
    image_labels = image_labels.to(predictions[0].device)

    for i, prediction in enumerate(predictions[:3]):
        losses['midn' + str(i) + '_loss'] = midn_loss(prediction, image_labels, reduction='mean')

    pseudo_labels, weights = scs_label(rois, predictions[:3], image_labels)
    i = 0
    for prediction in predictions[3:-1]:
        losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(prediction, pseudo_labels, weights)
        pseudo_labels, weights = oicr_label(rois, prediction.softmax(-1), image_labels)
        i += 1
    losses['ref' + str(i) + '_loss'] = weighted_softmax_with_loss(predictions[-1], pseudo_labels, weights)

    return losses


def gt_injected_midn_loss(predictions, rois, gt_boxes, gt_classes):
    max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1)
    fg = max_overlap >= EPS
    bg = ~fg
    loss = -(1 - predictions[bg]).log().mean()

    for c in gt_classes.unique():
        # Foreground RoIs whose assigned ground-truth box has class c
        targets = (gt_classes[gt_assignment] == c) & fg
        # Min-max normalise the overlaps into [0, 1] weights
        weight = max_overlap[targets]
        weight = (weight - weight.min()) / (weight.max() - weight.min())
        loss -= (weight * predictions[targets, c].log()).mean()

    return loss


def gt_injected_oicr_loss(predictions, rois, gt_boxes, gt_classes):
    losses = {}
    losses['midn_loss'] = gt_injected_midn_loss(predictions[0], rois, gt_boxes, gt_classes)

    # The assignment is identical for every refinement branch, so compute it once
    max_overlap, gt_assignment = ops.box_iou(rois, gt_boxes).max(dim=1)
    class_assignment = gt_classes.gather(-1, gt_assignment) + 1
    bg = max_overlap < 0.5
    class_assignment[bg] = 0
    max_overlap[bg] = 1 - max_overlap[bg]  # weight background RoIs by 1 - IoU
    for i, prediction in enumerate(predictions[1:]):
        log_probs = prediction.log_softmax(-1).gather(-1, class_assignment.unsqueeze(-1)).squeeze(-1)
        losses['ref' + str(i) + '_loss'] = -(max_overlap * log_probs).mean()

    return losses


def semi_supervised_oicr_loss(predictions, rois, image_labels, **kwargs):
    if 'gt_boxes' in kwargs and 'gt_classes' in kwargs:
        losses = gt_injected_oicr_loss(predictions, rois, kwargs['gt_boxes'], kwargs['gt_classes'])
    else:
        losses = oicr_loss(predictions, rois, image_labels, **kwargs)

    return losses


LOSS_FUNCTIONS = {
    'wsddn_loss': wsddn_loss,
    'oicr_loss': oicr_loss,
    'pcl_loss': pcl_loss,
    'instability_loss': instability_loss,
    'semi_supervised_oicr_loss': semi_supervised_oicr_loss
}
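
# Illustrative sketch (not in the original file): one OICR step. The sizes and
# values are assumptions for the example only.
#
#   rois = torch.rand(300, 4) * 200
#   rois[:, 2:] += rois[:, :2]                               # valid xyxy boxes
#   midn = torch.rand(300, 20) / 300                         # WSDDN-style scores, no bg
#   ref1, ref2 = torch.randn(300, 21), torch.randn(300, 21)  # refinement logits with bg
#   image_labels = torch.zeros(1, 20)
#   image_labels[0, [2, 7]] = 1.                             # classes present in the image
#   losses = oicr_loss([midn, ref1, ref2], rois, image_labels)
#   # -> {'midn_loss': ..., 'ref0_loss': ..., 'ref1_loss': ...}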
-------------------------------------------------------------------------------- /wsod/models/melm.py: --------------------------------------------------------------------------------
import torch
from torch import nn
from torchvision import models, ops


# Single leading underscore: a double-underscore name would be mangled when
# looked up inside a class body (e.g. in MinEntropyLatentModel.__init__)
_backbones = {
    'vgg16': models.vgg16
}


class DiscoveryModule(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.lin = nn.Linear(in_features, out_features)

    def forward(self, x, rois, targets=None):
        x = self.lin(x)
        n, c = x.shape
        x = x.view(-1).softmax(0).view(n, c)  # joint softmax over all (proposal, class) pairs
        boxes = rois
        scores_left = x
        idxs = torch.arange(n, device=x.device)  # original indices of the surviving proposals
        cliques = []
        while scores_left.size(0) > 0:
            scores, classes = scores_left.max(1)
            top_score, top_idx = scores.max(0)
            box = boxes[top_idx]
            overlaps = ops.box_iou(box.view(1,4), boxes)
            clique_ids = (overlaps > 0.7).nonzero()[:,1]
            cliques.append(idxs[clique_ids])  # record original indices so `x` can be indexed later
            keep = (overlaps <= 0.7).nonzero()[:,1]
            boxes = boxes[keep]
            idxs = idxs[keep]
            scores_left = scores_left[keep]

        if self.training:
            assert targets is not None, "Must have targets for training"
            loss = 0
            for clique in cliques:
                scores = x[clique]
                weights = 1. / len(cliques) * (scores / scores.sum(1, keepdim=True)).sum(0)
                clique_scores = scores.sum(0)
                # the targets mask keeps absent classes out of the log
                loss -= (targets * (weights * clique_scores).clamp(min=1e-12).log()).sum()
            loss -= ((1 - targets) * (1 - x).clamp(min=1e-12).log()).sum()
            return cliques, x, loss
        else:
            return cliques, x, 0


class LocalisationModule(nn.Module):
    # TODO: the localisation (min-entropy) branch is not implemented yet;
    # this stub keeps the module importable and constructible
    def forward(self, x, g):
        raise NotImplementedError


class MinEntropyLatentModel(nn.Module):
    def __init__(self):
        super().__init__()
        base = _backbones['vgg16'](pretrained=True)

        self.convs = base.features[:-1]
        self.pooler = ops.RoIPool((7,7), 1./16.)
        self.fc = base.classifier[:-1]

        self.loc = LocalisationModule()
        self.dis = DiscoveryModule(4096, 20)

    def predict_on_batch(self, images, rois):
        x = self.convs(images)
        x = self.pooler(x, rois)
        x = x.flatten(1)
        x = self.fc(x)

        outputs = []
        for rois_per_image in rois:
            n = rois_per_image.size(0)
            x_i = x[:n]
            x = x[n:]
            cliques, g, loss = self.dis(x_i, rois_per_image)
            l = self.loc(x_i, g)
            outputs.append((cliques, g, l, loss))
        return outputs
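
# Illustrative sketch (not in the original file): exercising the discovery
# module on random inputs. All sizes and values are assumptions for the example.
#
#   dis = DiscoveryModule(4096, 20).eval()
#   feats = torch.randn(50, 4096)
#   rois = torch.rand(50, 4) * 300
#   rois[:, 2:] += rois[:, :2]              # ensure x2 > x1 and y2 > y1
#   cliques, g, _ = dis(feats, rois)
#   # `cliques` partitions the 50 proposals into IoU > 0.7 groups around
#   # successive highest-scoring boxes; `g` is the (50, 20) global softmax table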
-------------------------------------------------------------------------------- /wsod/models/models.py: --------------------------------------------------------------------------------
import numpy as np
import torch
from torch import nn
from torchvision import models as M, ops

from detectron2.data import transforms as T
from detectron2.modeling import META_ARCH_REGISTRY
from detectron2.structures import Instances, Boxes

from . import heads, utils
from .losses import LOSS_FUNCTIONS
from .backbones.vggm import vggm_1024

from typing import List


_backbones = {
    'alexnet': lambda p: extract_components(M.alexnet, p),
    'vgg16': lambda p: extract_components(M.vgg16, p),
    'vggm': lambda p: extract_components(vggm_1024, p),
}


def extract_components(model_fn, pretrained=False):
    """Split a torchvision-style classifier into its conv trunk and FC head,
    dropping the final pooling layer and the final classification layer."""
    model = model_fn(pretrained)
    convs = model.features[:-1]
    fc = model.classifier[:-1]
    return convs, fc


def dilate_convs(convs):
    """Dilate the conv layers after the last max-pool, then delete that pool,
    trading feature stride for resolution without changing the receptive field."""
    i = -1
    while not isinstance(convs[i], nn.MaxPool2d):
        if isinstance(convs[i], nn.Conv2d):
            convs[i].dilation = (2, 2)
            convs[i].padding = (2, 2)
        i -= 1
    del convs[i]
    return convs


@META_ARCH_REGISTRY.register()
class GeneralisedMIL(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.device = cfg.MODEL.DEVICE
        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES

        # Test mode details
        self.test_nms_threshold = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
        self.test_score_threshold = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
        self.test_max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
        self.test_out_layers = cfg.MODEL.PREDICTION_LAYERS

        # Normalization details
        self.pixel_mean = torch.tensor(cfg.MODEL.PIXEL_MEAN).view(1,3,1,1).to(self.device)
        self.pixel_std = torch.tensor(cfg.MODEL.PIXEL_STD).view(1,3,1,1).to(self.device)

        # Set up the model base
        backbone_name = cfg.MODEL.BACKBONE.NAME
        dilated = backbone_name.endswith('_dilated')
        backbone_name = backbone_name[:-len('_dilated')] if dilated else backbone_name

        pretrained = cfg.MODEL.BACKBONE.WEIGHTS
        convs, fc = _backbones[backbone_name](pretrained=='imagenet')
        if pretrained not in ['imagenet', '']:
            utils.load_weights(convs, fc, pretrained)

        if dilated:
            convs = dilate_convs(convs)

        utils.freeze_convs(convs, cfg.MODEL.BACKBONE.FREEZE_CONVS)
        self.convs = convs
        self.fc = fc

        # Set up the pooling layer
        scale = utils.get_conv_scale(convs)
        res = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pool_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        if pool_type.lower() == 'roipool':
            self.pooler = ops.RoIPool((res, res), scale)
        else:
            raise NotImplementedError(f'Pooler type {pool_type} not implemented')

        # Set up the heads
        fc_features = utils.get_out_features(fc)
        nc, nd = cfg.MODEL.MIDN_HEAD.NUM_CLASSIFIER, cfg.MODEL.MIDN_HEAD.NUM_DETECTOR
        if nc > 0 and nd > 0:
            self.midn = heads.MultipleMidnHead(
                in_features=fc_features,
                out_features=self.num_classes,
                t_cls=cfg.MODEL.MIDN_HEAD.CLASSIFIER_TEMP,
                t_det=cfg.MODEL.MIDN_HEAD.DETECTOR_TEMP,
                k_cls=nc,
                k_det=nd
            )

        nr = cfg.MODEL.REFINEMENT_HEAD.K
        if nr > 0:
            self.refinement = heads.RefinementHeads(
                in_features=fc_features,
                out_features=self.num_classes+1, # +1 for the background class
                k=nr
            )

        if cfg.TEST.AUG.ENABLED:
            self.tta = self._init_tta_fn(cfg)
        else:
            # No TTA: wrap the single image and its proposals in the same
            # nested-list format that _init_tta_fn produces
            self.tta = lambda image, rois: ([image.unsqueeze(0)], [[rois]])

        self.build_loss = LOSS_FUNCTIONS[cfg.MODEL.LOSS_FN]
        self.init_layers()

    @torch.no_grad()
    def init_layers(self):
        params = list(self.midn.classifiers.named_parameters())
        if hasattr(self, 'refinement'):
            params += list(self.refinement.named_parameters())

        if len(self.midn.detectors) > 1:
            utils.orthogonal_init(self.midn.detectors)
        else:
            params += list(self.midn.detectors.named_parameters())

        for k, v in params:
            if 'bias' in k:
                nn.init.zeros_(v)
            else:
                nn.init.normal_(v, mean=0.0, std=0.01)

    def hack(self):
        for p_source, p_dest in zip(torch.load('/home/Deep_Learner/work/cleaned/outputs/oicr_vgg_dilated/model_final.pth')['model'].values(),
                                    self.parameters()):
            p_dest.copy_(p_source)

    def to(self, device):
        self.device = device
        self.pixel_mean = self.pixel_mean.to(device)
        self.pixel_std = self.pixel_std.to(device)
        return super().to(device)

    def normalize(self, x:torch.Tensor) -> torch.Tensor:
        return (x - self.pixel_mean) / self.pixel_std

    def predict_on_example(self, image:torch.Tensor, rois:List[torch.Tensor]) -> List[List[torch.Tensor]]:
        x = self.normalize(image)
        x = self.convs(x)
        x = self.pooler(x, [r.type(x.dtype) for r in rois])
        x = x.flatten(1)
        x = self.fc(x)
        r = self.refinement(x) if hasattr(self, 'refinement') else []

        outputs = []
        for rois_per_image in rois:
            n = rois_per_image.size(0)
            x_i = x[:n]
            r_i = [tmp[:n] for tmp in r]
            x = x[n:]
            r = [tmp[n:] for tmp in r]
            m = self.midn(x_i) if hasattr(self, 'midn') and (self.training or not hasattr(self, 'refinement')) else []
            outputs.append(m + r_i)
        return outputs

    def _init_tta_fn(self, cfg):
        max_size = cfg.TEST.AUG.MAX_SIZE
        size_gens = [T.ResizeShortestEdge(sz, max_size, 'choice') for sz in cfg.TEST.AUG.MIN_SIZES]
        flip = T.RandomFlip(1.0)

        def tta_fn(image, rois):
            image = image.permute(1, 2, 0).to('cpu').numpy()
            dtype = image.dtype
            image = image.astype(np.uint8)

            out_images, out_rois = [], []
            for tfm_gen in size_gens:
                resized_image, tfm = T.apply_transform_gens([tfm_gen], image)
                resized_rois = tfm.transforms[0].apply_box(rois.to('cpu').numpy())

                if cfg.TEST.AUG.FLIP:
                    flipped_image, tfm = T.apply_transform_gens([flip], resized_image)
                    flipped_rois = tfm.transforms[0].apply_box(resized_rois)

                    img_batch = torch.stack([
                        torch.from_numpy(resized_image.astype(dtype)).permute(2,0,1),
                        torch.from_numpy(flipped_image.astype(dtype)).permute(2,0,1)
                    ])
                    roi_batch = [
                        torch.from_numpy(resized_rois),
                        torch.from_numpy(flipped_rois)
                    ]
                else:
                    img_batch = torch.from_numpy(resized_image.astype(dtype)).permute(2,0,1).unsqueeze(0)
                    roi_batch = [torch.from_numpy(resized_rois),]
                out_images.append(img_batch)
                out_rois.append(roi_batch)
            return out_images, out_rois

        return tta_fn

    def forward(self, batch, use_gt=False):
        losses = {}
        batch_predictions = []
        bs = len(batch)
        for element in batch:
            image, rois, gt_classes, gt_boxes = utils.extract_data(element)

            if self.training:
                predictions = self.predict_on_example(image.unsqueeze(0).to(self.device), [rois.to(self.device)])
                image_labels = torch.zeros((1, self.num_classes,), dtype=image.dtype, device=self.device)
                image_labels[0, gt_classes.unique()] = 1.
                for prediction in predictions:
                    loss = self.build_loss(prediction, rois, image_labels, gt_boxes=gt_boxes, gt_classes=gt_classes)
                    for k, v in loss.items():
                        v = v.float()
                        running_total = losses.setdefault(k, torch.zeros_like(v))
                        losses[k] = running_total + (v / bs)
            else:
                aug_images, aug_rois = self.tta(image, rois)
                scores = None
                for batch_images, batch_rois in zip(aug_images, aug_rois):
                    predictions = self.predict_on_example(batch_images.to(self.device), [r.to(self.device) for r in batch_rois])
                    for prediction in predictions:
                        if hasattr(self, 'refinement'):
                            p = sum([pred.softmax(-1)[:,1:] for pred in prediction])
                        else:
                            p = sum(prediction)
                        if scores is None:
                            scores = p
                        else:
                            scores += p
                boxes, scores, classes = utils.filter_predictions(scores, rois, self.test_nms_threshold, self.test_score_threshold)
                instances = Instances((element['height'], element['width']))
                instances.scores = scores[:self.test_max_detections_per_image]
                instances.pred_classes = classes[:self.test_max_detections_per_image]
                instances.pred_boxes = Boxes(boxes[:self.test_max_detections_per_image])
                batch_predictions.append({
                    'instances': instances
                })
        return losses if self.training else batch_predictions
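
# Illustrative sketch (not in the original file): this meta-architecture is
# presumably reached through detectron2's registry rather than constructed
# directly; `config_files/oicr_vgg_dilated.yaml` is one of the configs shipped
# in this repo.
#
#   from detectron2.modeling import build_model
#   from wsod.config import get_cfg
#   cfg = get_cfg()
#   cfg.merge_from_file('config_files/oicr_vgg_dilated.yaml')
#   model = build_model(cfg)  # resolves cfg.MODEL.META_ARCHITECTURE to GeneralisedMIL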
-------------------------------------------------------------------------------- /wsod/models/utils.py: --------------------------------------------------------------------------------
import torch
from torch import nn
#from detectron2.layers import batched_nms
from torchvision.ops.boxes import batched_nms


@torch.no_grad()
def orthogonal_init(layers, mean=0.0, std=0.01):
    """Initialise a set of same-shaped linear layers with mutually orthogonal weights."""
    k = len(layers)
    ou_f = layers[0].out_features
    in_f = layers[0].in_features
    random = torch.randn((ou_f, in_f, k)) * std + mean
    q, r = torch.qr(random, some=True)

    for detector, init in zip(layers, q.permute(2, 0, 1)):
        detector.weight.data.copy_(init)
        nn.init.zeros_(detector.bias)


@torch.no_grad()
def filter_predictions(scores, rois, nms_threshold, score_threshold):
    rois = rois.to(scores.device)
    idxs, cls_ids = (scores > score_threshold).nonzero().T
    cls_scores = scores[idxs, cls_ids]
    boxes = rois[idxs]
    keep = batched_nms(boxes, cls_scores, cls_ids, nms_threshold)
    return boxes[keep], cls_scores[keep], cls_ids[keep]


@torch.no_grad()
def load_weights(convs, fc, pretrained):
    m = torch.load(pretrained)  # assumed to hold a module with matching parameter order
    for model_param, pretrained_param in zip(list(convs.parameters()) + list(fc.parameters()),
                                             m.parameters()):
        # parameters() yields tensors, so copy them directly
        model_param.copy_(pretrained_param)


def get_conv_scale(convs):
    """
    Determines the downscaling performed by a sequence of convolutional and pooling layers
    """
    scale = 1.
    for c in convs:
        stride = getattr(c, 'stride', 1.)  # layers without a stride count as 1
        scale /= stride if isinstance(stride, (int, float)) else stride[0]
    return scale


def get_out_features(fc):
    """
    Determines the size of the output from a sequence of fully connected layers
    """
    i = -1
    while i < 0:  # walks backwards until a layer exposes out_features
        i = getattr(fc[i], 'out_features', i-1)
    return i


def freeze_convs(convs, k):
    """
    Freezes the first `k` conv layers
    """
    i = 0
    while k > 0:
        if isinstance(convs[i], nn.Conv2d):
            k -= 1
            for p in convs[i].parameters():
                p.requires_grad = False
        i += 1


def extract_data(element):
    image = element['image']

    instances = element.get('instances')
    if instances:
        gt_boxes = instances.gt_boxes
        gt_classes = instances.gt_classes
    else:
        gt_boxes = torch.zeros((0, 4), dtype=torch.float)
        gt_classes = torch.zeros((0,), dtype=torch.long)

    rois = element['proposals'].proposal_boxes.tensor

    return image, rois, gt_classes, gt_boxes
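
# Illustrative sketch (not in the original file): get_conv_scale on the VGG16
# trunk used above. VGG16's features contain five 2x max-pools; with the final
# pool stripped by extract_components, four remain, so the scale is 1/16,
# matching the 1/16 spatial scale handed to RoIPool.
#
#   from torchvision import models as M
#   convs = M.vgg16(pretrained=False).features[:-1]
#   assert abs(get_conv_scale(convs) - 1. / 16.) < 1e-9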
-------------------------------------------------------------------------------- /wsod/optim/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradezard131/wsod/ea145476e2e95762985647d992d07f0f725e20df/wsod/optim/__init__.py
-------------------------------------------------------------------------------- /wsod/optim/caffesgd.py: --------------------------------------------------------------------------------
import torch
from torch.optim import SGD
from detectron2.solver.lr_scheduler import WarmupMultiStepLR, _get_warmup_factor_at_iter
from bisect import bisect_right


class CaffeSGD(SGD):
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p = d_p.add(weight_decay, p.data)
                # Caffe folds the learning rate into the update *before* the
                # momentum buffer, unlike torch.optim.SGD which applies it after
                d_p.mul_(group['lr'])
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf

                p.data.add_(-1, d_p)

        return loss


class CaffeLRScheduler(WarmupMultiStepLR):
    def _get_lr_ratio(self) -> float:
        warmup_factor = _get_warmup_factor_at_iter(
            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
        )
        return warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)

    def step(self, epoch=None):
        super().step(epoch)

        # Adjust the momentum buffers, since they store lr-scaled updates
        factor = 1. / self._get_lr_ratio()
        for param in self.optimizer.param_groups:
            p_keys = param['params']
            for p_key in p_keys:
                if 'momentum_buffer' in self.optimizer.state[p_key].keys():
                    self.optimizer.state[p_key]['momentum_buffer'] *= factor
--------------------------------------------------------------------------------
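
# Illustrative sketch (not in the original repo): wiring CaffeSGD and
# CaffeLRScheduler together the way build_optimizer/build_lr_scheduler would.
# `model` and `loader` are hypothetical, and the milestone and warmup values
# are assumptions for the example only.
#
#   from wsod.optim.caffesgd import CaffeSGD, CaffeLRScheduler
#   opt = CaffeSGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
#   sched = CaffeLRScheduler(opt, milestones=[30000, 40000], gamma=0.1,
#                            warmup_iters=200, warmup_factor=1. / 200.)
#   for it, batch in enumerate(loader):
#       losses = model(batch)
#       sum(losses.values()).backward()
#       opt.step(); opt.zero_grad(); sched.step()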