├── retrieval ├── loss │ └── __init__.py ├── models │ ├── __init__.py │ ├── prompts │ │ └── __init__.py │ └── clip │ │ ├── __init__.py │ │ └── bpe_simple_vocab_16e6.txt.gz ├── utils │ ├── __init__.py │ └── factory.py ├── methods │ └── __init__.py ├── .gitignore ├── shell │ └── run.sh ├── util │ └── __init__.py ├── configs │ ├── domainnet_slip.json │ ├── core50_slip.json │ ├── cddb_sip.json │ ├── cddb_slip.json │ ├── lpi │ │ ├── coco_l2p.json │ │ ├── coco_clip.json │ │ ├── coco_lpi.json │ │ └── coco_sprompts.json │ └── coco_slip.json ├── LICENSE └── main.py ├── grounding ├── DATASET ├── tools │ ├── utils │ │ ├── __init__.py │ │ └── colors.py │ └── cityscapes │ │ └── instances2dict_with_polygons.py ├── shell │ ├── s1.py │ ├── s2.py │ ├── s3.py │ ├── base.sh │ ├── lpim.sh │ ├── refcoco+.sh │ ├── l2p.sh │ ├── maple.sh │ ├── lpip.sh │ ├── vis.sh │ ├── depth.sh │ ├── prompt_lora.sh │ ├── sprompt.sh │ └── cmd.sh ├── MODEL ├── maskrcnn_benchmark │ ├── utils │ │ ├── __init__.py │ │ ├── README.md │ │ ├── collect_env.py │ │ ├── miscellaneous.py │ │ ├── amp.py │ │ ├── cv2_util.py │ │ ├── imports.py │ │ ├── logger.py │ │ ├── env.py │ │ ├── registry.py │ │ ├── ema.py │ │ ├── pretrain_model_loading.py │ │ ├── shallow_contrastive_loss_helper.py │ │ └── model_zoo.py │ ├── modeling │ │ ├── __init__.py │ │ ├── bert │ │ │ └── __init__.py │ │ ├── prompt │ │ │ └── __init__.py │ │ ├── roi_heads │ │ │ ├── box_head │ │ │ │ ├── __init__.py │ │ │ │ ├── roi_box_predictors.py │ │ │ │ └── box_head.py │ │ │ ├── mask_head │ │ │ │ ├── __init__.py │ │ │ │ └── hourglass.py │ │ │ └── keypoint_head │ │ │ │ ├── roi_keypoint_predictors.py │ │ │ │ └── keypoint_head.py │ │ ├── language_backbone │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── __init__.py │ │ │ ├── test_clip_tokenizer.py │ │ │ ├── build.py │ │ │ └── backbone.py │ │ ├── registry.py │ │ ├── detector │ │ │ └── __init__.py │ │ ├── rpn │ │ │ ├── __init__.py │ │ │ └── transformer.py │ │ ├── backbone │ │ │ ├── mixer.py │ │ │ └── ops.py │ │ ├── balanced_positive_negative_sampler.py │ │ └── utils.py │ ├── structures │ │ ├── __init__.py │ │ └── image_list.py │ ├── data │ │ ├── datasets │ │ │ ├── evaluation │ │ │ │ ├── od_eval.py │ │ │ │ ├── flickr │ │ │ │ │ └── __init__.py │ │ │ │ ├── lvis │ │ │ │ │ └── _change_lvis_annotation.py │ │ │ │ ├── vg │ │ │ │ │ └── __init__.py │ │ │ │ ├── voc │ │ │ │ │ └── __init__.py │ │ │ │ ├── coco │ │ │ │ │ └── __init__.py │ │ │ │ ├── od_to_grounding │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── flickr.py │ │ │ ├── object365.py │ │ │ ├── phrasecut.py │ │ │ ├── concat_dataset.py │ │ │ ├── duplicate_dataset.py │ │ │ ├── __init__.py │ │ │ ├── list_dataset.py │ │ │ ├── background.py │ │ │ └── imagenet.py │ │ ├── __init__.py │ │ ├── transforms │ │ │ ├── __init__.py │ │ │ └── build.py │ │ └── samplers │ │ │ ├── __init__.py │ │ │ ├── iteration_based_batch_sampler.py │ │ │ └── distributed.py │ ├── __init__.py │ ├── engine │ │ └── __init__.py │ ├── config │ │ └── __init__.py │ ├── solver │ │ └── __init__.py │ ├── layers │ │ ├── nms.py │ │ ├── smooth_l1_loss.py │ │ ├── evonorm.py │ │ ├── __init__.py │ │ ├── se.py │ │ ├── roi_pool.py │ │ ├── iou_loss.py │ │ └── roi_align.py │ └── csrc │ │ ├── ml_nms.h │ │ ├── cpu │ │ ├── vision.h │ │ └── nms_cpu.cpp │ │ ├── SigmoidFocalLoss.h │ │ ├── nms.h │ │ ├── vision.cpp │ │ ├── ROIPool.h │ │ ├── ROIAlign.h │ │ └── deform_pool.h ├── bert-base-uncased ├── docs │ ├── lead.png │ ├── word_cloud_od.png │ └── benchmark_example_od.png ├── MID │ ├── task_visual.png │ └── task_visual_2.png ├── configs │ ├── 
pretrain │ │ ├── _coco.yaml │ │ ├── glip_Swin_T_O365.yaml │ │ ├── glip_A_Swin_T_O365.yaml │ │ └── glip_Swin_T_O365_GoldG.yaml │ ├── flickr │ │ ├── test.yaml │ │ ├── val.yaml │ │ └── flickr.yaml │ ├── refcoco │ │ ├── val.yaml │ │ ├── script.txt │ │ ├── finetune.yaml │ │ ├── refcoco.yaml │ │ └── org │ │ │ ├── finetune_A.yaml │ │ │ ├── finetune_A_tt.yaml │ │ │ ├── finetune_A_base.yaml │ │ │ ├── finetune_A_decompose_layer.yaml │ │ │ ├── finetune_A_decompose_task.yaml │ │ │ ├── finetune_A_decompose_interact.yaml │ │ │ ├── finetune_A_decompose_layer_task.yaml │ │ │ ├── finetune_A_decompose_task_interact.yaml │ │ │ ├── finetune_A_decompose_layer_interact.yaml │ │ │ ├── finetune_A_decompose_task_layer_interact.yaml │ │ │ └── finetune_A_test.yaml │ ├── lvis │ │ ├── val.yaml │ │ └── minival.yaml │ └── odinw_35 │ │ └── _all.json ├── test │ ├── task_div.py │ ├── tt.py │ ├── vis_2.py │ ├── task_sim_matrix.py │ ├── colors.py │ ├── task.vis.py │ └── task_visual.py ├── webui │ └── flagged │ │ ├── Image │ │ └── d60a16e47c3a575b719d │ │ │ └── download.jpg │ │ ├── Bounding box │ │ └── 46774ff9412648535a6f │ │ │ └── image.png │ │ └── log.csv ├── CODE_OF_CONDUCT.md ├── test.py ├── requirements.txt ├── .gitignore ├── cmd │ └── cmd.txt ├── LICENSE ├── SUPPORT.md ├── odinw │ └── download.py ├── matrix │ └── matrix.py ├── setup.py └── SECURITY.md ├── assets └── webui.png ├── script ├── retrieval │ └── lpi.sh └── grounding │ └── lpi_p.sh └── .gitignore /retrieval/loss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/DATASET: -------------------------------------------------------------------------------- 1 | /root/autodl-tmp -------------------------------------------------------------------------------- /grounding/tools/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/methods/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/shell/s1.py: -------------------------------------------------------------------------------- 1 | print('s1 test') -------------------------------------------------------------------------------- /grounding/shell/s2.py: -------------------------------------------------------------------------------- 1 | print('s2 test') -------------------------------------------------------------------------------- /grounding/shell/s3.py: -------------------------------------------------------------------------------- 1 | print('s3 test') -------------------------------------------------------------------------------- /retrieval/models/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/MODEL: 
-------------------------------------------------------------------------------- 1 | /home1/yanweicai/MODEL/glip -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /grounding/bert-base-uncased: -------------------------------------------------------------------------------- 1 | /home1/yanweicai/MODEL/bert-base-uncased/ -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/od_eval.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea/ 3 | logs/ 4 | logss/ 5 | res/ -------------------------------------------------------------------------------- /assets/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/assets/webui.png -------------------------------------------------------------------------------- /grounding/docs/lead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/lead.png -------------------------------------------------------------------------------- /grounding/MID/task_visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/MID/task_visual.png -------------------------------------------------------------------------------- /grounding/MID/task_visual_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/MID/task_visual_2.png -------------------------------------------------------------------------------- /grounding/configs/pretrain/_coco.yaml: -------------------------------------------------------------------------------- 1 | DATASETS: 2 | TRAIN: ("coco_2017_train",) 3 | TEST: ("coco_2017_val", ) -------------------------------------------------------------------------------- /grounding/docs/word_cloud_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/word_cloud_od.png -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/flickr/__init__.py: -------------------------------------------------------------------------------- 1 | from .flickr_eval import FlickrEvaluator 2 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /grounding/docs/benchmark_example_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/benchmark_example_od.png -------------------------------------------------------------------------------- /script/retrieval/lpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd /home1/yanweicai/workspace/prompt/lpi/retrieval 3 | python main.py --config ./configs/lpi/coco_lpi.json -------------------------------------------------------------------------------- /retrieval/models/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/retrieval/models/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from .build import make_data_loader 3 | -------------------------------------------------------------------------------- /grounding/test/task_div.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | mat = np.loadtxt('../MID/task_sim_matrix.txt') 4 | threshold = 0.4 5 | mat = (mat>threshold).astype(int) 6 | print(mat) 7 | -------------------------------------------------------------------------------- /grounding/webui/flagged/Image/d60a16e47c3a575b719d/download.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/webui/flagged/Image/d60a16e47c3a575b719d/download.jpg -------------------------------------------------------------------------------- /grounding/webui/flagged/Bounding box/46774ff9412648535a6f/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/webui/flagged/Bounding box/46774ff9412648535a6f/image.png -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .defaults import _C as cfg 3 | from .paths_catalog import try_to_find -------------------------------------------------------------------------------- /retrieval/shell/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/workspace0401/S-Prompts-lpi 3 | python main.py > ./logss/hip.txt 4 | cd /root/workspace0401/S-Prompts-sprompt 5 | python main.py > ./logss/sprompt.txt -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /grounding/webui/flagged/log.csv: -------------------------------------------------------------------------------- 1 | Caption,Image,Bounding box,flag,username,timestamp 2 | ,flagged/Image/d60a16e47c3a575b719d/download.jpg,flagged/Bounding box/46774ff9412648535a6f/image.png,,,2024-04-16 15:26:28.521612 3 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contains utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system.
6 | -------------------------------------------------------------------------------- /retrieval/utils/factory.py: -------------------------------------------------------------------------------- 1 | from methods.sprompt import SPrompts 2 | 3 | def get_model(model_name, args): 4 | name = model_name.lower() 5 | options = {'sprompts': SPrompts, 6 | } 7 | return options[name](args) 8 | 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .build import make_optimizer 3 | from .build import make_lr_scheduler 4 | from .lr_scheduler import WarmupMultiStepLR 5 | -------------------------------------------------------------------------------- /script/grounding/lpi_p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /home1/yanweicai/workspace/prompt/lpi/grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class FlickrDataset(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/object365.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.coco_dt import CocoDetectionTSV 5 | 6 | 7 | class Object365DetectionTSV(CocoDetectionTSV): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/phrasecut.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class PhrasecutDetection(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/test/tt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | def cal_dif( v1, v2): 4 | return np.sum((v1 - v2) ** 2) 5 | 6 | v1 = [1,2,3] 7 | v2 = [4,5,6] 8 | v1 = torch.tensor(v1) 9 | v2 = torch.tensor(v2) 10 | # print(cal_dif(v1, v2)) 11 | print(torch.sum((v1-v2)**2)) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone as build_language_backbone 2 | from .build import build_tokenizer 3 | 4 | from .hfpt_tokenizer import HFPTTokenizer 5 | from .simple_tokenizer import SimpleTokenizer 6 | from .clip_model import CLIPTransformer 7 | 
-------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from maskrcnn_benchmark.utils.registry import Registry 4 | 5 | BACKBONES = Registry() 6 | 7 | LANGUAGE_BACKBONES = Registry() 8 | 9 | ROI_BOX_FEATURE_EXTRACTORS = Registry() 10 | RPN_HEADS = Registry() 11 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .transforms import Compose 3 | from .transforms import Resize 4 | from .transforms import RandomHorizontalFlip 5 | from .transforms import ToTensor 6 | from .transforms import Normalize 7 | 8 | from .build import build_transforms 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .distributed import DistributedSampler 3 | from .grouped_batch_sampler import GroupedBatchSampler 4 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 5 | 6 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 7 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/test_clip_tokenizer.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.modeling.language_backbone import build_tokenizer 2 | 3 | if __name__ == '__main__': 4 | 5 | tokenizer2 = build_tokenizer("clip") 6 | tokenized2 = tokenizer2( 7 | ["Detectest : fishid. jellyfishioasod. penguinasd. puffin.asd shark. starfish. round stingray"]) 8 | print(tokenized2) 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from maskrcnn_benchmark import _C 3 | 4 | try: 5 | import torchvision 6 | from torchvision.ops import nms 7 | except: 8 | nms = _C.nms 9 | 10 | ml_nms = _C.ml_nms 11 | soft_nms = _C.soft_nms 12 | 13 | # nms.__doc__ = """ 14 | # This function performs Non-maximum suppresion""" 15 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/lvis/_change_lvis_annotation.py: -------------------------------------------------------------------------------- 1 | path = "DATASET/coco/annotations/lvis_v1_minival.json" 2 | import json 3 | with open(path) as f: 4 | all = json.load(f) 5 | 6 | for i in all["images"]: 7 | i["file_name"] = "/".join(i["coco_url"].split("/")[-2:]) 8 | 9 | with open("DATASET/coco/annotations/lvis_v1_minival_inserted_image_name.json", "w") as f: 10 | json.dump(all, f) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import PIL 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def get_pil_version(): 8 | return "\n Pillow ({})".format(PIL.__version__) 9 | 10 | 11 | def collect_env_info(): 12 | env_str = get_pretty_env_info() 13 | env_str += get_pil_version() 14 | return env_str 15 | -------------------------------------------------------------------------------- /grounding/shell/base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_base.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_base.txt 6 | 7 | python tools/finetune.py --config-file configs/refcocog/finetune_A_base.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_base.txt -------------------------------------------------------------------------------- /grounding/shell/lpim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_layer_task.txt 6 | 7 | python tools/finetune.py --config-file configs/refcocog/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_layer_task.txt -------------------------------------------------------------------------------- /grounding/shell/refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco+_interact_layer_task.txt 6 | 7 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco+_layer_task.txt -------------------------------------------------------------------------------- /grounding/CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .generalized_rcnn import GeneralizedRCNN 2 | from .generalized_vl_rcnn import GeneralizedVLRCNN 3 | 4 | _DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN, 5 | "GeneralizedVLRCNN": GeneralizedVLRCNN 6 | } 7 | 8 | 9 | def build_detection_model(cfg): 10 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] 11 | return meta_arch(cfg) 12 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import errno 3 | import os 4 | from .comm import is_main_process 5 | 6 | def mkdir(path): 7 | try: 8 | os.makedirs(path) 9 | except OSError as e: 10 | if e.errno != errno.EEXIST: 11 | raise 12 | 13 | 14 | def save_config(cfg, path): 15 | if is_main_process(): 16 | with open(path, 'w') as f: 17 | f.write(cfg.dump()) 18 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/amp.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | @contextmanager 4 | def nullcontext(enter_result=None, **kwargs): 5 | yield enter_result 6 | 7 | try: 8 | from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd 9 | except: 10 | print('[Warning] Library for automatic mixed precision is not found, AMP is disabled!!') 11 | GradScaler = nullcontext 12 | autocast = nullcontext 13 | custom_fwd = nullcontext 14 | custom_bwd = nullcontext -------------------------------------------------------------------------------- /retrieval/util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /grounding/test.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | def run1(event): 4 | print("Starting thread 1") 5 | for i in range(10): 6 | print('a' + str(i)) 7 | event.wait() 8 | 9 | def run2(event): 10 | print("Starting thread 2") 11 | for i in range(10): 12 | print('b' + str(i)) 13 | event.wait() 14 | 15 | event = threading.Event() 16 | t1 = threading.Thread(target=run1, args=(event,)) 17 | t2 = threading.Thread(target=run2, args=(event,)) 18 | 19 | t1.start() 20 | t2.start() 21 | 22 | print("Run finished") 23 | t1.join() 24 | t2.join() 25 | 26 | print("End") -------------------------------------------------------------------------------- /grounding/configs/flickr/test.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_test", ) 12 | TEST: ("flickr30k_test", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | # TODO maybe push this to nn? 6 | def smooth_l1_loss(input, target, beta=1.
/ 9, size_average=True): 7 | """ 8 | very similar to the smooth_l1_loss from pytorch, but with 9 | the extra beta parameter 10 | """ 11 | n = torch.abs(input - target) 12 | cond = n < beta 13 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 14 | if size_average: 15 | return loss.mean() 16 | return loss.sum() 17 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/vg/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .vg_eval import do_vg_evaluation 4 | 5 | 6 | def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes=False, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | logger.info("performing vg evaluation, ignored iou_types.") 9 | return do_vg_evaluation( 10 | dataset=dataset, 11 | predictions=predictions, 12 | output_folder=output_folder, 13 | box_only=box_only, 14 | eval_attributes=eval_attributes, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .voc_eval import do_voc_evaluation 4 | 5 | 6 | def voc_evaluation(dataset, predictions, output_folder, box_only, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | if box_only: 9 | logger.warning("voc evaluation doesn't support box_only, ignored.") 10 | logger.info("performing voc evaluation, ignored iou_types.") 11 | return do_voc_evaluation( 12 | dataset=dataset, 13 | predictions=predictions, 14 | output_folder=output_folder, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /grounding/shell/l2p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | #python tools/finetune.py --config-file configs/maple/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco.txt 6 | 7 | python tools/finetune.py --config-file configs/l2p/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/l2p_refcoco+.txt 8 | 9 | python tools/finetune.py --config-file configs/l2p/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/l2p_refcocog.txt 10 | -------------------------------------------------------------------------------- /grounding/shell/maple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | #python tools/finetune.py --config-file configs/maple/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco.txt 6 | 7 | python tools/finetune.py --config-file configs/maple/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco+.txt 8 | 9 | python tools/finetune.py --config-file configs/maple/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcocog.txt 10 | -------------------------------------------------------------------------------- 
/grounding/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_eval import do_coco_evaluation 2 | 3 | 4 | def coco_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | expected_results_sigma_tol=4, 12 | ): 13 | return do_coco_evaluation( 14 | dataset=dataset, 15 | predictions=predictions, 16 | box_only=box_only, 17 | output_folder=output_folder, 18 | iou_types=iou_types, 19 | expected_results=expected_results, 20 | expected_results_sigma_tol=expected_results_sigma_tol, 21 | ) 22 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("refexp_val", ) 12 | TEST: ("refexp_val", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False 23 | SOLVER: 24 | WARMUP_ITERS: 0 25 | MAX_EPOCH: 12 26 | CHECKPOINT_PERIOD: 100 27 | TEST: 28 | IMS_PER_BATCH: 1 -------------------------------------------------------------------------------- /grounding/configs/flickr/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_val", ) 12 | TEST: ("flickr30k_val", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False 23 | SOLVER: 24 | WARMUP_ITERS: 0 25 | MAX_EPOCH: 12 26 | CHECKPOINT_PERIOD: 100 27 | TEST: 28 | IMS_PER_BATCH: 8 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/build.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer 2 | 3 | 4 | def build_tokenizer(tokenizer_name): 5 | tokenizer = None 6 | if tokenizer_name == 'clip': 7 | tokenizer = SimpleTokenizer() 8 | elif 'hf_' in tokenizer_name: 9 | from .hfpt_tokenizer import HFPTTokenizer 10 | 11 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 12 | elif 'hfc_' in tokenizer_name: 13 | from .hfpt_tokenizer import HFPTTokenizer 14 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 15 | else: 16 | raise ValueError('Unknown tokenizer') 17 | 18 | return tokenizer 19 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/od_to_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .od_eval import do_od_evaluation 2 | 3 | 4 | def od_to_grounding_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | 
expected_results_sigma_tol=4, ): 12 | return do_od_evaluation( 13 | dataset=dataset, 14 | predictions=predictions, 15 | box_only=box_only, 16 | output_folder=output_folder, 17 | iou_types=iou_types, 18 | expected_results=expected_results, 19 | expected_results_sigma_tol=expected_results_sigma_tol, 20 | ) 21 | -------------------------------------------------------------------------------- /grounding/requirements.txt: -------------------------------------------------------------------------------- 1 | cityscapesscripts==2.2.2 2 | einops==0.7.0 3 | ftfy==6.2.0 4 | gradio==4.26.0 5 | h5py==3.11.0 6 | inflect==7.2.0 7 | matplotlib==3.5.2 8 | nltk==3.8.1 9 | numpy==1.22.4 10 | openai==1.19.0 11 | opencv_python==4.9.0.80 12 | pandas==2.0.3 13 | Pillow==9.1.1 14 | Pillow==10.3.0 15 | prettytable==3.10.0 16 | pycocotools==2.0.7 17 | PyYAML==6.0.1 18 | PyYAML==6.0.1 19 | qd==0.8.9 20 | regex==2023.12.25 21 | Requests==2.31.0 22 | scikit_learn==1.3.2 23 | scipy==1.13.0 24 | setuptools==52.0.0.post20210125 25 | tensorboardX==2.6.2.2 26 | tensorboardX==2.6.2.2 27 | tensorflow==2.16.1 28 | timm==0.9.16 29 | torch==1.11.0+cu113 30 | torchvision==0.12.0+cu113 31 | tqdm==4.61.2 32 | transformers==4.38.2 33 | yacs==0.1.8 34 | -------------------------------------------------------------------------------- /grounding/shell/lpip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | torchrun --nnodes=1 --nproc_per_node=4 tools/finetune.py --config-file configs/refcoco/val/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/lpip_v1.txt 6 | 7 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_layer_task.txt 8 | 9 | python tools/finetune.py --config-file configs/refcocog/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_layer_task.txt -------------------------------------------------------------------------------- /grounding/shell/vis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_vis_refcoco.txt 6 | 7 | mv visualize visualize_sprompts 8 | mkdir visualize 9 | cd visualize 10 | mkdir 0 11 | mkdir 1 12 | mkdir 2 13 | mkdir 3 14 | mkdir 4 15 | mkdir 5 16 | mkdir 6 17 | mkdir 7 18 | mkdir 8 19 | mkdir 9 20 | mkdir 10 21 | mkdir 11 22 | cd ..
23 | 24 | python tools/finetune.py --config-file configs/refcoco/val/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/lpi_vis_refcoco.txt 25 | -------------------------------------------------------------------------------- /grounding/test/vis_2.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | api_key = "sk-MazpnWiEWQhrgtP8526a79F8D7254a5894296d2d81Ea6c7a" 3 | api_base = "https://oneapi.xty.app/v1" 4 | client = OpenAI( 5 | api_key=api_key, 6 | base_url=api_base 7 | ) 8 | model = "text-embedding-3-large" 9 | 10 | 11 | def get_embedding(text, model): 12 | text = text.replace("\n", " ") 13 | embeddings = client.embeddings 14 | creation = embeddings.create(input =[text], model=model) 15 | 16 | return creation.data[0].embedding 17 | 18 | 19 | task_names =['appliance', 'sports', 'outdoor','electronic', 'accessory', 'indoor','kitchen', 'furniture', 'vehicle','food', 'animal', 'person'] 20 | 21 | task_senmantic_embedding =[get_embedding(task_name, model) for task_name in task_names] -------------------------------------------------------------------------------- /grounding/shell/depth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d8.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d8.txt 6 | 7 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d10.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d10.txt 8 | 9 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d12.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d12.txt -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/cv2_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for cv2 utility functions and maintaining version compatibility 3 | between 3.x and 4.x 4 | """ 5 | import cv2 6 | 7 | 8 | def findContours(*args, **kwargs): 9 | """ 10 | Wraps cv2.findContours to maintain compatiblity between versions 11 | 3 and 4 12 | 13 | Returns: 14 | contours, hierarchy 15 | """ 16 | if cv2.__version__.startswith('4'): 17 | contours, hierarchy = cv2.findContours(*args, **kwargs) 18 | elif cv2.__version__.startswith('3'): 19 | _, contours, hierarchy = cv2.findContours(*args, **kwargs) 20 | else: 21 | raise AssertionError( 22 | 'cv2 must be either version 3 or 4 to call this method') 23 | 24 | return contours, hierarchy 25 | -------------------------------------------------------------------------------- /grounding/shell/prompt_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r1.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r1.txt 5 | 6 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r2.yaml --skip-test 
--custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r2.txt 7 | 8 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r8.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r8.txt 9 | 10 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r16.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r16.txt -------------------------------------------------------------------------------- /grounding/shell/sprompt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | #python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r1.txt 5 | 6 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_refcoco+.txt 7 | 8 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_refcocog.txt 9 | 10 | #python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r16.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r16.txt -------------------------------------------------------------------------------- /grounding/configs/lvis/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 | lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_val",) 19 | TEST: ("lvis_evaluation_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /grounding/configs/lvis/minival.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 | lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_mini_val",) 19 | TEST: ("lvis_evaluation_mini_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /retrieval/configs/domainnet_slip.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "domainnet", 4 | "data_path": "/home/wangyabin/workspace/datasets/domainnet", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 6, 16 | "device": ["2","3"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 30, 20 | "init_lr" : 0.01, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 30, 24 | "lrate" : 0.01, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/test/task_sim_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # task_vector = np.loadtxt('../MID/tasks_array.txt') 4 | # print(task_vector) 5 | 6 | # 12 vectors of dimension 1024 7 | vectors = np.loadtxt('../MID/tasks_array.txt') # fill in the actual vector values 8 | 9 | # initialize a 12x12 matrix to store the similarities 10 | cosine_similarity_matrix = np.zeros((12, 12)) 11 | 12 | # compute cosine similarity 13 | for i in range(12): 14 | for j in range(12): 15 | # embedding1 = vectors[i] 16 | # embedding2 = vectors[j] 17 | # cosine_similarity_matrix[i, j] = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) 18 | cosine_similarity_matrix[i, j] = np.dot(vectors[i], vectors[j]) / (np.linalg.norm(vectors[i]) * np.linalg.norm(vectors[j])) 19 | 20 | # print the cosine similarity matrix 21 | print(cosine_similarity_matrix) 22 | 23 | np.savetxt('../MID/task_sim_matrix.txt', cosine_similarity_matrix) -------------------------------------------------------------------------------- /grounding/test/colors.py: -------------------------------------------------------------------------------- 1 | import colorsys 2 | import random 3 | 4 | 5 | def get_n_hls_colors(num): 6 | hls_colors = [] 7 | i = 0 8 | step = 360.0 / num 9 | while i < 360: 10 | h = i 11 | s = 90 + random.random() * 10 12 | l = 50 + random.random() * 10 13 | _hlsc = [h / 360.0, l / 100.0, s / 100.0] 14 | hls_colors.append(_hlsc) 15 | i += step 16 | 17 | return hls_colors 18 | 19 | 20 | def ncolors(num): 21 | rgb_colors = [] 22 | if num < 1: 23 | return rgb_colors 24 | hls_colors = get_n_hls_colors(num) 25 | for hlsc in hls_colors: 26 | _r, _g, _b = colorsys.hls_to_rgb(hlsc[0], hlsc[1], hlsc[2]) 27 | r, g, b = [int(x * 255.0) for x in (_r, _g, _b)] 28 | rgb_colors.append([r, g, b]) 29 | 30 | 31 | return rgb_colors 32 | 33 | 34 | print(ncolors(10)) -------------------------------------------------------------------------------- /retrieval/configs/core50_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "core50", 4 | "data_path": "/home/wangyabin/workspace/datasets/core50/data/core50_128x128", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 50, 10 | "increment": 50, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 8, 16 | "device": ["0","1"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 20, 20 | "init_lr" : 0.01, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 20, 24 |
"lrate" : 0.01, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/tools/utils/colors.py: -------------------------------------------------------------------------------- 1 | import colorsys 2 | import random 3 | 4 | 5 | def get_n_hls_colors(num): 6 | hls_colors = [] 7 | i = 0 8 | step = 360.0 / num 9 | while i < 360: 10 | h = i 11 | s = 90 + random.random() * 10 12 | l = 50 + random.random() * 10 13 | _hlsc = [h / 360.0, l / 100.0, s / 100.0] 14 | hls_colors.append(_hlsc) 15 | i += step 16 | 17 | return hls_colors 18 | 19 | 20 | def ncolors(num): 21 | rgb_colors = [] 22 | if num < 1: 23 | return rgb_colors 24 | hls_colors = get_n_hls_colors(num) 25 | for hlsc in hls_colors: 26 | _r, _g, _b = colorsys.hls_to_rgb(hlsc[0], hlsc[1], hlsc[2]) 27 | r, g, b = [int(x * 255.0) for x in (_r, _g, _b)] 28 | rgb_colors.append([r, g, b]) 29 | 30 | 31 | return rgb_colors 32 | 33 | 34 | # print(ncolors(10)) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import bisect 3 | 4 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 5 | 6 | 7 | class ConcatDataset(_ConcatDataset): 8 | """ 9 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra 10 | method for querying the sizes of the image 11 | """ 12 | 13 | def get_idxs(self, idx): 14 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 15 | if dataset_idx == 0: 16 | sample_idx = idx 17 | else: 18 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 19 | return dataset_idx, sample_idx 20 | 21 | def get_img_info(self, idx): 22 | dataset_idx, sample_idx = self.get_idxs(idx) 23 | return self.datasets[dataset_idx].get_img_info(sample_idx) 24 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ml_nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor ml_nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const at::Tensor& labels, 13 | const float threshold) { 14 | 15 | if (dets.device().is_cuda()) { 16 | #ifdef WITH_CUDA 17 | // TODO raise error if not compiled with CUDA 18 | if (dets.numel() == 0) 19 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 20 | auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1); 21 | return ml_nms_cuda(b, threshold); 22 | #else 23 | AT_ERROR("Not compiled with GPU support"); 24 | #endif 25 | } 26 | AT_ERROR("CPU version not implemented"); 27 | } 28 | -------------------------------------------------------------------------------- /retrieval/configs/cddb_sip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "cddb", 4 | "task_name": ["gaugan", "biggan", "cyclegan", "imle", "deepfake", "crn", "wild"], 5 | "data_path": "/home/wangyabin/workspace/datasets/DeepFake_Data/CL_data/", 6 | "multiclass": [0, 0, 1, 0, 0, 0, 0], 7 | "class_order": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], 8 | "memory_size": 0, 9 | "memory_per_class": 0, 10 | "fixed_memory": true, 11 | "shuffle": false, 12 | "init_cls": 2, 13 | "increment": 2, 14 | "model_name": "sprompts", 15 | "net_type": "slip", 16 | "embd_dim" : 768, 17 | "prompt_length" : 10, 18 | "total_sessions" : 7, 19 | "device": ["0","1"], 20 | "seed": [1993], 21 | "EPSILON" : 1e-8, 22 | "epochs" : 10, 23 | "lrate" : 0.01, 24 | "milestones" : [20, 30], 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # from .rpn import build_rpn 3 | from .rpn import RPNModule 4 | from .retina import RetinaNetModule 5 | from .fcos import FCOSModule 6 | from .atss import ATSSModule 7 | from .dyhead import DyHeadModule 8 | from .vldyhead import VLDyHeadModule 9 | 10 | _RPN_META_ARCHITECTURES = {"RPN": RPNModule, 11 | "RETINA": RetinaNetModule, 12 | "FCOS": FCOSModule, 13 | "ATSS": ATSSModule, 14 | "DYHEAD": DyHeadModule, 15 | "VLDYHEAD": VLDyHeadModule 16 | } 17 | 18 | 19 | def build_rpn(cfg): 20 | """ 21 | This gives the gist of it. 
Not super important because it doesn't change as much 22 | """ 23 | rpn_arch = _RPN_META_ARCHITECTURES[cfg.MODEL.RPN_ARCHITECTURE] 24 | return rpn_arch(cfg) 25 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/script.txt: -------------------------------------------------------------------------------- 1 | finetune: 2 | 3 | python -m torch.distributed.launch --nproc_per_node=4 tools/finetune.py \ 4 | --config-file configs/refcoco/finetune_A.yaml --skip-test \ 5 | --custom_shot_and_epoch_and_general_copy 0_1_1 \ 6 | --evaluate_only_best_on_test --push_both_val_and_test \ 7 | MODEL.WEIGHT MODEL/glip_a_tiny_o365.pth \ 8 | SOLVER.USE_AMP True TEST.DURING_TRAINING True TEST.IMS_PER_BATCH 4 SOLVER.IMS_PER_BATCH 4 SOLVER.WEIGHT_DECAY 0.05 TEST.EVAL_TASK grounding MODEL.BACKBONE.FREEZE_CONV_BODY_AT 2 MODEL.DYHEAD.USE_CHECKPOINT True SOLVER.FIND_UNUSED_PARAMETERS False SOLVER.TEST_WITH_INFERENCE True SOLVER.USE_AUTOSTEP True DATASETS.USE_OVERRIDE_CATEGORY True SOLVER.SEED 10 DATASETS.SHUFFLE_SEED 3 DATASETS.USE_CAPTION_PROMPT True DATASETS.DISABLE_SHUFFLE True \ 9 | SOLVER.STEP_PATIENCE 2 SOLVER.CHECKPOINT_PER_EPOCH 1.0 SOLVER.AUTO_TERMINATE_PATIENCE 4 SOLVER.MODEL_EMA 0.0 SOLVER.TUNING_HIGHLEVEL_OVERRIDE full 10 | 11 | test: 12 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | # if torch._six.PY37: 5 | if False: 6 | import importlib 7 | import importlib.util 8 | import sys 9 | 10 | 11 | # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa 12 | def import_file(module_name, file_path, make_importable=False): 13 | spec = importlib.util.spec_from_file_location(module_name, file_path) 14 | module = importlib.util.module_from_spec(spec) 15 | spec.loader.exec_module(module) 16 | if make_importable: 17 | sys.modules[module_name] = module 18 | return module 19 | else: 20 | import imp 21 | 22 | def import_file(module_name, file_path, make_importable=None): 23 | module = imp.load_source(module_name, file_path) 24 | return module 25 | -------------------------------------------------------------------------------- /grounding/test/task.vis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.manifold import TSNE 4 | 5 | vectors = np.loadtxt('../MID/tasks_array.txt') 6 | 7 | tsne = TSNE(n_components=2, random_state=0, perplexity=1) 8 | words = ['appliance', 'sports', 'outdoor', 'electronic', 'accessory', 'indoor', 'kitchen', 'furniture', 'vehicle', 'food', 'animal', 'person'] 9 | 10 | Y = tsne.fit_transform(vectors) 11 | 12 | # colors = cm.rainbow(np.linspace(0, 1, Y.shape[0])) 13 | colors = plt.get_cmap('tab20')(range(12)) 14 | for dataset, color, label in zip(Y, colors, words): 15 | plt.scatter(dataset[0], dataset[1], color=color, label=label, s=120) 16 | 17 | plt.xlabel("X") 18 | plt.ylabel("Y") 19 | plt.legend(ncol=4, loc=(0,2/3)) 20 | plt.savefig('../MID/task_visual_4.svg') 21 | # for dataset, label in zip(Y, words): 22 | # plt.annotate(label, (dataset[0], dataset[1]), textcoords='offset points',xytext=(0,10), ha='center') 23 | plt.show() 24 | 25 | # 
plt.savefig('../MID/task_visual.png') -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | 18 | 19 | std::pair soft_nms_cpu(const at::Tensor& dets, 20 | const at::Tensor& scores, 21 | const float threshold, 22 | const float sigma); -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/duplicate_dataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import TypeVar, Optional, Iterator 3 | 4 | import torch 5 | from torch.utils.data import Sampler, Dataset 6 | import torch.distributed as dist 7 | import random 8 | import numpy as np 9 | 10 | 11 | def create_duplicate_dataset(DatasetBaseClass): 12 | class DupDataset(DatasetBaseClass): 13 | 14 | def __init__(self, copy, **kwargs): 15 | super(DupDataset, self).__init__(**kwargs) 16 | 17 | self.copy = copy 18 | self.length = super(DupDataset, self).__len__() 19 | 20 | def __len__(self): 21 | return self.copy * self.length 22 | 23 | def __getitem__(self, index): 24 | true_index = index % self.length 25 | return super(DupDataset, self).__getitem__(true_index) 26 | 27 | def get_img_info(self, index): 28 | true_index = index % self.length 29 | return super(DupDataset, self).get_img_info(true_index) 30 | 31 | return DupDataset 32 | -------------------------------------------------------------------------------- /grounding/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | build/ 4 | DATASET/ 5 | OUTPUT/ 6 | MODEL/ 7 | best_model/ 8 | all_key/ 9 | OUTPUT_org 10 | 11 | # compilation and distribution 12 | __pycache__ 13 | _ext 14 | *.so 15 | maskrcnn_benchmark.egg-info/ 16 | dist/ 17 | 18 | # pytorch/python/numpy formats 19 | *.pth 20 | *.pkl 21 | *.npy 22 | 23 | # ipython/jupyter notebooks 24 | *.ipynb 25 | **/.ipynb_checkpoints/ 26 | 27 | # Editor temporaries 28 | *.swn 29 | *.swo 30 | *.swp 31 | *~ 32 | 33 | # Pycharm editor settings 34 | .idea 35 | 36 | # vscode editor settings 37 | .vscode 38 | 39 | # MacOS 40 | .DS_Store 41 | 42 | # Custom 43 | *.custom.py 44 | 45 | # logs 46 | logs/ 47 | log_new/ 48 | 49 | # res 50 | FINAL_RES/ 51 | FINAL_RES_v2/ 52 | eval/ 53 | 54 | # visualize 55 | visualize_base/ 56 | visualize_glip/ 57 | visualize_lpi/ 58 | visualize_lpi_v2/ 59 | visualize_maple/ 60 | visualize_sprompts/ 61 | 62 | # task_div 63 | TASK_DIV/ 64 | TASK_DIV_REFCOCO/ 65 | 66 | # prompt_save 67 | prompt_save/ 68 | 69 | # embedding save 70 | embedding_save/ 71 | 72 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
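# Usage sketch (illustrative comment, not part of the original file; `output_dir` and `rank` are placeholders):
#   logger = setup_logger("maskrcnn_benchmark", save_dir=output_dir, distributed_rank=rank)
#   logger.info("handlers are only attached on rank 0, so non-master ranks stay silent")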
2 | import logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | def setup_logger(name, save_dir, distributed_rank): 8 | logger = logging.getLogger(name) 9 | logger.setLevel(logging.DEBUG) 10 | # don't log results for the non-master process 11 | if distributed_rank > 0: 12 | return logger 13 | ch = logging.StreamHandler(stream=sys.stdout) 14 | ch.setLevel(logging.DEBUG) 15 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | if save_dir: 20 | filename = f'{datetime.now().date()}-{datetime.now().hour}:{datetime.now().minute}:{datetime.now().second}-GLIP.log.txt' 21 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 22 | fh.setLevel(logging.DEBUG) 23 | fh.setFormatter(formatter) 24 | logger.addHandler(fh) 25 | 26 | return logger 27 | -------------------------------------------------------------------------------- /retrieval/configs/cddb_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "cddb", 4 | "task_name": ["gaugan", "biggan", "wild", "whichfaceisreal", "san"], 5 | "data_path": "/home/wangyabin/workspace/datasets/DeepFake_Data/CL_data/", 6 | "multiclass": [0, 0, 0, 0, 0], 7 | "class_order": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 8 | "total_sessions" : 7, 9 | "memory_size": 0, 10 | "memory_per_class": 0, 11 | "fixed_memory": true, 12 | "shuffle": false, 13 | "init_cls": 2, 14 | "increment": 2, 15 | "model_name": "sprompts", 16 | "net_type": "slip", 17 | "embd_dim" : 768, 18 | "prompt_length" : 10, 19 | "device": ["0","1"], 20 | "seed": [1993], 21 | "EPSILON" : 1e-8, 22 | "init_epoch" : 20, 23 | "init_lr" : 0.001, 24 | "init_milestones" : [20,30,40], 25 | "init_lr_decay" : 0.1, 26 | "init_weight_decay" : 0.0005, 27 | "epochs" : 50, 28 | "lrate" : 0.01, 29 | "milestones" : [20, 30], 30 | "lrate_decay" : 0.1, 31 | "batch_size" : 128, 32 | "weight_decay" : 2e-4, 33 | "num_workers" : 16 34 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
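# Usage sketch (illustrative comment, not part of the original file): the dataset factory in
# maskrcnn_benchmark/data/build.py (not shown here) looks these classes up by name; a direct
# construction, with placeholder paths, would look roughly like
#   dataset = COCODataset(ann_file="annotations/instances_train2017.json", root="train2017",
#                         remove_images_without_annotations=True, transforms=transforms)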
2 | from .coco import COCODataset 3 | from .voc import PascalVOCDataset 4 | from .concat_dataset import ConcatDataset 5 | from .background import Background 6 | from .tsv import TSVDataset, ODTSVDataset 7 | 8 | from .modulated_coco import ModulatedDataset, CocoDetection, CocoGrounding 9 | from .flickr import FlickrDataset 10 | from .refexp import RefExpDataset 11 | from .mixed import MixedDataset 12 | from .gqa import GQADataset 13 | 14 | from .coco_dt import CocoDetectionTSV 15 | from .caption import CaptionTSV 16 | from .lvis import LvisDetection 17 | from .pseudo_data import PseudoData 18 | from .phrasecut import PhrasecutDetection 19 | 20 | __all__ = ["COCODataset", "TSVDataset", "ODTSVDataset", "ConcatDataset", "PascalVOCDataset", "Background", 21 | "ModulatedDataset", "MixedDataset", "CocoDetection", "FlickrDataset", "RefExpDataset", "GQADataset", 22 | "CocoDetectionTSV", "CocoGrounding", "CaptionTSV", "LvisDetection", "PseudoData", "PhrasecutDetection" 23 | ] 24 | -------------------------------------------------------------------------------- /grounding/cmd/cmd.txt: -------------------------------------------------------------------------------- 1 | Environment parameter 2 | CUDA_VISIBLE_DEVICES=2,3;TOKENIZERS_PARALLELISM=(true | false) 3 | 4 | # finetune 5 | --nproc_per_node=2 tools/finetune.py --config-file configs/refcoco/finetune_A.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_1_1 --evaluate_only_best_on_test --push_both_val_and_test 6 | 7 | # inference 8 | python tools/testgrounding_net.py --config-file configs/refcoco/refcoco.yaml --task_config configs/refcoco/val.yaml --weight MODEL/glip_a_tiny_o365.pth OUTPUT_DIR ./ TEST.IMS_PER_BATCH 1 SOLVER.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding 9 | 10 | python -m torch.distributed.launch --nproc_per_node=2 tools/testgrounding_net.py --config-file configs/refcoco/refcoco.yaml --task_config configs/refcoco/val.yaml --weight MODEL/glip_a_tiny_o365.pth OUTPUT_DIR ./ TEST.IMS_PER_BATCH 2 SOLVER.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding 11 | 12 | python -m torch.distributed.launch --nproc_per_node=2 tools/finetune.py --config-file configs/refcoco/finetune_A.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_5_1 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/list_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
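# Usage sketch (illustrative comment, not part of the original file; paths are placeholders):
#   dataset = ListDataset(["/path/to/a.jpg", "/path/to/b.jpg"], transforms=None)
#   img, dummy_target = dataset[0]  # target is a full-image BoxList placeholder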
2 | """ 3 | Simple dataset class that wraps a list of path names 4 | """ 5 | 6 | from PIL import Image 7 | 8 | from maskrcnn_benchmark.structures.bounding_box import BoxList 9 | 10 | 11 | class ListDataset(object): 12 | def __init__(self, image_lists, transforms=None): 13 | self.image_lists = image_lists 14 | self.transforms = transforms 15 | 16 | def __getitem__(self, item): 17 | img = Image.open(self.image_lists[item]).convert("RGB") 18 | 19 | # dummy target 20 | w, h = img.size 21 | target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") 22 | 23 | if self.transforms is not None: 24 | img, target = self.transforms(img, target) 25 | 26 | return img, target 27 | 28 | def __len__(self): 29 | return len(self.image_lists) 30 | 31 | def get_img_info(self, item): 32 | """ 33 | Return the image dimensions for the image, without 34 | loading and pre-processing it 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/backbone/mixer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MixedOperationRandom(nn.Module): 5 | def __init__(self, search_ops): 6 | super(MixedOperationRandom, self).__init__() 7 | self.ops = nn.ModuleList(search_ops) 8 | self.num_ops = len(search_ops) 9 | 10 | def forward(self, x, x_path=None): 11 | if x_path is None: 12 | output = sum(op(x) for op in self.ops) / self.num_ops 13 | else: 14 | assert isinstance(x_path, (int, float)) and 0 <= x_path < self.num_ops or isinstance(x_path, torch.Tensor) 15 | if isinstance(x_path, (int, float)): 16 | x_path = int(x_path) 17 | assert 0 <= x_path < self.num_ops 18 | output = self.ops[x_path](x) 19 | elif isinstance(x_path, torch.Tensor): 20 | assert x_path.size(0) == x.size(0), 'batch_size should match length of y_idx' 21 | output = torch.cat([self.ops[int(x_path[i].item())](x.narrow(0, i, 1)) 22 | for i in range(x.size(0))], dim=0) 23 | return output -------------------------------------------------------------------------------- /retrieval/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Fu-Yun Wang. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /grounding/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_l2p.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 5, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 5, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 16, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | "vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 10, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 10, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "end", 44 | "prompt_type" : "l2p" 45 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_clip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 0, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 0, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | 
"vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 16, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 10, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "none", 44 | "prompt_type" : "clip" 45 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_lpi.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "image_root": "/home1/yanweicai/DATA/CV/coco", 5 | "annotation_train_root": "/home1/yanweicai/DATA/CV/coco/annotations/retrieval_train2014.json", 6 | "annotation_val_root": "/home1/yanweicai/DATA/CV/coco/annotations/retrieval_val2014.json", 7 | "memory_size": 0, 8 | "memory_per_class": 0, 9 | "fixed_memory": true, 10 | "shuffle": false, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "visual_dim": 768, 15 | "textual_dim": 512, 16 | "prompt_length" : 16, 17 | "total_sessions" : 12, 18 | "device": ["0"], 19 | "seed": [1993], 20 | "EPSILON" : 1e-8, 21 | "init_epoch" : 10, 22 | "init_lr" : 0.05, 23 | "init_lr_decay" : 0.1, 24 | "init_weight_decay" : 0.0005, 25 | "epochs" : 10, 26 | "lrate" : 0.05, 27 | "lrate_decay" : 0.1, 28 | "batch_size" : 64, 29 | "weight_decay" : 2e-4, 30 | "num_workers" : 8, 31 | 32 | "backbonename": "ViT-B/16", 33 | "NCTX": 16, 34 | "CTXINIT": "", 35 | "CSC": false, 36 | "CLASS_TOKEN_POSITION": "end", 37 | "prompt_depth": 3, 38 | "prompt_type" : "lpi" 39 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_sprompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 16, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 10, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 10, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | "vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 16, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 16, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "end", 44 | "prompt_type" : "sprompts" 45 | } -------------------------------------------------------------------------------- /retrieval/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from trainer import train 4 | 5 | 6 | def main(): 7 | args = setup_parser().parse_args() 8 | # param = load_json('configs/coco_org_sprompt.json') 9 | ''' 10 | configs: 11 | clip: configs/lpi/coco_clip.json 12 | l2p: configs/lpi/coco_l2p.json 13 | S-prompts: configs/lpi/coco_sprompts.json 14 | lpi(ours): configs/lpi/coco_sprompts.json 15 | ''' 16 | # param = load_json('configs/lpi/coco_org_sprompt.json') 17 | 18 | param = 
load_json(args.config) 19 | args = vars(args) # Converting argparse Namespace to a dict. 20 | args.update(param) # Add parameters from json 21 | train(args) 22 | 23 | 24 | def load_json(settings_path): 25 | with open(settings_path) as data_file: 26 | param = json.load(data_file) 27 | 28 | return param 29 | 30 | 31 | def setup_parser(): 32 | parser = argparse.ArgumentParser(description='Reproduce of multiple continual learning algorthms.') 33 | parser.add_argument('--config', type=str, default='./exps/finetune.json', 34 | help='Json file of settings.') 35 | parser.add_argument('--local_rank', default=-1) 36 | return parser 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /grounding/shell/cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task_layer_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 5 | 6 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 7 | 8 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 9 | 10 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 11 | 12 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 13 | # 14 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 15 | # 16 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 -------------------------------------------------------------------------------- /retrieval/configs/coco_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "image_root": "/root/autodl-tmp/coco", 5 | "annotation_root": "/root/autodl-tmp/coco/annotations/retrieval_train2014.json", 6 | "memory_size": 0, 7 | "memory_per_class": 0, 8 | "fixed_memory": true, 9 | "shuffle": false, 10 | "init_cls": 345, 11 | "increment": 345, 12 | "model_name": "sprompts", 13 | "net_type": "slip", 14 | "embd_dim" : 768, 15 | "prompt_length" : 16, 16 | "total_sessions" : 12, 17 | "device": ["0"], 18 | "seed": [1993], 19 | "EPSILON" : 1e-8, 20 | "init_epoch" : 5, 21 | "init_lr" : 0.05, 22 | "init_lr_decay" : 0.1, 23 | "init_weight_decay" : 0.0005, 24 | "epochs" : 5, 25 | "lrate" : 0.05, 26 | "lrate_decay" : 0.1, 27 | "batch_size" : 128, 28 | "weight_decay" : 2e-4, 29 | "num_workers" : 16, 30 | 31 | "trainer": "CMPA", 32 | "vision_depth": 0, 33 | "language_depth": 0, 34 | "vision_ctx": 0, 35 | "language_ctx": 0, 36 | "cmpa_length": 16, 37 | "fusing": "mean", 38 | "parameter_sharing": true, 39 | 40 | "backbonename": "ViT-B/16", 41 | "NCTX": 16, 42 | "CTXINIT": "", 43 | "CSC": false, 44 | "CLASS_TOKEN_POSITION": "end" 45 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.device().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /grounding/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | 4 | from maskrcnn_benchmark.utils.imports import import_file 5 | 6 | 7 | def setup_environment(): 8 | """Perform environment setup work. The default setup is a no-op, but this 9 | function allows the user to specify a Python source file that performs 10 | custom setup work that may be necessary to their computing environment. 11 | """ 12 | custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") 13 | if custom_module_path: 14 | setup_custom_environment(custom_module_path) 15 | else: 16 | # The default setup is a no-op 17 | pass 18 | 19 | 20 | def setup_custom_environment(custom_module_path): 21 | """Load custom environment setup from a Python source file and run the setup 22 | function. 23 | """ 24 | module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) 25 | assert hasattr(module, "setup_environment") and callable( 26 | module.setup_environment 27 | ), ( 28 | "Custom environment module defined in {} does not have the " 29 | "required callable attribute 'setup_environment'." 
30 | ).format( 31 | custom_module_path 32 | ) 33 | module.setup_environment() 34 | 35 | 36 | # Force environment setup when this module is imported 37 | setup_environment() 38 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark import layers 5 | 6 | 7 | class KeypointRCNNPredictor(nn.Module): 8 | def __init__(self, cfg): 9 | super(KeypointRCNNPredictor, self).__init__() 10 | input_features = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS[-1] 11 | num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES 12 | deconv_kernel = 4 13 | self.kps_score_lowres = layers.ConvTranspose2d( 14 | input_features, 15 | num_keypoints, 16 | deconv_kernel, 17 | stride=2, 18 | padding=deconv_kernel // 2 - 1, 19 | ) 20 | nn.init.kaiming_normal_( 21 | self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" 22 | ) 23 | nn.init.constant_(self.kps_score_lowres.bias, 0) 24 | self.up_scale = 2 25 | 26 | def forward(self, x): 27 | x = self.kps_score_lowres(x) 28 | x = layers.interpolate( 29 | x, scale_factor=self.up_scale, mode="bilinear", align_corners=False 30 | ) 31 | return x 32 | 33 | 34 | _ROI_KEYPOINT_PREDICTOR = {"KeypointRCNNPredictor": KeypointRCNNPredictor} 35 | 36 | 37 | def make_roi_keypoint_predictor(cfg): 38 | func = _ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] 39 | return func(cfg) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # grounding 2 | grounding/*.egg-info 3 | grounding/*.pyc 4 | grounding/build/ 5 | grounding/DATASET/ 6 | grounding/OUTPUT/ 7 | grounding/MODEL/ 8 | grounding/best_model/ 9 | grounding/all_key/ 10 | grounding/OUTPUT_org 11 | 12 | # compilation and distribution 13 | __pycache__ 14 | _ext 15 | *.so 16 | grounding/maskrcnn_benchmark.egg-info/ 17 | grounding/dist/ 18 | 19 | # pytorch/python/numpy formats 20 | grounding/*.pth 21 | grounding/*.pkl 22 | grounding/*.npy 23 | 24 | # ipython/jupyter notebooks 25 | *.ipynb 26 | **/.ipynb_checkpoints/ 27 | 28 | # Editor temporaries 29 | *.swn 30 | *.swo 31 | *.swp 32 | *~ 33 | 34 | # Pycharm editor settings 35 | .idea 36 | 37 | # vscode editor settings 38 | .vscode 39 | 40 | # MacOS 41 | .DS_Store 42 | 43 | # Custom 44 | *.custom.py 45 | 46 | # logs 47 | logs/ 48 | log_new/ 49 | 50 | # res 51 | grounding/FINAL_RES/ 52 | grounding/FINAL_RES_v2/ 53 | grounding/eval/ 54 | 55 | # visualize 56 | grounding/visualize_base/ 57 | grounding/visualize_glip/ 58 | grounding/visualize_lpi/ 59 | grounding/visualize_lpi_v2/ 60 | grounding/visualize_maple/ 61 | grounding/visualize_sprompts/ 62 | 63 | # task_div 64 | grounding/TASK_DIV/ 65 | grounding/TASK_DIV_REFCOCO/ 66 | 67 | # prompt_save 68 | grounding/prompt_save/prompt_grounding/ 69 | 70 | # embedding save 71 | grounding/embedding_save/ 72 | 73 | # retrieval 74 | __pycache__ 75 | .idea/ 76 | retrieval/logs/ 77 | retrieval/logss/ 78 | retrieval/res/ 79 | retrieval/.vscode/ 80 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.device().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | 30 | 31 | std::pair soft_nms(const at::Tensor& dets, 32 | const at::Tensor& scores, 33 | const float threshold, 34 | const float sigma) { 35 | 36 | if (dets.device().is_cuda()) { 37 | #ifdef WITH_CUDA 38 | AT_ERROR("Soft NMS Does Not have GPU support"); 39 | #endif 40 | } 41 | 42 | std::pair result = soft_nms_cpu(dets, scores, threshold, sigma); 43 | 44 | return result; 45 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/evonorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class EvoNorm2d(nn.Module): 6 | __constants__ = ['num_features', 'eps', 'nonlinearity'] 7 | 8 | def __init__(self, num_features, eps=1e-5, nonlinearity=True, group=32): 9 | super(EvoNorm2d, self).__init__() 10 | 11 | self.num_features = num_features 12 | self.eps = eps 13 | self.nonlinearity = nonlinearity 14 | self.group = group 15 | 16 | self.weight = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 17 | self.bias = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 18 | if self.nonlinearity: 19 | self.v = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 20 | 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | nn.init.ones_(self.weight) 25 | nn.init.zeros_(self.bias) 26 | if self.nonlinearity: 27 | nn.init.ones_(self.v) 28 | 29 | def group_std(self, x, groups=32): 30 | N, C, H, W = x.shape 31 | x = torch.reshape(x, (N, groups, C // groups, H, W)) 32 | std = torch.std(x, (3, 4), keepdim=True) 33 | return torch.reshape(std + self.eps, (N, C, 1, 1)) 34 | 35 | def forward(self, x): 36 | if self.nonlinearity: 37 | num = x * torch.sigmoid(self.v * x) 38 | return num / self.group_std(x, self.group) * self.weight + self.bias 39 | else: 40 | return x * self.weight + self.bias -------------------------------------------------------------------------------- /grounding/test/task_visual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import spatial 3 | import matplotlib.pyplot as plt 4 | from sklearn.manifold import TSNE 5 | import matplotlib.cm as cm 6 | import numpy as np 7 | from openai import OpenAI 8 | api_key = "sk-MazpnWiEWQhrgtP8526a79F8D7254a5894296d2d81Ea6c7a" 9 | api_base = "https://oneapi.xty.app/v1" 10 | 11 | client = OpenAI(api_key=api_key, base_url=api_base) 12 | 13 | def get_embedding(text, model="text-embedding-3-large"): 14 | text = text.replace("\n", " ") 15 | return client.embeddings.create(input = [text], model=model).data[0].embedding 16 | 17 | 18 | tsne = TSNE(n_components=2, random_state=0, perplexity=1) 19 | words = ['appliance', 'sports', 'outdoor', 'electronic', 'accessory', 'indoor', 'kitchen', 'furniture', 'vehicle', 'food', 'animal', 'person'] 20 | 
vectors = [get_embedding(word) for word in words] 21 | vectors = np.array(vectors) 22 | 23 | np.savetxt('../MID/tasks_array.txt', vectors) 24 | 25 | Y = tsne.fit_transform(vectors) 26 | 27 | # colors = cm.rainbow(np.linspace(0, 1, Y.shape[0])) 28 | colors = plt.get_cmap('tab20')(range(12)) 29 | for dataset, color, label in zip(Y, colors, words): 30 | plt.scatter(dataset[0], dataset[1], color=color, label=label) 31 | 32 | plt.xlabel("X") 33 | plt.ylabel("Y") 34 | plt.legend() 35 | plt.savefig('../MID/task_visual.png') 36 | # for dataset, label in zip(Y, words): 37 | # plt.annotate(label, (dataset[0], dataset[1]), textcoords='offset points',xytext=(0,10), ha='center') 38 | plt.show() 39 | 40 | # plt.savefig('../MID/task_visual.png') -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.modeling import registry 6 | from . import bert_model 7 | from . import rnn_model 8 | from . import clip_model 9 | from . import word_utils 10 | 11 | 12 | @registry.LANGUAGE_BACKBONES.register("bert-base-uncased") 13 | def build_bert_backbone(cfg): 14 | body = bert_model.BertEncoder(cfg) 15 | model = nn.Sequential(OrderedDict([("body", body)])) 16 | return model 17 | 18 | 19 | @registry.LANGUAGE_BACKBONES.register("roberta-base") 20 | def build_bert_backbone(cfg): 21 | body = bert_model.BertEncoder(cfg) 22 | model = nn.Sequential(OrderedDict([("body", body)])) 23 | return model 24 | 25 | 26 | @registry.LANGUAGE_BACKBONES.register("rnn") 27 | def build_rnn_backbone(cfg): 28 | body = rnn_model.RNNEnoder(cfg) 29 | model = nn.Sequential(OrderedDict([("body", body)])) 30 | return model 31 | 32 | 33 | @registry.LANGUAGE_BACKBONES.register("clip") 34 | def build_clip_backbone(cfg): 35 | body = clip_model.CLIPTransformer(cfg) 36 | model = nn.Sequential(OrderedDict([("body", body)])) 37 | return model 38 | 39 | 40 | def build_backbone(cfg): 41 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in registry.LANGUAGE_BACKBONES, \ 42 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 43 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 44 | ) 45 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg) 46 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def _register_generic(module_dict, module_name, module): 5 | assert module_name not in module_dict 6 | module_dict[module_name] = module 7 | 8 | 9 | class Registry(dict): 10 | ''' 11 | A helper class for managing registering modules, it extends a dictionary 12 | and provides a register functions. 13 | 14 | Eg. creeting a registry: 15 | some_registry = Registry({"default": default_module}) 16 | 17 | There're two ways of registering new modules: 18 | 1): normal way is just calling register function: 19 | def foo(): 20 | ... 21 | some_registry.register("foo_module", foo) 22 | 2): used as decorator when declaring the module: 23 | @some_registry.register("foo_module") 24 | @some_registry.register("foo_modeul_nickname") 25 | def foo(): 26 | ... 
27 | 28 | Access of module is just like using a dictionary, eg: 29 | f = some_registry["foo_modeul"] 30 | ''' 31 | def __init__(self, *args, **kwargs): 32 | super(Registry, self).__init__(*args, **kwargs) 33 | 34 | def register(self, module_name, module=None): 35 | # used as function call 36 | if module is not None: 37 | _register_generic(self, module_name, module) 38 | return 39 | 40 | # used as decorator 41 | def register_fn(fn): 42 | _register_generic(self, module_name, fn) 43 | return fn 44 | 45 | return register_fn 46 | -------------------------------------------------------------------------------- /grounding/odinw/download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | argparser = argparse.ArgumentParser() 5 | argparser.add_argument("--dataset_names", default="all", type=str) # "all" or names joined by comma 6 | argparser.add_argument("--dataset_path", default="DATASET/odinw", type=str) 7 | args = argparser.parse_args() 8 | 9 | root = "https://huggingface.co/GLIPModel/GLIP/tree/main/odinw_35" 10 | 11 | all_datasets = ["AerialMaritimeDrone", "AmericanSignLanguageLetters", "Aquarium", "BCCD", "ChessPieces", "CottontailRabbits", "DroneControl", "EgoHands", "HardHatWorkers", "MaskWearing", "MountainDewCommercial", "NorthAmericaMushrooms", "OxfordPets", "PKLot", "Packages", "PascalVOC", "Raccoon", "ShellfishOpenImages", "ThermalCheetah", "UnoCards", "VehiclesOpenImages", "WildfireSmoke", "boggleBoards", "brackishUnderwater", "dice", "openPoetryVision", "pistols", "plantdoc", "pothole", "selfdrivingCar", "thermalDogsAndPeople", "vector", "websiteScreenshots"] 12 | 13 | datasets_to_download = [] 14 | if args.dataset_names == "all": 15 | datasets_to_download = all_datasets 16 | else: 17 | datasets_to_download = args.dataset_names.split(",") 18 | 19 | for dataset in datasets_to_download: 20 | if dataset in all_datasets: 21 | print("Downloading dataset: ", dataset) 22 | os.system("wget " + root + "/" + dataset + ".zip" + " -O " + args.dataset_path + "/" + dataset + ".zip") 23 | os.system("unzip " + args.dataset_path + "/" + dataset + ".zip -d " + args.dataset_path) 24 | os.system("rm " + args.dataset_path + "/" + dataset + ".zip") 25 | else: 26 | print("Dataset not found: ", dataset) 27 | -------------------------------------------------------------------------------- /grounding/matrix/matrix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from torch.nn.functional import binary_cross_entropy_with_logits 5 | 6 | def nt_bxent_loss(x, target, temperature=1.0): 7 | assert len(x.size()) == 2 8 | target = target.type(torch.float32).to(x.device) 9 | # Cosine similarity 10 | xcs = F.cosine_similarity(x[None, :, :], x[:, None, :], dim=-1) 11 | # Set logit of diagonal element to "inf" signifying complete 12 | # correlation. sigmoid(inf) = 1.0 so this will work out nicely 13 | # when computing the Binary cross-entropy Loss. 14 | xcs[torch.eye(x.size(0)).bool()] = float("inf") 15 | 16 | # Standard binary cross-entropy loss. We use binary_cross_entropy() here and not 17 | # binary_cross_entropy_with_logits() because of 18 | # https://github.com/pytorch/pytorch/issues/102894 19 | # The method *_with_logits() uses the log-sum-exp-trick, which causes inf and -inf values 20 | # to result in a NaN result. 
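    # Illustrative sketch (comment added for clarity, not part of the original file): for a
    # batch of four embeddings in which rows {0, 1} and rows {2, 3} are the positive pairs,
    # the expected `target` matrix (diagonal counted as a positive, matching the inf trick
    # above) would be
    #   target = torch.tensor([[1, 1, 0, 0],
    #                          [1, 1, 0, 0],
    #                          [0, 0, 1, 1],
    #                          [0, 0, 1, 1]])
    # so each row contributes num_pos = 2 positive and num_neg = 2 negative terms below.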
21 | loss = F.binary_cross_entropy(input=(xcs / temperature).sigmoid(), target=target, reduction="none") 22 | 23 | target_pos = target.bool() 24 | target_neg = ~target_pos 25 | 26 | loss_pos = torch.zeros(x.size(0), x.size(0)).to(x.device).masked_scatter(target_pos, loss[target_pos]) 27 | loss_neg = torch.zeros(x.size(0), x.size(0)).to(x.device).masked_scatter(target_neg, loss[target_neg]) 28 | loss_pos = loss_pos.sum(dim=1) 29 | loss_neg = loss_neg.sum(dim=1) 30 | num_pos = target.sum(dim=1) 31 | num_neg = x.size(0) - num_pos 32 | 33 | return ((loss_pos / num_pos) + (loss_neg / num_neg)).mean() -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .batch_norm import FrozenBatchNorm2d, NaiveSyncBatchNorm2d 5 | from .misc import Conv2d, _NewEmptyTensorOp 6 | from .misc import ConvTranspose2d 7 | from .misc import DFConv2d 8 | from .misc import interpolate 9 | from .misc import Scale 10 | from .nms import nms 11 | from .nms import ml_nms 12 | from .nms import soft_nms 13 | from .roi_align import ROIAlign 14 | from .roi_align import roi_align 15 | from .roi_align import ROIAlignV2 16 | from .roi_pool import ROIPool 17 | from .roi_pool import roi_pool 18 | from .smooth_l1_loss import smooth_l1_loss 19 | from .sigmoid_focal_loss import SigmoidFocalLoss, TokenSigmoidFocalLoss 20 | from .iou_loss import IOULoss, IOUWHLoss 21 | from .deform_conv import DeformConv, ModulatedDeformConv 22 | from .dropblock import DropBlock2D, DropBlock3D 23 | from .evonorm import EvoNorm2d 24 | from .dyrelu import DYReLU, swish 25 | from .se import SELayer, SEBlock 26 | from .dyhead import DyHead 27 | from .set_loss import HungarianMatcher, SetCriterion 28 | 29 | __all__ = ["nms", "ml_nms", "soft_nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", 30 | "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "swish", 31 | "FrozenBatchNorm2d", "NaiveSyncBatchNorm2d", "SigmoidFocalLoss", "TokenSigmoidFocalLoss", "IOULoss", 32 | "IOUWHLoss", "Scale", "DeformConv", "ModulatedDeformConv", "DyHead", 33 | "DropBlock2D", "DropBlock3D", "EvoNorm2d", "DYReLU", "SELayer", "SEBlock", 34 | "HungarianMatcher", "SetCriterion", "ROIAlignV2", "_NewEmptyTensorOp"] 35 | -------------------------------------------------------------------------------- /grounding/configs/odinw_35/_all.json: -------------------------------------------------------------------------------- 1 |
["configs/odinw_35/AerialMaritimeDrone_large.yaml","configs/odinw_35/AerialMaritimeDrone_tiled.yaml","configs/odinw_35/AmericanSignLanguageLetters_American_Sign_Language_Letters.v1-v1.coco.yaml","configs/odinw_35/Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml","configs/odinw_35/BCCD_BCCD.v3-raw.coco.yaml","configs/odinw_35/ChessPieces_Chess_Pieces.v23-raw.coco.yaml","configs/odinw_35/CottontailRabbits.yaml","configs/odinw_35/DroneControl_Drone_Control.v3-raw.coco.yaml","configs/odinw_35/EgoHands_generic.yaml","configs/odinw_35/EgoHands_specific.yaml","configs/odinw_35/HardHatWorkers_raw.yaml","configs/odinw_35/MaskWearing_raw.yaml","configs/odinw_35/MountainDewCommercial.yaml","configs/odinw_35/NorthAmericaMushrooms_North_American_Mushrooms.v1-416x416.coco.yaml","configs/odinw_35/OxfordPets_by-breed.yaml","configs/odinw_35/OxfordPets_by-species.yaml","configs/odinw_35/PKLot_640.yaml","configs/odinw_35/Packages_Raw.yaml","configs/odinw_35/PascalVOC.yaml","configs/odinw_35/Raccoon_Raccoon.v2-raw.coco.yaml","configs/odinw_35/ShellfishOpenImages_raw.yaml","configs/odinw_35/ThermalCheetah.yaml","configs/odinw_35/UnoCards_raw.yaml","configs/odinw_35/VehiclesOpenImages_416x416.yaml","configs/odinw_35/WildfireSmoke.yaml","configs/odinw_35/boggleBoards_416x416AutoOrient_export_.yaml","configs/odinw_35/brackishUnderwater_960x540.yaml","configs/odinw_35/dice_mediumColor_export.yaml","configs/odinw_35/openPoetryVision_512x512.yaml","configs/odinw_35/pistols_export.yaml","configs/odinw_35/plantdoc_416x416.yaml","configs/odinw_35/pothole.yaml","configs/odinw_35/selfdrivingCar_fixedLarge_export_.yaml","configs/odinw_35/thermalDogsAndPeople.yaml","configs/odinw_35/websiteScreenshots.yaml"] -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #include "nms.h" 3 | #include "ml_nms.h" 4 | #include "ROIAlign.h" 5 | #include "ROIPool.h" 6 | #include "SigmoidFocalLoss.h" 7 | #include "deform_conv.h" 8 | #include "deform_pool.h" 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("nms", &nms, "non-maximum suppression"); 12 | m.def("ml_nms", &ml_nms, "multi-label non-maximum suppression"); 13 | m.def("soft_nms", &soft_nms, "soft non-maximum suppression"); 14 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 15 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 16 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 17 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 18 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 19 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 20 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); 21 | m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); 22 | m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); 23 | m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); 24 | m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); 25 | m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); 26 | m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); 27 | } 28 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/background.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch 7 | import torchvision 8 | import torch.utils.data as data 9 | from maskrcnn_benchmark.structures.bounding_box import BoxList 10 | 11 | class Background(data.Dataset): 12 | """ Background 13 | 14 | Args: 15 | root (string): Root directory where images are downloaded to. 16 | annFile (string): Path to json annotation file. 17 | transform (callable, optional): A function/transform that takes in an PIL image 18 | and returns a transformed version. E.g, ``transforms.ToTensor`` 19 | """ 20 | 21 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 22 | self.root = root 23 | 24 | with open(ann_file, 'r') as f: 25 | self.ids = json.load(f)['images'] 26 | self.transform = transforms 27 | 28 | def __getitem__(self, index): 29 | """ 30 | Args: 31 | index (int): Index 32 | 33 | Returns: 34 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
35 | """ 36 | im_info = self.ids[index] 37 | path = im_info['file_name'] 38 | fp = os.path.join(self.root, path) 39 | 40 | img = Image.open(fp).convert('RGB') 41 | if self.transform is not None: 42 | img, _ = self.transform(img, None) 43 | null_target = BoxList(torch.zeros((0,4)), (img.shape[-1], img.shape[-2])) 44 | null_target.add_field('labels', torch.zeros(0)) 45 | 46 | return img, null_target, index 47 | 48 | def __len__(self): 49 | return len(self.ids) 50 | 51 | def get_img_info(self, index): 52 | im_info = self.ids[index] 53 | return im_info -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/ema.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from collections import OrderedDict 3 | import torch 4 | 5 | 6 | class ModelEma: 7 | def __init__(self, model, decay=0.9999, device=''): 8 | self.ema = deepcopy(model) 9 | self.ema.eval() 10 | self.decay = decay 11 | self.device = device 12 | if device: 13 | self.ema.to(device=device) 14 | self.ema_is_dp = hasattr(self.ema, 'module') 15 | for p in self.ema.parameters(): 16 | p.requires_grad_(False) 17 | 18 | def load_checkpoint(self, checkpoint): 19 | if isinstance(checkpoint, str): 20 | checkpoint = torch.load(checkpoint) 21 | 22 | assert isinstance(checkpoint, dict) 23 | if 'model_ema' in checkpoint: 24 | new_state_dict = OrderedDict() 25 | for k, v in checkpoint['model_ema'].items(): 26 | if self.ema_is_dp: 27 | name = k if k.startswith('module') else 'module.' + k 28 | else: 29 | name = k.replace('module.', '') if k.startswith('module') else k 30 | new_state_dict[name] = v 31 | self.ema.load_state_dict(new_state_dict) 32 | 33 | def state_dict(self): 34 | return self.ema.state_dict() 35 | 36 | def update(self, model): 37 | pre_module = hasattr(model, 'module') and not self.ema_is_dp 38 | with torch.no_grad(): 39 | curr_msd = model.state_dict() 40 | for k, ema_v in self.ema.state_dict().items(): 41 | k = 'module.' + k if pre_module else k 42 | model_v = curr_msd[k].detach() 43 | if self.device: 44 | model_v = model_v.to(device=self.device) 45 | ema_v.copy_(ema_v * self.decay + (1. 
- self.decay) * model_v) 46 | 47 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/pretrain_model_loading.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from collections import OrderedDict 6 | 7 | def _remove_bn_statics(state_dict): 8 | layer_keys = sorted(state_dict.keys()) 9 | remove_list = [] 10 | for key in layer_keys: 11 | if 'running_mean' in key or 'running_var' in key or 'num_batches_tracked' in key: 12 | remove_list.append(key) 13 | for key in remove_list: 14 | del state_dict[key] 15 | return state_dict 16 | 17 | def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): 18 | import re 19 | layer_keys = sorted(state_dict.keys()) 20 | for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): 21 | if not stage_with_dcn: 22 | continue 23 | for old_key in layer_keys: 24 | pattern = ".*layer{}.*conv2.*".format(ix) 25 | r = re.match(pattern, old_key) 26 | if r is None: 27 | continue 28 | for param in ["weight", "bias"]: 29 | if old_key.find(param) == -1: 30 | continue 31 | if 'unit01' in old_key: 32 | continue 33 | new_key = old_key.replace( 34 | "conv2.{}".format(param), "conv2.conv.{}".format(param) 35 | ) 36 | print("pattern: {}, old_key: {}, new_key: {}".format( 37 | pattern, old_key, new_key 38 | )) 39 | state_dict[new_key] = state_dict[old_key] 40 | del state_dict[old_key] 41 | return state_dict 42 | 43 | 44 | def load_pretrain_format(cfg, f): 45 | model = torch.load(f) 46 | model = _remove_bn_statics(model) 47 | model = _rename_conv_weights_for_deformable_conv_layers(model, cfg) 48 | 49 | return dict(model=model) 50 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.device().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from . 
import transforms as T 3 | from torchvision import transforms 4 | 5 | def build_transforms(cfg, is_train=True): 6 | if is_train: 7 | if len(cfg.AUGMENT.MULT_MIN_SIZE_TRAIN)>0: 8 | min_size = cfg.AUGMENT.MULT_MIN_SIZE_TRAIN 9 | else: 10 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 11 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 12 | flip_horizontal_prob = cfg.AUGMENT.FLIP_PROB_TRAIN 13 | flip_vertical_prob = cfg.AUGMENT.VERTICAL_FLIP_PROB_TRAIN 14 | brightness = cfg.AUGMENT.BRIGHTNESS 15 | contrast = cfg.AUGMENT.CONTRAST 16 | saturation = cfg.AUGMENT.SATURATION 17 | hue = cfg.AUGMENT.HUE 18 | 19 | crop_prob = cfg.AUGMENT.CROP_PROB 20 | min_ious = cfg.AUGMENT.CROP_MIN_IOUS 21 | min_crop_size = cfg.AUGMENT.CROP_MIN_SIZE 22 | 23 | else: 24 | min_size = cfg.INPUT.MIN_SIZE_TEST 25 | max_size = cfg.INPUT.MAX_SIZE_TEST 26 | flip_horizontal_prob = 0.0 27 | 28 | fix_res = cfg.INPUT.FIX_RES 29 | if cfg.INPUT.FORMAT is not '': 30 | input_format = cfg.INPUT.FORMAT 31 | elif cfg.INPUT.TO_BGR255: 32 | input_format = 'bgr255' 33 | normalize_transform = T.Normalize( 34 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format 35 | ) 36 | min_size = 448 37 | max_size = 448 38 | transform = T.Compose( 39 | [ 40 | # T.Resize(min_size, max_size, restrict=fix_res), 41 | T.Resize(min_size=min_size, max_size=max_size, restrict=True), 42 | # transforms.Resize(320), 43 | # T.Resize(320), 44 | T.RandomHorizontalFlip(flip_horizontal_prob), 45 | T.ToTensor(), 46 | normalize_transform, 47 | ] 48 | ) 49 | return transform 50 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/se.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SELayer(nn.Module): 5 | def __init__(self, channel, reduction=16): 6 | super(SELayer, self).__init__() 7 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 8 | self.fc = nn.Sequential( 9 | nn.Linear(channel, channel // reduction, bias=False), 10 | nn.ReLU(inplace=True), 11 | nn.Linear(channel // reduction, channel, bias=False), 12 | nn.Sigmoid() 13 | ) 14 | 15 | def forward(self, x): 16 | b, c, _, _ = x.size() 17 | y = self.avg_pool(x).view(b, c) 18 | y = self.fc(y).view(b, c, 1, 1) 19 | return x * y.expand_as(x) 20 | 21 | 22 | class SEBlock(nn.Module): 23 | def __init__(self, channels, reduction=16, 24 | use_conv=True, mid_activation=nn.ReLU(inplace=True), out_activation=nn.Sigmoid()): 25 | super(SEBlock, self).__init__() 26 | self.use_conv = use_conv 27 | mid_channels = channels // reduction 28 | 29 | self.pool = nn.AdaptiveAvgPool2d(output_size=1) 30 | if use_conv: 31 | self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, bias=True) 32 | else: 33 | self.fc1 = nn.Linear(channels, mid_channels) 34 | self.activ = mid_activation 35 | if use_conv: 36 | self.conv2 = nn.Conv2d(mid_channels, channels, kernel_size=1, bias=True) 37 | else: 38 | self.fc2 = nn.Linear(mid_channels, channels) 39 | self.sigmoid = out_activation 40 | 41 | def forward(self, x): 42 | w = self.pool(x) 43 | if not self.use_conv: 44 | w = w.view(x.size(0), -1) 45 | w = self.conv1(w) if self.use_conv else self.fc1(w) 46 | w = self.activ(w) 47 | w = self.conv2(w) if self.use_conv else self.fc2(w) 48 | w = self.sigmoid(w) 49 | if not self.use_conv: 50 | w = w.unsqueeze(2).unsqueeze(3) 51 | x = x * w 52 | return x -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/rpn/transformer.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | 5 | import copy 6 | from typing import Optional, List 7 | 8 | 9 | def _get_clones(module, N): 10 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 11 | 12 | 13 | def _get_activation_fn(activation): 14 | """Return an activation function given a string""" 15 | if activation == "relu": 16 | return F.relu 17 | if activation == "gelu": 18 | return F.gelu 19 | if activation == "glu": 20 | return F.glu 21 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 22 | 23 | 24 | class TransformerEncoderLayer(nn.Module): 25 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 26 | activation="relu", normalize_before=False): 27 | super(TransformerEncoderLayer, self).__init__() 28 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 29 | # Implementation of Feedforward model 30 | self.linear1 = nn.Linear(d_model, dim_feedforward) 31 | self.dropout = nn.Dropout(dropout) 32 | self.linear2 = nn.Linear(dim_feedforward, d_model) 33 | 34 | self.norm1 = nn.LayerNorm(d_model) 35 | self.norm2 = nn.LayerNorm(d_model) 36 | self.dropout1 = nn.Dropout(dropout) 37 | self.dropout2 = nn.Dropout(dropout) 38 | 39 | self.activation = _get_activation_fn(activation) 40 | self.normalize_before = normalize_before 41 | 42 | def forward(self, src, 43 | src_mask: Optional[Tensor] = None, 44 | src_key_padding_mask: Optional[Tensor] = None): 45 | src2 = self.self_attn(src, src, src, attn_mask=src_mask, 46 | key_padding_mask=src_key_padding_mask)[0] 47 | src = src + self.dropout1(src2) 48 | src = self.norm1(src) 49 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 50 | src = src + self.dropout2(src2) 51 | src = self.norm2(src) 52 | return src 53 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
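# Usage sketch (illustrative; the shapes and numbers below are assumptions for
# demonstration, not part of the original file). ROIPool wraps the compiled
# _C.roi_pool_forward/backward kernels and, per the csrc/ROIPool.h shim, only runs
# on a CUDA build. ROIs follow the maskrcnn-benchmark layout
# (batch_index, x1, y1, x2, y2) in image coordinates, rescaled internally by
# spatial_scale:
#   pooler = ROIPool(output_size=(7, 7), spatial_scale=1.0 / 16)
#   rois = torch.tensor([[0.0, 10.0, 20.0, 120.0, 200.0]], device="cuda")
#   pooled = pooler(feature_map, rois)  # -> (num_rois, channels, 7, 7)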
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = _C.roi_pool_forward( 18 | input, roi, spatial_scale, output_size[0], output_size[1] 19 | ) 20 | ctx.save_for_backward(input, roi, argmax) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | input, rois, argmax = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = _C.roi_pool_backward( 31 | grad_output, 32 | input, 33 | rois, 34 | argmax, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | ) 43 | return grad_input, None, None, None 44 | 45 | 46 | roi_pool = _ROIPool.apply 47 | 48 | 49 | class ROIPool(nn.Module): 50 | def __init__(self, output_size, spatial_scale): 51 | super(ROIPool, self).__init__() 52 | self.output_size = output_size 53 | self.spatial_scale = spatial_scale 54 | 55 | def forward(self, input, rois): 56 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 57 | 58 | def __repr__(self): 59 | tmpstr = self.__class__.__name__ + "(" 60 | tmpstr += "output_size=" + str(self.output_size) 61 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 62 | tmpstr += ")" 63 | return tmpstr 64 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/deform_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
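// Dispatch note (descriptive only): like the other headers in csrc/, the functions
// below are thin Python-facing shims. Each routes to its CUDA kernel when the input
// tensor lives on a GPU and the extension was built with WITH_CUDA, and otherwise
// raises, e.g.
//   if (input.device().is_cuda()) { /* deform_psroi_pooling_cuda_forward(...) */ }
//   AT_ERROR("Not implemented on the CPU");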
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | // Interface for Python 11 | void deform_psroi_pooling_forward( 12 | at::Tensor input, 13 | at::Tensor bbox, 14 | at::Tensor trans, 15 | at::Tensor out, 16 | at::Tensor top_count, 17 | const int no_trans, 18 | const float spatial_scale, 19 | const int output_dim, 20 | const int group_size, 21 | const int pooled_size, 22 | const int part_size, 23 | const int sample_per_part, 24 | const float trans_std) 25 | { 26 | if (input.device().is_cuda()) { 27 | #ifdef WITH_CUDA 28 | return deform_psroi_pooling_cuda_forward( 29 | input, bbox, trans, out, top_count, 30 | no_trans, spatial_scale, output_dim, group_size, 31 | pooled_size, part_size, sample_per_part, trans_std 32 | ); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | 41 | void deform_psroi_pooling_backward( 42 | at::Tensor out_grad, 43 | at::Tensor input, 44 | at::Tensor bbox, 45 | at::Tensor trans, 46 | at::Tensor top_count, 47 | at::Tensor input_grad, 48 | at::Tensor trans_grad, 49 | const int no_trans, 50 | const float spatial_scale, 51 | const int output_dim, 52 | const int group_size, 53 | const int pooled_size, 54 | const int part_size, 55 | const int sample_per_part, 56 | const float trans_std) 57 | { 58 | if (input.device().is_cuda()) { 59 | #ifdef WITH_CUDA 60 | return deform_psroi_pooling_cuda_backward( 61 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, 62 | no_trans, spatial_scale, output_dim, group_size, pooled_size, 63 | part_size, sample_per_part, trans_std 64 | ); 65 | #else 66 | AT_ERROR("Not compiled with GPU support"); 67 | #endif 68 | } 69 | AT_ERROR("Not implemented on the CPU"); 70 | } 71 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor 4 | from .roi_keypoint_predictors import make_roi_keypoint_predictor 5 | from .inference import make_roi_keypoint_post_processor 6 | from .loss import make_roi_keypoint_loss_evaluator 7 | 8 | 9 | class ROIKeypointHead(torch.nn.Module): 10 | def __init__(self, cfg): 11 | super(ROIKeypointHead, self).__init__() 12 | self.cfg = cfg.clone() 13 | self.feature_extractor = make_roi_keypoint_feature_extractor(cfg) 14 | self.predictor = make_roi_keypoint_predictor(cfg) 15 | self.post_processor = make_roi_keypoint_post_processor(cfg) 16 | self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) 17 | 18 | def forward(self, features, proposals, targets=None): 19 | """ 20 | Arguments: 21 | features (list[Tensor]): feature-maps from possibly several levels 22 | proposals (list[BoxList]): proposal boxes 23 | targets (list[BoxList], optional): the ground-truth targets. 24 | 25 | Returns: 26 | x (Tensor): the result of the feature extractor 27 | proposals (list[BoxList]): during training, the original proposals 28 | are returned. During testing, the predicted boxlists are returned 29 | with the `mask` field set 30 | losses (dict[Tensor]): During training, returns the losses for the 31 | head. During testing, returns an empty dict. 
32 | """ 33 | if self.training: 34 | with torch.no_grad(): 35 | proposals = self.loss_evaluator.subsample(proposals, targets) 36 | 37 | x = self.feature_extractor(features, proposals) 38 | kp_logits = self.predictor(x) 39 | 40 | if not self.training: 41 | result = self.post_processor(kp_logits, proposals) 42 | return x, result, {} 43 | 44 | loss_kp = self.loss_evaluator(proposals, kp_logits) 45 | 46 | return x, proposals, dict(loss_kp=loss_kp) 47 | 48 | 49 | def build_roi_keypoint_head(cfg): 50 | return ROIKeypointHead(cfg) -------------------------------------------------------------------------------- /grounding/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "maskrcnn_benchmark._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="maskrcnn_benchmark", 61 | description="object detection in pytorch", 62 | packages=find_packages(exclude=("configs", "tests",)), 63 | # install_requires=requirements, 64 | ext_modules=get_extensions(), 65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension.with_options(use_ninja=False)}, 66 | ) 67 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch.utils.data as data 7 | 8 | def pil_loader(path): 9 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 10 | with open(path, 'rb') as f: 11 | img = Image.open(f) 12 | return img.convert('RGB') 13 | 14 | class ImageNet(data.Dataset): 15 | """ ImageNet 16 | 17 | Args: 18 | root (string): Root directory where images are downloaded to. 19 | annFile (string): Path to json annotation file. 
20 | transform (callable, optional): A function/transform that takes in an PIL image 21 | and returns a transformed version. E.g, ``transforms.ToTensor`` 22 | """ 23 | 24 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 25 | 26 | 27 | self.root = root 28 | self.transform = transforms 29 | 30 | meta_file = os.path.join(root, ann_file) 31 | assert os.path.exists(meta_file), 'meta file %s under root %s not found' % (os.path.basename(meta_file), root) 32 | 33 | with open(meta_file, 'r') as f: 34 | meta = json.load(f) 35 | 36 | self.classes = meta['classes'] 37 | self.class_to_idx = meta['class_to_idx'] 38 | self.samples = meta['samples'] 39 | self.num_sample = len(self.samples) 40 | self.allsamples = self.samples 41 | 42 | def select_class(self, cls): 43 | new_samples = [sample for sample in self.allsamples if sample[-1] in cls] 44 | self.samples = new_samples 45 | self.num_sample = len(self.samples) 46 | 47 | def __getitem__(self, index): 48 | """ 49 | Args: 50 | index (int): Index 51 | 52 | Returns: 53 | tuple: (sample, target) where target is class_index of the target class. 54 | """ 55 | img_path, target = self.samples[index] 56 | sample = pil_loader(self.root + '/' + img_path) 57 | if self.transform is not None: 58 | sample = self.transform(sample) 59 | 60 | return sample, target, index 61 | 62 | def __len__(self): 63 | return len(self.samples) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | 4 | 5 | class FastRCNNPredictor(nn.Module): 6 | def __init__(self, config, pretrained=None): 7 | super(FastRCNNPredictor, self).__init__() 8 | 9 | stage_index = 4 10 | stage2_relative_factor = 2 ** (stage_index - 1) 11 | res2_out_channels = config.MODEL.RESNETS.RES2_OUT_CHANNELS 12 | num_inputs = res2_out_channels * stage2_relative_factor 13 | 14 | num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 15 | self.avgpool = nn.AvgPool2d(kernel_size=7, stride=7) 16 | self.cls_score = nn.Linear(num_inputs, num_classes) 17 | self.bbox_pred = nn.Linear(num_inputs, num_classes * 4) 18 | 19 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 20 | nn.init.constant_(self.cls_score.bias, 0) 21 | 22 | nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) 23 | nn.init.constant_(self.bbox_pred.bias, 0) 24 | 25 | def forward(self, x): 26 | x = self.avgpool(x) 27 | x = x.view(x.size(0), -1) 28 | cls_logit = self.cls_score(x) 29 | bbox_pred = self.bbox_pred(x) 30 | return cls_logit, bbox_pred 31 | 32 | 33 | class FPNPredictor(nn.Module): 34 | def __init__(self, cfg): 35 | super(FPNPredictor, self).__init__() 36 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 37 | representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM 38 | 39 | self.cls_score = nn.Linear(representation_size, num_classes) 40 | self.bbox_pred = nn.Linear(representation_size, num_classes * 4) 41 | 42 | nn.init.normal_(self.cls_score.weight, std=0.01) 43 | nn.init.normal_(self.bbox_pred.weight, std=0.001) 44 | for l in [self.cls_score, self.bbox_pred]: 45 | nn.init.constant_(l.bias, 0) 46 | 47 | def forward(self, x): 48 | scores = self.cls_score(x) 49 | bbox_deltas = self.bbox_pred(x) 50 | 51 | return scores, bbox_deltas 52 | 53 | 54 | _ROI_BOX_PREDICTOR = { 55 | "FastRCNNPredictor": FastRCNNPredictor, 
56 | "FPNPredictor": FPNPredictor, 57 | } 58 | 59 | 60 | def make_roi_box_predictor(cfg): 61 | func = _ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] 62 | return func(cfg) 63 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/shallow_contrastive_loss_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import maskrcnn_benchmark.utils.dist as dist 3 | 4 | 5 | def normalized_positive_map(positive_map): 6 | positive_map = positive_map.float() 7 | positive_map_num_pos = positive_map.sum(2) 8 | positive_map_num_pos[positive_map_num_pos == 0] = 1e-6 9 | positive_map = positive_map / positive_map_num_pos.unsqueeze(-1) 10 | return positive_map 11 | 12 | 13 | def pad_tensor_given_dim_length(tensor, dim, length, padding_value=0, batch_first=True): 14 | new_size = list(tensor.size()[:dim]) + [length] + list(tensor.size()[dim + 1:]) 15 | out_tensor = tensor.data.new(*new_size).fill_(padding_value) 16 | if batch_first: 17 | out_tensor[:, :tensor.size(1), ...] = tensor 18 | else: 19 | out_tensor[:tensor.size(0), ...] = tensor 20 | return out_tensor 21 | 22 | 23 | def pad_random_negative_tensor_given_length(positive_tensor, negative_padding_tensor, length=None): 24 | assert positive_tensor.shape[0] + negative_padding_tensor.shape[0] == length 25 | return torch.cat((positive_tensor, negative_padding_tensor), dim=0) 26 | 27 | 28 | def gather_tensors(tensor): 29 | """ 30 | Performs all_gather operation on the provided tensors. 31 | *** Warning ***: torch.distributed.all_gather has no gradient. 32 | """ 33 | if not dist.is_dist_avail_and_initialized(): 34 | return torch.stack([tensor], dim=0) 35 | 36 | total = dist.get_world_size() 37 | rank = torch.distributed.get_rank() 38 | # gathered_normalized_img_emb = [torch.zeros_like(normalized_img_emb) for _ in range(total)] 39 | # torch.distributed.all_gather(gathered_normalized_img_emb, normalized_img_emb) 40 | 41 | tensors_gather = [ 42 | torch.zeros_like(tensor) 43 | for _ in range(total) 44 | ] 45 | torch.distributed.all_gather(tensors_gather, tensor, async_op=False) 46 | 47 | # need to do this to restore propagation of the gradients 48 | tensors_gather[rank] = tensor 49 | output = torch.stack(tensors_gather, dim=0) 50 | return output 51 | 52 | 53 | def convert_to_roi_format(boxes): 54 | concat_boxes = boxes.bbox 55 | device, dtype = concat_boxes.device, concat_boxes.dtype 56 | ids = torch.full((len(boxes), 1), 0, dtype=dtype, device=device) 57 | rois = torch.cat([ids, concat_boxes], dim=1) 58 | return rois -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/mask_head/hourglass.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 4 | 5 | 6 | class Residual(nn.Module): 7 | def __init__(self, inp_dim, out_dim, use_gn=False): 8 | super(Residual, self).__init__() 9 | self.relu = nn.ReLU() 10 | # self.bn1 = nn.BatchNorm2d(inp_dim) 11 | self.conv1 = make_conv3x3(inp_dim, int(out_dim / 2), 1, use_relu=False, use_gn=use_gn) 12 | # self.bn2 = nn.BatchNorm2d(int(out_dim / 2)) 13 | self.conv2 = make_conv3x3(int(out_dim / 2), int(out_dim / 2), 3, use_relu=False, use_gn=use_gn) 14 | # self.bn3 = nn.BatchNorm2d(int(out_dim / 2)) 15 | self.conv3 = make_conv3x3(int(out_dim / 2), out_dim, 1, use_relu=False, use_gn=use_gn) 16 | if inp_dim == 
out_dim: 17 | self.need_skip = False 18 | else: 19 | self.need_skip = True 20 | self.skip_layer = make_conv3x3(inp_dim, out_dim, 1, use_relu=False, use_gn=False) 21 | 22 | def forward(self, x): 23 | if self.need_skip: 24 | residual = self.skip_layer(x) 25 | else: 26 | residual = x 27 | out = x 28 | # out = self.bn1(out) 29 | out = self.relu(out) 30 | out = self.conv1(out) 31 | # out = self.bn2(out) 32 | out = self.relu(out) 33 | out = self.conv2(out) 34 | # out = self.bn3(out) 35 | out = self.relu(out) 36 | out = self.conv3(out) 37 | out += residual 38 | return out 39 | 40 | 41 | class Hourglass(nn.Module): 42 | def __init__(self, n, f, gn=False, increase=0): 43 | super(Hourglass, self).__init__() 44 | nf = f + increase 45 | self.up1 = Residual(f, f) 46 | # Lower branch 47 | self.pool1 = nn.MaxPool2d(2, 2) 48 | self.low1 = Residual(f, nf) 49 | self.n = n 50 | # Recursive hourglass 51 | if self.n > 1: 52 | self.low2 = Hourglass(n-1, nf, gn=gn) 53 | else: 54 | self.low2 = Residual(nf, nf, gn) 55 | self.low3 = Residual(nf, f, gn) 56 | self.up2 = nn.Upsample(scale_factor=2, mode='nearest') 57 | 58 | def forward(self, x): 59 | up1 = self.up1(x) 60 | pool1 = self.pool1(x) 61 | low1 = self.low1(pool1) 62 | low2 = self.low2(low1) 63 | low3 = self.low3(low2) 64 | up2 = self.up2(low3) 65 | return up1 + up2 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.data import datasets 2 | 3 | from .coco import coco_evaluation 4 | from .voc import voc_evaluation 5 | from .vg import vg_evaluation 6 | from .box_aug import im_detect_bbox_aug 7 | from .od_to_grounding import od_to_grounding_evaluation 8 | 9 | 10 | def evaluate(dataset, predictions, output_folder, **kwargs): 11 | """evaluate dataset using different methods based on dataset type. 12 | Args: 13 | dataset: Dataset object 14 | predictions(list[BoxList]): each item in the list represents the 15 | prediction results for one image. 16 | output_folder: output folder, to save evaluation files or results. 17 | **kwargs: other args. 
18 | Returns: 19 | evaluation result 20 | """ 21 | args = dict( 22 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 23 | ) 24 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 25 | return coco_evaluation(**args) 26 | # elif isinstance(dataset, datasets.VGTSVDataset): 27 | # return vg_evaluation(**args) 28 | elif isinstance(dataset, datasets.PascalVOCDataset): 29 | return voc_evaluation(**args) 30 | elif isinstance(dataset, datasets.CocoDetectionTSV): 31 | return od_to_grounding_evaluation(**args) 32 | elif isinstance(dataset, datasets.LvisDetection): 33 | pass 34 | else: 35 | dataset_name = dataset.__class__.__name__ 36 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 37 | 38 | 39 | def evaluate_mdetr(dataset, predictions, output_folder, cfg=None, **kwargs): 40 | 41 | args = dict( 42 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 43 | ) 44 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 45 | return coco_evaluation(**args) 46 | # elif isinstance(dataset, datasets.VGTSVDataset): 47 | # return vg_evaluation(**args) 48 | elif isinstance(dataset, datasets.PascalVOCDataset): 49 | return voc_evaluation(**args) 50 | elif isinstance(dataset, datasets.CocoDetectionTSV): 51 | return od_to_grounding_evaluation(**args) 52 | elif isinstance(dataset, datasets.LvisDetection): 53 | pass 54 | else: 55 | dataset_name = dataset.__class__.__name__ 56 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 57 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from __future__ import division 3 | 4 | import torch 5 | 6 | 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | """ 17 | Arguments: 18 | tensors (tensor) 19 | image_sizes (list[tuple[int, int]]) 20 | """ 21 | self.tensors = tensors 22 | self.image_sizes = image_sizes 23 | 24 | def to(self, *args, **kwargs): 25 | cast_tensor = self.tensors.to(*args, **kwargs) 26 | return ImageList(cast_tensor, self.image_sizes) 27 | 28 | 29 | def to_image_list(tensors, size_divisible=0): 30 | """ 31 | tensors can be an ImageList, a torch.Tensor or 32 | an iterable of Tensors. It can't be a numpy array.
33 | When tensors is an iterable of Tensors, it pads 34 | the Tensors with zeros so that they have the same 35 | shape 36 | """ 37 | if isinstance(tensors, torch.Tensor) and size_divisible > 0: 38 | tensors = [tensors] 39 | 40 | if isinstance(tensors, ImageList): 41 | return tensors 42 | elif isinstance(tensors, torch.Tensor): 43 | # single tensor shape can be inferred 44 | assert tensors.dim() == 4 45 | image_sizes = [tensor.shape[-2:] for tensor in tensors] 46 | return ImageList(tensors, image_sizes) 47 | elif isinstance(tensors, (tuple, list)): 48 | max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) 49 | 50 | # TODO Ideally, just remove this and let me model handle arbitrary 51 | # input sizs 52 | if size_divisible > 0: 53 | import math 54 | 55 | stride = size_divisible 56 | max_size = list(max_size) 57 | max_size[1] = int(math.ceil(max_size[1] / stride) * stride) 58 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 59 | max_size = tuple(max_size) 60 | 61 | batch_shape = (len(tensors),) + max_size 62 | batched_imgs = tensors[0].new(*batch_shape).zero_() 63 | for img, pad_img in zip(tensors, batched_imgs): 64 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 65 | 66 | image_sizes = [im.shape[-2:] for im in tensors] 67 | 68 | return ImageList(batched_imgs, image_sizes) 69 | else: 70 | raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) 71 | -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_A_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_Swin_T_O365_GoldG.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/tools/cityscapes/instances2dict_with_polygons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Convert instances from png files to a dictionary 4 | # This files is created according to https://github.com/facebookresearch/Detectron/issues/111 5 | 6 | from __future__ import print_function, absolute_import, division 7 | import os, sys 8 | 9 | sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' , 'helpers' ) ) ) 10 | from csHelpers import * 11 | 12 | # Cityscapes imports 13 | from cityscapesscripts.evaluation.instance import * 14 | from cityscapesscripts.helpers.csHelpers import * 15 | import cv2 16 | from maskrcnn_benchmark.utils import cv2_util 17 | 18 | 19 | def instances2dict_with_polygons(imageFileList, verbose=False): 20 | imgCount = 0 21 | instanceDict = {} 22 | 23 | if not isinstance(imageFileList, list): 24 | imageFileList = [imageFileList] 25 | 26 | if verbose: 27 | print("Processing {} images...".format(len(imageFileList))) 28 | 29 | for imageFileName in imageFileList: 30 | # Load image 31 | img = Image.open(imageFileName) 32 | 33 | # Image as numpy array 34 | imgNp = np.array(img) 35 | 36 | # Initialize label categories 37 | instances = {} 38 | for label in labels: 39 | instances[label.name] = [] 40 | 41 | # Loop through all instance ids in instance image 42 | for instanceId in np.unique(imgNp): 43 | if instanceId < 1000: 44 | continue 45 | instanceObj = Instance(imgNp, instanceId) 46 | instanceObj_dict = instanceObj.toDict() 47 | 48 | #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) 49 | if id2label[instanceObj.labelID].hasInstances: 50 | mask = (imgNp == instanceId).astype(np.uint8) 51 | contour, hier = cv2_util.findContours( 52 | mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) 53 | 54 | polygons = [c.reshape(-1).tolist() for c in contour] 55 | instanceObj_dict['contours'] = polygons 56 | 57 | instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) 58 | 59 | imgKey = os.path.abspath(imageFileName) 60 | instanceDict[imgKey] = instances 61 | imgCount += 1 62 | 63 | if verbose: 64 | print("\rImages Processed: {}".format(imgCount), end=' ') 65 | sys.stdout.flush() 66 | 67 | if verbose: 68 | print("") 69 | 70 | return instanceDict 71 | 72 | def main(argv): 73 | fileList = [] 74 | if (len(argv) > 2): 75 | for arg in argv: 76 | if ("png" in arg): 77 | fileList.append(arg) 78 | instances2dict_with_polygons(fileList, True) 79 | 80 | if __name__ == "__main__": 81 | main(sys.argv[1:]) 82 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/finetune.yaml: 
-------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_tiny_model_o365_goldg.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | # TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) 60 | # TEST: ("coco_2017_val", ) 61 | TRAIN: ("refexp_train",) 62 | TEST: ("refexp_val",) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 64 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/refcoco/refcoco.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | MLM_LOSS: False 51 | 52 | USE_CHECKPOINT: True 53 | 54 | TEST: 55 | DURING_TRAINING: False 56 | IMS_PER_BATCH: 1 57 | MDETR_STYLE_AGGREGATE_CLASS_NUM: 100 58 | EVAL_TASK: grounding 59 | # use for grounding model 60 | DATASETS: 61 | TRAIN: ("refcoco_train", ) 62 | TEST: ("refcoco_val", ) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 1 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 103 | 104 | OUTPUT_DIR: OUTPUT 105 | -------------------------------------------------------------------------------- /grounding/configs/flickr/flickr.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | MLM_LOSS: False 51 | 52 | USE_CHECKPOINT: True 53 | 54 | TEST: 55 | DURING_TRAINING: False 56 | IMS_PER_BATCH: 1 57 | MDETR_STYLE_AGGREGATE_CLASS_NUM: 100 58 | EVAL_TASK: grounding 59 | # use for grounding model 60 | DATASETS: 61 | TRAIN: ("object365_dt_train", ) 62 | TEST: ("coco_2017_val", ) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 1 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 103 | 104 | OUTPUT_DIR: OUTPUT 105 | -------------------------------------------------------------------------------- /grounding/SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template <typename scalar_t> 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data_ptr<uint8_t>(); 30 | auto order = order_t.data_ptr<int64_t>(); 31 | auto x1 = x1_t.data_ptr<scalar_t>(); 32 | auto y1 = y1_t.data_ptr<scalar_t>(); 33 | auto x2 = x2_t.data_ptr<scalar_t>(); 34 | auto y2 = y2_t.data_ptr<scalar_t>(); 35 | auto areas = areas_t.data_ptr<scalar_t>(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { 72 | result = nms_cpu_kernel<scalar_t>(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/backbone/ops.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def conv7x7(in_planes, out_planes, stride=1, groups=1, dilation=1): 8 | """7x7 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes,
kernel_size=7, stride=stride, 10 | padding=3*dilation, groups=groups, bias=False, dilation=dilation) 11 | 12 | 13 | def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1): 14 | """5x5 convolution with padding""" 15 | return nn.Conv2d(in_planes, out_planes, kernel_size=5, stride=stride, 16 | padding=2*dilation, groups=groups, bias=False, dilation=dilation) 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=dilation, groups=groups, bias=False, dilation=dilation) 23 | 24 | 25 | def conv1x1(in_planes, out_planes, stride=1): 26 | """1x1 convolution""" 27 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 28 | 29 | 30 | def maxpool(**kwargs): 31 | return nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 32 | 33 | 34 | def avgpool(**kwargs): 35 | return nn.AvgPool2d(kernel_size=3, stride=2, padding=1) 36 | 37 | def dropout(prob): 38 | return nn.Dropout(prob) 39 | 40 | 41 | conv3x3sep = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 42 | conv3x3g2 = lambda i, o, s=1: conv3x3(i, o, s, groups=2) 43 | conv3x3g4 = lambda i, o, s=1: conv3x3(i, o, s, groups=4) 44 | conv3x3g8 = lambda i, o, s=1: conv3x3(i, o, s, groups=8) 45 | conv3x3dw = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 46 | 47 | conv3x3d2 = lambda i, o, s=1: conv3x3(i, o, s, dilation=2) 48 | conv3x3d3 = lambda i, o, s=1: conv3x3(i, o, s, dilation=3) 49 | conv3x3d4 = lambda i, o, s=1: conv3x3(i, o, s, dilation=4) 50 | 51 | 52 | conv5x5sep = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 53 | conv5x5g2 = lambda i, o, s=1: conv5x5(i, o, s, groups=2) 54 | conv5x5g4 = lambda i, o, s=1: conv5x5(i, o, s, groups=4) 55 | conv5x5g8 = lambda i, o, s=1: conv5x5(i, o, s, groups=8) 56 | conv5x5dw = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 57 | 58 | 59 | conv5x5d2 = lambda i, o, s=1: conv5x5(i, o, s, dilation=2) 60 | conv5x5d3 = lambda i, o, s=1: conv5x5(i, o, s, dilation=3) 61 | conv5x5d4 = lambda i, o, s=1: conv5x5(i, o, s, dilation=4) 62 | 63 | conv7x7sep = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 64 | conv7x7g2 = lambda i, o, s=1: conv7x7(i, o, s, groups=2) 65 | conv7x7g4 = lambda i, o, s=1: conv7x7(i, o, s, groups=4) 66 | conv7x7g8 = lambda i, o, s=1: conv7x7(i, o, s, groups=8) 67 | conv7x7dw = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 68 | 69 | conv7x7d2 = lambda i, o, s=1: conv7x7(i, o, s, dilation=2) 70 | conv7x7d3 = lambda i, o, s=1: conv7x7(i, o, s, dilation=3) 71 | conv7x7d4 = lambda i, o, s=1: conv7x7(i, o, s, dilation=4) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
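# Usage sketch (numbers are hypothetical, for illustration only): with
# batch_size_per_image=256 and positive_fraction=0.5,
#   sampler = BalancedPositiveNegativeSampler(256, 0.5)
#   pos_masks, neg_masks = sampler([matched_idxs_per_image])
# returns, for each image, boolean masks over the proposals that keep at most
# 128 randomly chosen positives (matched label >= 1) and, as far as availability
# allows, enough randomly chosen negatives (matched label == 0) to fill the
# 256-element minibatch; entries equal to -1 are ignored.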
2 | import torch 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | """ 9 | 10 | def __init__(self, batch_size_per_image, positive_fraction): 11 | """ 12 | Arguments: 13 | batch_size_per_image (int): number of elements to be selected per image 14 | positive_fraction (float): percentace of positive elements per batch 15 | """ 16 | self.batch_size_per_image = batch_size_per_image 17 | self.positive_fraction = positive_fraction 18 | 19 | def __call__(self, matched_idxs): 20 | """ 21 | Arguments: 22 | matched idxs: list of tensors containing -1, 0 or positive values. 23 | Each tensor corresponds to a specific image. 24 | -1 values are ignored, 0 are considered as negatives and > 0 as 25 | positives. 26 | 27 | Returns: 28 | pos_idx (list[tensor]) 29 | neg_idx (list[tensor]) 30 | 31 | Returns two lists of binary masks for each image. 32 | The first list contains the positive elements that were selected, 33 | and the second list the negative example. 34 | """ 35 | pos_idx = [] 36 | neg_idx = [] 37 | for matched_idxs_per_image in matched_idxs: 38 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 39 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 40 | 41 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 42 | # protect against not enough positive examples 43 | num_pos = min(positive.numel(), num_pos) 44 | num_neg = self.batch_size_per_image - num_pos 45 | # protect against not enough negative examples 46 | num_neg = min(negative.numel(), num_neg) 47 | 48 | # randomly select positive and negative examples 49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.bool 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.bool 61 | ) 62 | pos_idx_per_image_mask[pos_idx_per_image] = 1 63 | neg_idx_per_image_mask[neg_idx_per_image] = 1 64 | 65 | pos_idx.append(pos_idx_per_image_mask) 66 | neg_idx.append(neg_idx_per_image_mask) 67 | 68 | return pos_idx, neg_idx 69 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
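# Shape sketch for permute_and_flatten (sizes are hypothetical): a per-level box_cls
# map of shape (N, A*C, H, W), e.g. (2, 9*81, 50, 68), is viewed as (N, A, C, H, W),
# permuted to (N, H, W, A, C) and reshaped to (N, H*W*A, C), so each anchor at each
# spatial location becomes one row of class logits; concat_box_prediction_layers then
# concatenates these rows across feature levels before the final (-1, C) reshape.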
2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | 19 | def permute_and_flatten(layer, N, A, C, H, W): 20 | layer = layer.view(N, -1, C, H, W) 21 | layer = layer.permute(0, 3, 4, 1, 2) 22 | layer = layer.reshape(N, -1, C) 23 | return layer 24 | 25 | 26 | def concat_box_prediction_layers(box_regression, box_cls=None, token_logits=None): 27 | box_regression_flattened = [] 28 | box_cls_flattened = [] 29 | token_logit_flattened = [] 30 | 31 | # for each feature level, permute the outputs to make them be in the 32 | # same format as the labels. Note that the labels are computed for 33 | # all feature levels concatenated, so we keep the same representation 34 | # for the objectness and the box_regression 35 | for box_cls_per_level, box_regression_per_level in zip( 36 | box_cls, box_regression 37 | ): 38 | N, AxC, H, W = box_cls_per_level.shape 39 | Ax4 = box_regression_per_level.shape[1] 40 | A = Ax4 // 4 41 | C = AxC // A 42 | box_cls_per_level = permute_and_flatten( 43 | box_cls_per_level, N, A, C, H, W 44 | ) 45 | box_cls_flattened.append(box_cls_per_level) 46 | 47 | box_regression_per_level = permute_and_flatten( 48 | box_regression_per_level, N, A, 4, H, W 49 | ) 50 | box_regression_flattened.append(box_regression_per_level) 51 | 52 | if token_logits is not None: 53 | for token_logit_per_level in token_logits: 54 | N, AXT, H, W = token_logit_per_level.shape 55 | T = AXT // A 56 | token_logit_per_level = permute_and_flatten( 57 | token_logit_per_level, N, A, T, H, W 58 | ) 59 | token_logit_flattened.append(token_logit_per_level) 60 | 61 | # concatenate on the first dimension (representing the feature levels), to 62 | # take into account the way the labels were generated (with all feature maps 63 | # being concatenated as well) 64 | box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) 65 | box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) 66 | 67 | token_logits_stacked = None 68 | if token_logits is not None: 69 | # stacked 70 | token_logits_stacked = cat(token_logit_flattened, dim=1) 71 | 72 | return box_regression, box_cls, token_logits_stacked 73 | 74 | 75 | def round_channels(channels, divisor=8): 76 | rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor) 77 | if float(rounded_channels) < 0.9 * channels: 78 | rounded_channels += divisor 79 | return rounded_channels 80 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | from maskrcnn_benchmark.utils.comm import shared_random_seed 10 | 11 | 12 | class DistributedSampler(Sampler): 13 | """Sampler that restricts data loading to a subset of the dataset. 14 | It is especially useful in conjunction with 15 | :class:`torch.nn.parallel.DistributedDataParallel`. 
In such case, each 16 | process can pass a DistributedSampler instance as a DataLoader sampler, 17 | and load a subset of the original dataset that is exclusive to it. 18 | .. note:: 19 | Dataset is assumed to be of constant size. 20 | Arguments: 21 | dataset: Dataset used for sampling. 22 | num_replicas (optional): Number of processes participating in 23 | distributed training. 24 | rank (optional): Rank of the current process within num_replicas. 25 | """ 26 | 27 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, use_random=False): 28 | if num_replicas is None: 29 | if not dist.is_available(): 30 | raise RuntimeError("Requires distributed package to be available") 31 | num_replicas = dist.get_world_size() 32 | if rank is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | rank = dist.get_rank() 36 | self.dataset = dataset 37 | self.num_replicas = num_replicas 38 | self.rank = rank 39 | self.epoch = 0 40 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 41 | self.total_size = self.num_samples * self.num_replicas 42 | self.shuffle = shuffle 43 | self.use_random = use_random 44 | 45 | def __iter__(self): 46 | if self.shuffle: 47 | # deterministically shuffle based on epoch 48 | _seed = self.epoch 49 | if self.use_random: 50 | _seed = int(shared_random_seed()) 51 | g = torch.Generator() 52 | g.manual_seed(_seed) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class IOULoss(nn.Module): 6 | def __init__(self, loss_type="iou"): 7 | super(IOULoss, self).__init__() 8 | self.loss_type = loss_type 9 | 10 | def forward(self, pred, target, weight=None): 11 | pred_left = pred[:, 0] 12 | pred_top = pred[:, 1] 13 | pred_right = pred[:, 2] 14 | pred_bottom = pred[:, 3] 15 | 16 | target_left = target[:, 0] 17 | target_top = target[:, 1] 18 | target_right = target[:, 2] 19 | target_bottom = target[:, 3] 20 | 21 | target_area = (target_left + target_right) * \ 22 | (target_top + target_bottom) 23 | pred_area = (pred_left + pred_right) * \ 24 | (pred_top + pred_bottom) 25 | 26 | w_intersect = torch.min(pred_left, target_left) + torch.min(pred_right, target_right) 27 | g_w_intersect = torch.max(pred_left, target_left) + torch.max( 28 | pred_right, target_right) 29 | h_intersect = torch.min(pred_bottom, target_bottom) + torch.min(pred_top, target_top) 30 | g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max(pred_top, target_top) 31 | ac_uion = g_w_intersect * g_h_intersect + 1e-7 32 | area_intersect = w_intersect * h_intersect 33 | area_union = target_area + pred_area - area_intersect 34 | ious = (area_intersect + 1.0) / (area_union + 1.0) 35 | gious = 
ious - (ac_uion - area_union) / ac_uion 36 | if self.loss_type == 'iou': 37 | losses = -torch.log(ious) 38 | elif self.loss_type == 'linear_iou': 39 | losses = 1 - ious 40 | elif self.loss_type == 'giou': 41 | losses = 1 - gious 42 | else: 43 | raise NotImplementedError 44 | 45 | if weight is not None and weight.sum() > 0: 46 | return (losses * weight).sum() 47 | else: 48 | assert losses.numel() != 0 49 | return losses.sum() 50 | 51 | 52 | class IOUWHLoss(nn.Module): # used for anchor guiding 53 | def __init__(self, reduction='none'): 54 | super(IOUWHLoss, self).__init__() 55 | self.reduction = reduction 56 | 57 | def forward(self, pred, target): 58 | orig_shape = pred.shape 59 | pred = pred.view(-1, 4) 60 | target = target.view(-1, 4) 61 | target[:, :2] = 0 62 | tl = torch.max((target[:, :2] - pred[:, 2:] / 2), 63 | (target[:, :2] - target[:, 2:] / 2)) 64 | 65 | br = torch.min((target[:, :2] + pred[:, 2:] / 2), 66 | (target[:, :2] + target[:, 2:] / 2)) 67 | 68 | area_p = torch.prod(pred[:, 2:], 1) 69 | area_g = torch.prod(target[:, 2:], 1) 70 | 71 | en = (tl < br).type(tl.type()).prod(dim=1) 72 | area_i = torch.prod(br - tl, 1) * en 73 | U = area_p + area_g - area_i + 1e-16 74 | iou = area_i / U 75 | 76 | loss = 1 - iou ** 2 77 | if self.reduction == 'mean': 78 | loss = loss.mean() 79 | elif self.reduction == 'sum': 80 | loss = loss.sum() 81 | 82 | return loss 83 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from .roi_box_feature_extractors import make_roi_box_feature_extractor 6 | from .roi_box_predictors import make_roi_box_predictor 7 | from .inference import make_roi_box_post_processor 8 | from .loss import make_roi_box_loss_evaluator 9 | from maskrcnn_benchmark.utils.amp import custom_fwd, custom_bwd 10 | 11 | class ROIBoxHead(torch.nn.Module): 12 | """ 13 | Generic Box Head class. 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(ROIBoxHead, self).__init__() 18 | self.feature_extractor = make_roi_box_feature_extractor(cfg) 19 | self.predictor = make_roi_box_predictor(cfg) 20 | self.post_processor = make_roi_box_post_processor(cfg) 21 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 22 | self.onnx = cfg.MODEL.ONNX 23 | 24 | @custom_fwd(cast_inputs=torch.float32) 25 | def forward(self, features, proposals, targets=None): 26 | """ 27 | Arguments: 28 | features (list[Tensor]): feature-maps from possibly several levels 29 | proposals (list[BoxList]): proposal boxes 30 | targets (list[BoxList], optional): the ground-truth targets. 31 | 32 | Returns: 33 | x (Tensor): the result of the feature extractor 34 | proposals (list[BoxList]): during training, the subsampled proposals 35 | are returned. During testing, the predicted boxlists are returned 36 | losses (dict[Tensor]): During training, returns the losses for the 37 | head. During testing, returns an empty dict. 38 | """ 39 | 40 | if self.training: 41 | # Faster R-CNN subsamples during training the proposals with a fixed 42 | # positive / negative ratio 43 | with torch.no_grad(): 44 | proposals = self.loss_evaluator.subsample(proposals, targets) 45 | 46 | # extract features that will be fed to the final classifier. 
The 47 | # feature_extractor generally corresponds to the pooler + heads 48 | x = self.feature_extractor(features, proposals) 49 | # final classifier that converts the features into predictions 50 | class_logits, box_regression = self.predictor(x) 51 | 52 | if self.onnx: 53 | return x, (class_logits, box_regression, [box.bbox for box in proposals]), {} 54 | 55 | if not self.training: 56 | result = self.post_processor((class_logits, box_regression), proposals) 57 | return x, result, {} 58 | 59 | loss_classifier, loss_box_reg = self.loss_evaluator( 60 | [class_logits], [box_regression] 61 | ) 62 | return ( 63 | x, 64 | proposals, 65 | dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), 66 | ) 67 | 68 | 69 | def build_roi_box_head(cfg): 70 | """ 71 | Constructs a new box head. 72 | By default, uses ROIBoxHead; if that turns out not to be enough, register a new class 73 | and expose it as a config parameter. 74 | """ 75 | return ROIBoxHead(cfg) 76 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | try: 6 | from torch.hub import download_url_to_file as _download_url_to_file  # aliased so the call below works on this import path 7 | from torch.hub import urlparse 8 | from torch.hub import HASH_REGEX 9 | except ImportError: 10 | from torch.utils.model_zoo import _download_url_to_file 11 | from torch.utils.model_zoo import urlparse 12 | from torch.utils.model_zoo import HASH_REGEX 13 | 14 | from maskrcnn_benchmark.utils.comm import is_main_process 15 | from maskrcnn_benchmark.utils.comm import synchronize 16 | 17 | 18 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 19 | # but with a few improvements and modifications 20 | def cache_url(url, model_dir='model', progress=True): 21 | r"""Downloads the Torch serialized object at the given URL and returns the 22 | path to the cached file. If the object is already present in `model_dir`, the 23 | cached path is returned without re-downloading. The filename part of the URL should follow the naming convention 24 | ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more 25 | digits of the SHA256 hash of the contents of the file. The hash is used to 26 | ensure unique names and to verify the contents of the file. 27 | If `model_dir` is ``None``, it falls back to ``$TORCH_HOME/models`` where 28 | ``$TORCH_HOME`` defaults to ``~/.torch``; that fallback directory can be 29 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
30 | Args: 31 | url (string): URL of the object to download 32 | model_dir (string, optional): directory in which to save the object 33 | progress (bool, optional): whether or not to display a progress bar to stderr 34 | Example: 35 | >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') 36 | """ 37 | if model_dir is None: 38 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) 39 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) 40 | if not os.path.exists(model_dir): 41 | os.makedirs(model_dir, exist_ok=True) 42 | parts = urlparse(url) 43 | filename = os.path.basename(parts.path) 44 | if filename == "model_final.pkl": 45 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename 46 | # so make the full path the filename by replacing / with _ 47 | filename = parts.path.replace("/", "_") 48 | cached_file = os.path.join(model_dir, filename) 49 | if not os.path.exists(cached_file): 50 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 51 | hash_prefix = HASH_REGEX.search(filename) 52 | if hash_prefix is not None: 53 | hash_prefix = hash_prefix.group(1) 54 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, 55 | # which matches the hash PyTorch uses. So we skip the hash matching 56 | # if the hash_prefix is less than 6 characters 57 | if len(hash_prefix) < 6: 58 | hash_prefix = None 59 | _download_url_to_file(url, cached_file, hash_prefix, progress=progress) 60 | synchronize() 61 | return cached_file 62 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
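# --- Usage sketch (illustrative only) ------------------------------------------
# A minimal example of driving the ROIAlign wrapper defined below, assuming either
# torchvision's roi_align or the compiled _C extension is importable. Boxes are
# passed as (batch_index, x1, y1, x2, y2) in input-image coordinates, and
# spatial_scale maps them onto the feature map (here a stride-8 level).
def _demo_roi_align():
    import torch
    feats = torch.randn(2, 256, 50, 50)            # (N, C, H, W) feature map
    rois = torch.tensor([
        [0.0, 10.0, 10.0, 90.0, 90.0],             # a box on image 0
        [1.0, 40.0, 20.0, 120.0, 160.0],           # a box on image 1
    ])
    pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 8, sampling_ratio=2)
    return pooler(feats, rois)                     # -> tensor of shape (2, 256, 7, 7)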
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | class _ROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ) 43 | return grad_input, None, None, None, None 44 | 45 | try: 46 | import torchvision 47 | from torchvision.ops import roi_align 48 | except: 49 | roi_align = _ROIAlign.apply 50 | 51 | class ROIAlign(nn.Module): 52 | def __init__(self, output_size, spatial_scale, sampling_ratio): 53 | super(ROIAlign, self).__init__() 54 | self.output_size = output_size 55 | self.spatial_scale = spatial_scale 56 | self.sampling_ratio = sampling_ratio 57 | 58 | def forward(self, input, rois): 59 | return roi_align( 60 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 61 | ) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + "(" 65 | tmpstr += "output_size=" + str(self.output_size) 66 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 67 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 68 | tmpstr += ")" 69 | return tmpstr 70 | 71 | class ROIAlignV2(nn.Module): 72 | def __init__(self, output_size, spatial_scale, sampling_ratio): 73 | super(ROIAlignV2, self).__init__() 74 | self.output_size = output_size 75 | self.spatial_scale = spatial_scale 76 | self.sampling_ratio = sampling_ratio 77 | 78 | def forward(self, input, rois): 79 | return torchvision.ops.roi_align( 80 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, aligned=True 81 | ) 82 | 83 | def __repr__(self): 84 | tmpstr = self.__class__.__name__ + "(" 85 | tmpstr += "output_size=" + str(self.output_size) 86 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 87 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 88 | tmpstr += ")" 89 | return tmpstr 90 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | 
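  # Note: DYHEAD below configures the dynamic detection head behind the
  # "VLDYHEAD" RPN_ARCHITECTURE selected above; these values presumably match
  # the glip_a_tiny_o365.pth checkpoint referenced in MODEL.WEIGHT.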
DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_tt.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | 
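      # Note: of the fusion losses configured here, only USE_DOT_PRODUCT_TOKEN_LOSS
      # is switched on in these refcoco configs; the classification, token and
      # contrastive-align losses all stay disabled.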
USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 16 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | 
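      # Note: the CLAMP_* switches bound the fused attention and dot-product
      # logits, presumably to keep training numerically stable under the mixed
      # precision enabled by SOLVER.USE_AMP.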
CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: False 116 | TEXTUAL_PROMPT: False 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: False 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: 
(0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | # USE_AUTOSTEP: True 101 | USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: False 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 16 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: False 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer_task.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: False 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task_layer_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_test.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_testA", "refexp_testB",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | --------------------------------------------------------------------------------