├── retrieval ├── loss │ └── __init__.py ├── models │ ├── __init__.py │ ├── prompts │ │ └── __init__.py │ └── clip │ │ ├── __init__.py │ │ └── bpe_simple_vocab_16e6.txt.gz ├── utils │ ├── __init__.py │ └── factory.py ├── methods │ └── __init__.py ├── .gitignore ├── shell │ └── run.sh ├── util │ └── __init__.py ├── configs │ ├── domainnet_slip.json │ ├── core50_slip.json │ ├── cddb_sip.json │ ├── cddb_slip.json │ ├── lpi │ │ ├── coco_l2p.json │ │ ├── coco_clip.json │ │ ├── coco_lpi.json │ │ └── coco_sprompts.json │ └── coco_slip.json ├── LICENSE └── main.py ├── grounding ├── DATASET ├── tools │ ├── utils │ │ ├── __init__.py │ │ └── colors.py │ └── cityscapes │ │ └── instances2dict_with_polygons.py ├── shell │ ├── s1.py │ ├── s2.py │ ├── s3.py │ ├── base.sh │ ├── lpim.sh │ ├── refcoco+.sh │ ├── l2p.sh │ ├── maple.sh │ ├── lpip.sh │ ├── vis.sh │ ├── depth.sh │ ├── prompt_lora.sh │ ├── sprompt.sh │ └── cmd.sh ├── MODEL ├── maskrcnn_benchmark │ ├── utils │ │ ├── __init__.py │ │ ├── README.md │ │ ├── collect_env.py │ │ ├── miscellaneous.py │ │ ├── amp.py │ │ ├── cv2_util.py │ │ ├── imports.py │ │ ├── logger.py │ │ ├── env.py │ │ ├── registry.py │ │ ├── ema.py │ │ ├── pretrain_model_loading.py │ │ ├── shallow_contrastive_loss_helper.py │ │ └── model_zoo.py │ ├── modeling │ │ ├── __init__.py │ │ ├── bert │ │ │ └── __init__.py │ │ ├── prompt │ │ │ └── __init__.py │ │ ├── roi_heads │ │ │ ├── box_head │ │ │ │ ├── __init__.py │ │ │ │ ├── roi_box_predictors.py │ │ │ │ └── box_head.py │ │ │ ├── mask_head │ │ │ │ ├── __init__.py │ │ │ │ └── hourglass.py │ │ │ └── keypoint_head │ │ │ │ ├── roi_keypoint_predictors.py │ │ │ │ └── keypoint_head.py │ │ ├── language_backbone │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── __init__.py │ │ │ ├── test_clip_tokenizer.py │ │ │ ├── build.py │ │ │ └── backbone.py │ │ ├── registry.py │ │ ├── detector │ │ │ └── __init__.py │ │ ├── rpn │ │ │ ├── __init__.py │ │ │ └── transformer.py │ │ ├── backbone │ │ │ ├── mixer.py │ │ │ └── ops.py │ │ ├── balanced_positive_negative_sampler.py │ │ └── utils.py │ ├── structures │ │ ├── __init__.py │ │ └── image_list.py │ ├── data │ │ ├── datasets │ │ │ ├── evaluation │ │ │ │ ├── od_eval.py │ │ │ │ ├── flickr │ │ │ │ │ └── __init__.py │ │ │ │ ├── lvis │ │ │ │ │ └── _change_lvis_annotation.py │ │ │ │ ├── vg │ │ │ │ │ └── __init__.py │ │ │ │ ├── voc │ │ │ │ │ └── __init__.py │ │ │ │ ├── coco │ │ │ │ │ └── __init__.py │ │ │ │ ├── od_to_grounding │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── flickr.py │ │ │ ├── object365.py │ │ │ ├── phrasecut.py │ │ │ ├── concat_dataset.py │ │ │ ├── duplicate_dataset.py │ │ │ ├── __init__.py │ │ │ ├── list_dataset.py │ │ │ ├── background.py │ │ │ └── imagenet.py │ │ ├── __init__.py │ │ ├── transforms │ │ │ ├── __init__.py │ │ │ └── build.py │ │ └── samplers │ │ │ ├── __init__.py │ │ │ ├── iteration_based_batch_sampler.py │ │ │ └── distributed.py │ ├── __init__.py │ ├── engine │ │ └── __init__.py │ ├── config │ │ └── __init__.py │ ├── solver │ │ └── __init__.py │ ├── layers │ │ ├── nms.py │ │ ├── smooth_l1_loss.py │ │ ├── evonorm.py │ │ ├── __init__.py │ │ ├── se.py │ │ ├── roi_pool.py │ │ ├── iou_loss.py │ │ └── roi_align.py │ └── csrc │ │ ├── ml_nms.h │ │ ├── cpu │ │ ├── vision.h │ │ └── nms_cpu.cpp │ │ ├── SigmoidFocalLoss.h │ │ ├── nms.h │ │ ├── vision.cpp │ │ ├── ROIPool.h │ │ ├── ROIAlign.h │ │ └── deform_pool.h ├── bert-base-uncased ├── docs │ ├── lead.png │ ├── word_cloud_od.png │ └── benchmark_example_od.png ├── MID │ ├── task_visual.png │ └── task_visual_2.png ├── configs │ ├── 
pretrain │ │ ├── _coco.yaml │ │ ├── glip_Swin_T_O365.yaml │ │ ├── glip_A_Swin_T_O365.yaml │ │ └── glip_Swin_T_O365_GoldG.yaml │ ├── flickr │ │ ├── test.yaml │ │ ├── val.yaml │ │ └── flickr.yaml │ ├── refcoco │ │ ├── val.yaml │ │ ├── script.txt │ │ ├── finetune.yaml │ │ ├── refcoco.yaml │ │ └── org │ │ │ ├── finetune_A.yaml │ │ │ ├── finetune_A_tt.yaml │ │ │ ├── finetune_A_base.yaml │ │ │ ├── finetune_A_decompose_layer.yaml │ │ │ ├── finetune_A_decompose_task.yaml │ │ │ ├── finetune_A_decompose_interact.yaml │ │ │ ├── finetune_A_decompose_layer_task.yaml │ │ │ ├── finetune_A_decompose_task_interact.yaml │ │ │ ├── finetune_A_decompose_layer_interact.yaml │ │ │ ├── finetune_A_decompose_task_layer_interact.yaml │ │ │ └── finetune_A_test.yaml │ ├── lvis │ │ ├── val.yaml │ │ └── minival.yaml │ └── odinw_35 │ │ └── _all.json ├── test │ ├── task_div.py │ ├── tt.py │ ├── vis_2.py │ ├── task_sim_matrix.py │ ├── colors.py │ ├── task.vis.py │ └── task_visual.py ├── webui │ └── flagged │ │ ├── Image │ │ └── d60a16e47c3a575b719d │ │ │ └── download.jpg │ │ ├── Bounding box │ │ └── 46774ff9412648535a6f │ │ │ └── image.png │ │ └── log.csv ├── CODE_OF_CONDUCT.md ├── test.py ├── requirements.txt ├── .gitignore ├── cmd │ └── cmd.txt ├── LICENSE ├── SUPPORT.md ├── odinw │ └── download.py ├── matrix │ └── matrix.py ├── setup.py └── SECURITY.md ├── assets └── webui.png ├── script ├── retrieval │ └── lpi.sh └── grounding │ └── lpi_p.sh └── .gitignore /retrieval/loss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/DATASET: -------------------------------------------------------------------------------- 1 | /root/autodl-tmp -------------------------------------------------------------------------------- /grounding/tools/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/methods/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/shell/s1.py: -------------------------------------------------------------------------------- 1 | print('s1 test') -------------------------------------------------------------------------------- /grounding/shell/s2.py: -------------------------------------------------------------------------------- 1 | print('s2 test') -------------------------------------------------------------------------------- /grounding/shell/s3.py: -------------------------------------------------------------------------------- 1 | print('s3 test') -------------------------------------------------------------------------------- /retrieval/models/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/MODEL: 
-------------------------------------------------------------------------------- 1 | /home1/yanweicai/MODEL/glip -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /grounding/bert-base-uncased: -------------------------------------------------------------------------------- 1 | /home1/yanweicai/MODEL/bert-base-uncased/ -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/od_eval.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /retrieval/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea/ 3 | logs/ 4 | logss/ 5 | res/ -------------------------------------------------------------------------------- /assets/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/assets/webui.png -------------------------------------------------------------------------------- /grounding/docs/lead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/lead.png -------------------------------------------------------------------------------- /grounding/MID/task_visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/MID/task_visual.png -------------------------------------------------------------------------------- /grounding/MID/task_visual_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/MID/task_visual_2.png -------------------------------------------------------------------------------- /grounding/configs/pretrain/_coco.yaml: -------------------------------------------------------------------------------- 1 | DATASETS: 2 | TRAIN: ("coco_2017_train",) 3 | TEST: ("coco_2017_val", ) -------------------------------------------------------------------------------- /grounding/docs/word_cloud_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/word_cloud_od.png -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/flickr/__init__.py: -------------------------------------------------------------------------------- 1 | from .flickr_eval import FlickrEvaluator 2 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /grounding/docs/benchmark_example_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/docs/benchmark_example_od.png -------------------------------------------------------------------------------- /script/retrieval/lpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd /home1/yanweicai/workspace/prompt/lpi/retrieval 3 | python main.py --config ./configs/lpi/coco_lpi.json -------------------------------------------------------------------------------- /retrieval/models/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/retrieval/models/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from .build import make_data_loader 3 | -------------------------------------------------------------------------------- /grounding/test/task_div.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | mat = np.loadtxt('../MID/task_sim_matrix.txt') 4 | threshold = 0.4 5 | mat = (mat>threshold).astype(int) 6 | print(mat) 7 | -------------------------------------------------------------------------------- /grounding/webui/flagged/Image/d60a16e47c3a575b719d/download.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/webui/flagged/Image/d60a16e47c3a575b719d/download.jpg -------------------------------------------------------------------------------- /grounding/webui/flagged/Bounding box/46774ff9412648535a6f/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/webui/flagged/Bounding box/46774ff9412648535a6f/image.png -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .defaults import _C as cfg 3 | from .paths_catalog import try_to_find -------------------------------------------------------------------------------- /retrieval/shell/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/workspace0401/S-Prompts-lpi 3 | python main.py > ./logss/hip.txt 4 | cd /root/workspace0401/S-Prompts-sprompt 5 | python main.py > ./logss/sprompt.txt -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelvin-ywc/LPI/HEAD/grounding/maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /grounding/webui/flagged/log.csv: -------------------------------------------------------------------------------- 1 | Caption,Image,Bounding box,flag,username,timestamp 2 | ,flagged/Image/d60a16e47c3a575b719d/download.jpg,flagged/Bounding box/46774ff9412648535a6f/image.png,,,2024-04-16 15:26:28.521612 3 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contains utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system.
6 | -------------------------------------------------------------------------------- /retrieval/utils/factory.py: -------------------------------------------------------------------------------- 1 | from methods.sprompt import SPrompts 2 | 3 | def get_model(model_name, args): 4 | name = model_name.lower() 5 | options = {'sprompts': SPrompts, 6 | } 7 | return options[name](args) 8 | 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .build import make_optimizer 3 | from .build import make_lr_scheduler 4 | from .lr_scheduler import WarmupMultiStepLR 5 | -------------------------------------------------------------------------------- /script/grounding/lpi_p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /home1/yanweicai/workspace/prompt/lpi/grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class FlickrDataset(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/object365.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.coco_dt import CocoDetectionTSV 5 | 6 | 7 | class Object365DetectionTSV(CocoDetectionTSV): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/phrasecut.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class PhrasecutDetection(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /grounding/test/tt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | def cal_dif( v1, v2): 4 | return np.sum((v1 - v2) ** 2) 5 | 6 | v1 = [1,2,3] 7 | v2 = [4,5,6] 8 | v1 = torch.tensor(v1) 9 | v2 = torch.tensor(v2) 10 | # print(cal_dif(v1, v2)) 11 | print(torch.sum((v1-v2)**2)) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone as build_language_backbone 2 | from .build import build_tokenizer 3 | 4 | from .hfpt_tokenizer import HFPTTokenizer 5 | from .simple_tokenizer import SimpleTokenizer 6 | from .clip_model import CLIPTransformer 7 | 
-------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from maskrcnn_benchmark.utils.registry import Registry 4 | 5 | BACKBONES = Registry() 6 | 7 | LANGUAGE_BACKBONES = Registry() 8 | 9 | ROI_BOX_FEATURE_EXTRACTORS = Registry() 10 | RPN_HEADS = Registry() 11 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .transforms import Compose 3 | from .transforms import Resize 4 | from .transforms import RandomHorizontalFlip 5 | from .transforms import ToTensor 6 | from .transforms import Normalize 7 | 8 | from .build import build_transforms 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .distributed import DistributedSampler 3 | from .grouped_batch_sampler import GroupedBatchSampler 4 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 5 | 6 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 7 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/test_clip_tokenizer.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.modeling.language_backbone import build_tokenizer 2 | 3 | if __name__ == '__main__': 4 | 5 | tokenizer2 = build_tokenizer("clip") 6 | tokenized2 = tokenizer2( 7 | ["Detectest : fishid. jellyfishioasod. penguinasd. puffin.asd shark. starfish. round stingray"]) 8 | print(tokenized2) 9 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from maskrcnn_benchmark import _C 3 | 4 | try: 5 | import torchvision 6 | from torchvision.ops import nms 7 | except: 8 | nms = _C.nms 9 | 10 | ml_nms = _C.ml_nms 11 | soft_nms = _C.soft_nms 12 | 13 | # nms.__doc__ = """ 14 | # This function performs Non-maximum suppresion""" 15 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/lvis/_change_lvis_annotation.py: -------------------------------------------------------------------------------- 1 | path = "DATASET/coco/annotations/lvis_v1_minival.json" 2 | import json 3 | with open(path) as f: 4 | all = json.load(f) 5 | 6 | for i in all["images"]: 7 | i["file_name"] = "/".join(i["coco_url"].split("/")[-2:]) 8 | 9 | with open("DATASET/coco/annotations/lvis_v1_minival_inserted_image_name.json", "w") as f: 10 | json.dump(all, f) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import PIL 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def get_pil_version(): 8 | return "\n Pillow ({})".format(PIL.__version__) 9 | 10 | 11 | def collect_env_info(): 12 | env_str = get_pretty_env_info() 13 | env_str += get_pil_version() 14 | return env_str 15 | -------------------------------------------------------------------------------- /grounding/shell/base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_base.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_base.txt 6 | 7 | python tools/finetune.py --config-file configs/refcocog/finetune_A_base.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_base.txt -------------------------------------------------------------------------------- /grounding/shell/lpim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_layer_task.txt 6 | 7 | python tools/finetune.py --config-file configs/refcocog/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_layer_task.txt -------------------------------------------------------------------------------- /grounding/shell/refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco+_interact_layer_task.txt 6 | 7 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco+_layer_task.txt -------------------------------------------------------------------------------- /grounding/CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .generalized_rcnn import GeneralizedRCNN 2 | from .generalized_vl_rcnn import GeneralizedVLRCNN 3 | 4 | _DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN, 5 | "GeneralizedVLRCNN": GeneralizedVLRCNN 6 | } 7 | 8 | 9 | def build_detection_model(cfg): 10 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] 11 | return meta_arch(cfg) 12 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import errno 3 | import os 4 | from .comm import is_main_process 5 | 6 | def mkdir(path): 7 | try: 8 | os.makedirs(path) 9 | except OSError as e: 10 | if e.errno != errno.EEXIST: 11 | raise 12 | 13 | 14 | def save_config(cfg, path): 15 | if is_main_process(): 16 | with open(path, 'w') as f: 17 | f.write(cfg.dump()) 18 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/amp.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | @contextmanager 4 | def nullcontext(enter_result=None, **kwargs): 5 | yield enter_result 6 | 7 | try: 8 | from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd 9 | except: 10 | print('[Warning] Library for automatic mixed precision is not found, AMP is disabled!!') 11 | GradScaler = nullcontext 12 | autocast = nullcontext 13 | custom_fwd = nullcontext 14 | custom_bwd = nullcontext -------------------------------------------------------------------------------- /retrieval/util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /grounding/test.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | def run1(event): 4 | print("Starting thread 1") 5 | for i in range(10): 6 | print('a' + str(i)) 7 | event.wait() 8 | 9 | def run2(event): 10 | print("Starting thread 2") 11 | for i in range(10): 12 | print('b' + str(i)) 13 | event.wait() 14 | 15 | event = threading.Event() 16 | t1 = threading.Thread(target=run1, args=(event,)) 17 | t2 = threading.Thread(target=run2, args=(event,)) 18 | 19 | t1.start() 20 | t2.start() 21 | 22 | print("Run finished") 23 | t1.join() 24 | t2.join() 25 | 26 | print("End") -------------------------------------------------------------------------------- /grounding/configs/flickr/test.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_test", ) 12 | TEST: ("flickr30k_test", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | # TODO maybe push this to nn? 6 | def smooth_l1_loss(input, target, beta=1.
/ 9, size_average=True): 7 | """ 8 | very similar to the smooth_l1_loss from pytorch, but with 9 | the extra beta parameter 10 | """ 11 | n = torch.abs(input - target) 12 | cond = n < beta 13 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 14 | if size_average: 15 | return loss.mean() 16 | return loss.sum() 17 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/vg/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .vg_eval import do_vg_evaluation 4 | 5 | 6 | def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes=False, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | logger.info("performing vg evaluation, ignored iou_types.") 9 | return do_vg_evaluation( 10 | dataset=dataset, 11 | predictions=predictions, 12 | output_folder=output_folder, 13 | box_only=box_only, 14 | eval_attributes=eval_attributes, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .voc_eval import do_voc_evaluation 4 | 5 | 6 | def voc_evaluation(dataset, predictions, output_folder, box_only, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | if box_only: 9 | logger.warning("voc evaluation doesn't support box_only, ignored.") 10 | logger.info("performing voc evaluation, ignored iou_types.") 11 | return do_voc_evaluation( 12 | dataset=dataset, 13 | predictions=predictions, 14 | output_folder=output_folder, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /grounding/shell/l2p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | #python tools/finetune.py --config-file configs/maple/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco.txt 6 | 7 | python tools/finetune.py --config-file configs/l2p/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/l2p_refcoco+.txt 8 | 9 | python tools/finetune.py --config-file configs/l2p/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/l2p_refcocog.txt 10 | -------------------------------------------------------------------------------- /grounding/shell/maple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | #python tools/finetune.py --config-file configs/maple/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco.txt 6 | 7 | python tools/finetune.py --config-file configs/maple/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcoco+.txt 8 | 9 | python tools/finetune.py --config-file configs/maple/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/maple_refcocog.txt 10 | -------------------------------------------------------------------------------- 
/grounding/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_eval import do_coco_evaluation 2 | 3 | 4 | def coco_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | expected_results_sigma_tol=4, 12 | ): 13 | return do_coco_evaluation( 14 | dataset=dataset, 15 | predictions=predictions, 16 | box_only=box_only, 17 | output_folder=output_folder, 18 | iou_types=iou_types, 19 | expected_results=expected_results, 20 | expected_results_sigma_tol=expected_results_sigma_tol, 21 | ) 22 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("refexp_val", ) 12 | TEST: ("refexp_val", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False 23 | SOLVER: 24 | WARMUP_ITERS: 0 25 | MAX_EPOCH: 12 26 | CHECKPOINT_PERIOD: 100 27 | TEST: 28 | IMS_PER_BATCH: 1 -------------------------------------------------------------------------------- /grounding/configs/flickr/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_val", ) 12 | TEST: ("flickr30k_val", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False 23 | SOLVER: 24 | WARMUP_ITERS: 0 25 | MAX_EPOCH: 12 26 | CHECKPOINT_PERIOD: 100 27 | TEST: 28 | IMS_PER_BATCH: 8 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/build.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer 2 | 3 | 4 | def build_tokenizer(tokenizer_name): 5 | tokenizer = None 6 | if tokenizer_name == 'clip': 7 | tokenizer = SimpleTokenizer() 8 | elif 'hf_' in tokenizer_name: 9 | from .hfpt_tokenizer import HFPTTokenizer 10 | 11 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 12 | elif 'hfc_' in tokenizer_name: 13 | from .hfpt_tokenizer import HFPTTokenizer 14 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 15 | else: 16 | raise ValueError('Unknown tokenizer') 17 | 18 | return tokenizer 19 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/od_to_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .od_eval import do_od_evaluation 2 | 3 | 4 | def od_to_grounding_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | 
expected_results_sigma_tol=4, ): 12 | return do_od_evaluation( 13 | dataset=dataset, 14 | predictions=predictions, 15 | box_only=box_only, 16 | output_folder=output_folder, 17 | iou_types=iou_types, 18 | expected_results=expected_results, 19 | expected_results_sigma_tol=expected_results_sigma_tol, 20 | ) 21 | -------------------------------------------------------------------------------- /grounding/requirements.txt: -------------------------------------------------------------------------------- 1 | cityscapesscripts==2.2.2 2 | einops==0.7.0 3 | ftfy==6.2.0 4 | gradio==4.26.0 5 | h5py==3.11.0 6 | inflect==7.2.0 7 | matplotlib==3.5.2 8 | nltk==3.8.1 9 | numpy==1.22.4 10 | openai==1.19.0 11 | opencv_python==4.9.0.80 12 | pandas==2.0.3 13 | Pillow==9.1.1 14 | Pillow==10.3.0 15 | prettytable==3.10.0 16 | pycocotools==2.0.7 17 | PyYAML==6.0.1 18 | PyYAML==6.0.1 19 | qd==0.8.9 20 | regex==2023.12.25 21 | Requests==2.31.0 22 | scikit_learn==1.3.2 23 | scipy==1.13.0 24 | setuptools==52.0.0.post20210125 25 | tensorboardX==2.6.2.2 26 | tensorboardX==2.6.2.2 27 | tensorflow==2.16.1 28 | timm==0.9.16 29 | torch==1.11.0+cu113 30 | torchvision==0.12.0+cu113 31 | tqdm==4.61.2 32 | transformers==4.38.2 33 | yacs==0.1.8 34 | -------------------------------------------------------------------------------- /grounding/shell/lpip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | torchrun --nnodes=1 --nproc_per_node=4 tools/finetune.py --config-file configs/refcoco/val/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/lpip_v1.txt 6 | 7 | python tools/finetune.py --config-file configs/refcoco+/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcoco+_layer_task.txt 8 | 9 | python tools/finetune.py --config-file configs/refcocog/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/refcocog_layer_task.txt -------------------------------------------------------------------------------- /grounding/shell/vis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_vis_refcoco.txt 6 | 7 | mv visualize visualize_sprompts 8 | mkdir visualize 9 | cd visualize 10 | mkdir 0 11 | mkdir 1 12 | mkdir 2 13 | mkdir 3 14 | mkdir 4 15 | mkdir 5 16 | mkdir 6 17 | mkdir 7 18 | mkdir 8 19 | mkdir 9 20 | mkdir 10 21 | mkdir 11 22 | cd ..
23 | 24 | python tools/finetune.py --config-file configs/refcoco/val/finetune_A_decompose_interact_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/lpi_vis_refcoco.txt 25 | -------------------------------------------------------------------------------- /grounding/test/vis_2.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | api_key = "sk-MazpnWiEWQhrgtP8526a79F8D7254a5894296d2d81Ea6c7a" 3 | api_base = "https://oneapi.xty.app/v1" 4 | client = OpenAI( 5 | api_key=api_key, 6 | base_url=api_base 7 | ) 8 | model = "text-embedding-3-large" 9 | 10 | 11 | def get_embedding(text, model): 12 | text = text.replace("\n", " ") 13 | embeddings = client.embeddings 14 | creation = embeddings.create(input =[text], model=model) 15 | 16 | return creation.data[0].embedding 17 | 18 | 19 | task_names =['appliance', 'sports', 'outdoor','electronic', 'accessory', 'indoor','kitchen', 'furniture', 'vehicle','food', 'animal', 'person'] 20 | 21 | task_senmantic_embedding =[get_embedding(task_name, model) for task_name in task_names] -------------------------------------------------------------------------------- /grounding/shell/depth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | 5 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d8.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d8.txt 6 | 7 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d10.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d10.txt 8 | 9 | python tools/finetune.py --config-file configs/ablation/prompt_depth/finetune_A_decompose_interact_layer_task_d12.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./log_new/refcoco_interact_layer_task_d12.txt -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/cv2_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for cv2 utility functions and maintaining version compatibility 3 | between 3.x and 4.x 4 | """ 5 | import cv2 6 | 7 | 8 | def findContours(*args, **kwargs): 9 | """ 10 | Wraps cv2.findContours to maintain compatiblity between versions 11 | 3 and 4 12 | 13 | Returns: 14 | contours, hierarchy 15 | """ 16 | if cv2.__version__.startswith('4'): 17 | contours, hierarchy = cv2.findContours(*args, **kwargs) 18 | elif cv2.__version__.startswith('3'): 19 | _, contours, hierarchy = cv2.findContours(*args, **kwargs) 20 | else: 21 | raise AssertionError( 22 | 'cv2 must be either version 3 or 4 to call this method') 23 | 24 | return contours, hierarchy 25 | -------------------------------------------------------------------------------- /grounding/shell/prompt_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r1.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r1.txt 5 | 6 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r2.yaml --skip-test 
--custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r2.txt 7 | 8 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r8.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r8.txt 9 | 10 | python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r16.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r16.txt -------------------------------------------------------------------------------- /grounding/shell/sprompt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | #python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r1.txt 5 | 6 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose_refcoco+.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_refcoco+.txt 7 | 8 | python tools/finetune.py --config-file configs/sprompt/finetune_A_decompose_refcocog.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/sprompt_refcocog.txt 9 | 10 | #python tools/finetune.py --config-file configs/ablation/prompt/finetune_A_r16.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 > ./logs/prompt_finetune_A_r16.txt -------------------------------------------------------------------------------- /grounding/configs/lvis/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 | lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_val",) 19 | TEST: ("lvis_evaluation_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /grounding/configs/lvis/minival.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 | lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_mini_val",) 19 | TEST: ("lvis_evaluation_mini_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /retrieval/configs/domainnet_slip.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "domainnet", 4 | "data_path": "/home/wangyabin/workspace/datasets/domainnet", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 6, 16 | "device": ["2","3"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 30, 20 | "init_lr" : 0.01, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 30, 24 | "lrate" : 0.01, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/test/task_sim_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # task_vector = np.loadtxt('../MID/tasks_array.txt') 4 | # print(task_vector) 5 | 6 | # 12 vectors of dimension 1024 7 | vectors = np.loadtxt('../MID/tasks_array.txt') # fill in the actual vector values 8 | 9 | # initialize a 12x12 matrix to store the similarities 10 | cosine_similarity_matrix = np.zeros((12, 12)) 11 | 12 | # compute cosine similarity 13 | for i in range(12): 14 | for j in range(12): 15 | # embedding1 = vectors[i] 16 | # embedding2 = vectors[j] 17 | # cosine_similarity_matrix[i, j] = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) 18 | cosine_similarity_matrix[i, j] = np.dot(vectors[i], vectors[j]) / (np.linalg.norm(vectors[i]) * np.linalg.norm(vectors[j])) 19 | 20 | # print the cosine similarity matrix 21 | print(cosine_similarity_matrix) 22 | 23 | np.savetxt('../MID/task_sim_matrix.txt', cosine_similarity_matrix) -------------------------------------------------------------------------------- /grounding/test/colors.py: -------------------------------------------------------------------------------- 1 | import colorsys 2 | import random 3 | 4 | 5 | def get_n_hls_colors(num): 6 | hls_colors = [] 7 | i = 0 8 | step = 360.0 / num 9 | while i < 360: 10 | h = i 11 | s = 90 + random.random() * 10 12 | l = 50 + random.random() * 10 13 | _hlsc = [h / 360.0, l / 100.0, s / 100.0] 14 | hls_colors.append(_hlsc) 15 | i += step 16 | 17 | return hls_colors 18 | 19 | 20 | def ncolors(num): 21 | rgb_colors = [] 22 | if num < 1: 23 | return rgb_colors 24 | hls_colors = get_n_hls_colors(num) 25 | for hlsc in hls_colors: 26 | _r, _g, _b = colorsys.hls_to_rgb(hlsc[0], hlsc[1], hlsc[2]) 27 | r, g, b = [int(x * 255.0) for x in (_r, _g, _b)] 28 | rgb_colors.append([r, g, b]) 29 | 30 | 31 | return rgb_colors 32 | 33 | 34 | print(ncolors(10)) -------------------------------------------------------------------------------- /retrieval/configs/core50_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "core50", 4 | "data_path": "/home/wangyabin/workspace/datasets/core50/data/core50_128x128", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 50, 10 | "increment": 50, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 8, 16 | "device": ["0","1"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 20, 20 | "init_lr" : 0.01, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 20, 24 |
"lrate" : 0.01, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/tools/utils/colors.py: -------------------------------------------------------------------------------- 1 | import colorsys 2 | import random 3 | 4 | 5 | def get_n_hls_colors(num): 6 | hls_colors = [] 7 | i = 0 8 | step = 360.0 / num 9 | while i < 360: 10 | h = i 11 | s = 90 + random.random() * 10 12 | l = 50 + random.random() * 10 13 | _hlsc = [h / 360.0, l / 100.0, s / 100.0] 14 | hls_colors.append(_hlsc) 15 | i += step 16 | 17 | return hls_colors 18 | 19 | 20 | def ncolors(num): 21 | rgb_colors = [] 22 | if num < 1: 23 | return rgb_colors 24 | hls_colors = get_n_hls_colors(num) 25 | for hlsc in hls_colors: 26 | _r, _g, _b = colorsys.hls_to_rgb(hlsc[0], hlsc[1], hlsc[2]) 27 | r, g, b = [int(x * 255.0) for x in (_r, _g, _b)] 28 | rgb_colors.append([r, g, b]) 29 | 30 | 31 | return rgb_colors 32 | 33 | 34 | # print(ncolors(10)) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import bisect 3 | 4 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 5 | 6 | 7 | class ConcatDataset(_ConcatDataset): 8 | """ 9 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra 10 | method for querying the sizes of the image 11 | """ 12 | 13 | def get_idxs(self, idx): 14 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 15 | if dataset_idx == 0: 16 | sample_idx = idx 17 | else: 18 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 19 | return dataset_idx, sample_idx 20 | 21 | def get_img_info(self, idx): 22 | dataset_idx, sample_idx = self.get_idxs(idx) 23 | return self.datasets[dataset_idx].get_img_info(sample_idx) 24 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ml_nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor ml_nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const at::Tensor& labels, 13 | const float threshold) { 14 | 15 | if (dets.device().is_cuda()) { 16 | #ifdef WITH_CUDA 17 | // TODO raise error if not compiled with CUDA 18 | if (dets.numel() == 0) 19 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 20 | auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1); 21 | return ml_nms_cuda(b, threshold); 22 | #else 23 | AT_ERROR("Not compiled with GPU support"); 24 | #endif 25 | } 26 | AT_ERROR("CPU version not implemented"); 27 | } 28 | -------------------------------------------------------------------------------- /retrieval/configs/cddb_sip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "cddb", 4 | "task_name": ["gaugan", "biggan", "cyclegan", "imle", "deepfake", "crn", "wild"], 5 | "data_path": "/home/wangyabin/workspace/datasets/DeepFake_Data/CL_data/", 6 | "multiclass": [0, 0, 1, 0, 0, 0, 0], 7 | "class_order": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], 8 | "memory_size": 0, 9 | "memory_per_class": 0, 10 | "fixed_memory": true, 11 | "shuffle": false, 12 | "init_cls": 2, 13 | "increment": 2, 14 | "model_name": "sprompts", 15 | "net_type": "slip", 16 | "embd_dim" : 768, 17 | "prompt_length" : 10, 18 | "total_sessions" : 7, 19 | "device": ["0","1"], 20 | "seed": [1993], 21 | "EPSILON" : 1e-8, 22 | "epochs" : 10, 23 | "lrate" : 0.01, 24 | "milestones" : [20, 30], 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 16 29 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # from .rpn import build_rpn 3 | from .rpn import RPNModule 4 | from .retina import RetinaNetModule 5 | from .fcos import FCOSModule 6 | from .atss import ATSSModule 7 | from .dyhead import DyHeadModule 8 | from .vldyhead import VLDyHeadModule 9 | 10 | _RPN_META_ARCHITECTURES = {"RPN": RPNModule, 11 | "RETINA": RetinaNetModule, 12 | "FCOS": FCOSModule, 13 | "ATSS": ATSSModule, 14 | "DYHEAD": DyHeadModule, 15 | "VLDYHEAD": VLDyHeadModule 16 | } 17 | 18 | 19 | def build_rpn(cfg): 20 | """ 21 | This gives the gist of it. 
Not super important because it doesn't change as much 22 | """ 23 | rpn_arch = _RPN_META_ARCHITECTURES[cfg.MODEL.RPN_ARCHITECTURE] 24 | return rpn_arch(cfg) 25 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/script.txt: -------------------------------------------------------------------------------- 1 | finetune: 2 | 3 | python -m torch.distributed.launch --nproc_per_node=4 tools/finetune.py \ 4 | --config-file configs/refcoco/finetune_A.yaml --skip-test \ 5 | --custom_shot_and_epoch_and_general_copy 0_1_1 \ 6 | --evaluate_only_best_on_test --push_both_val_and_test \ 7 | MODEL.WEIGHT MODEL/glip_a_tiny_o365.pth \ 8 | SOLVER.USE_AMP True TEST.DURING_TRAINING True TEST.IMS_PER_BATCH 4 SOLVER.IMS_PER_BATCH 4 SOLVER.WEIGHT_DECAY 0.05 TEST.EVAL_TASK grounding MODEL.BACKBONE.FREEZE_CONV_BODY_AT 2 MODEL.DYHEAD.USE_CHECKPOINT True SOLVER.FIND_UNUSED_PARAMETERS False SOLVER.TEST_WITH_INFERENCE True SOLVER.USE_AUTOSTEP True DATASETS.USE_OVERRIDE_CATEGORY True SOLVER.SEED 10 DATASETS.SHUFFLE_SEED 3 DATASETS.USE_CAPTION_PROMPT True DATASETS.DISABLE_SHUFFLE True \ 9 | SOLVER.STEP_PATIENCE 2 SOLVER.CHECKPOINT_PER_EPOCH 1.0 SOLVER.AUTO_TERMINATE_PATIENCE 4 SOLVER.MODEL_EMA 0.0 SOLVER.TUNING_HIGHLEVEL_OVERRIDE full 10 | 11 | test: 12 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | # if torch._six.PY37: 5 | if False: 6 | import importlib 7 | import importlib.util 8 | import sys 9 | 10 | 11 | # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa 12 | def import_file(module_name, file_path, make_importable=False): 13 | spec = importlib.util.spec_from_file_location(module_name, file_path) 14 | module = importlib.util.module_from_spec(spec) 15 | spec.loader.exec_module(module) 16 | if make_importable: 17 | sys.modules[module_name] = module 18 | return module 19 | else: 20 | import imp 21 | 22 | def import_file(module_name, file_path, make_importable=None): 23 | module = imp.load_source(module_name, file_path) 24 | return module 25 | -------------------------------------------------------------------------------- /grounding/test/task.vis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.manifold import TSNE 4 | 5 | vectors = np.loadtxt('../MID/tasks_array.txt') 6 | 7 | tsne = TSNE(n_components=2, random_state=0, perplexity=1) 8 | words = ['appliance', 'sports', 'outdoor', 'electronic', 'accessory', 'indoor', 'kitchen', 'furniture', 'vehicle', 'food', 'animal', 'person'] 9 | 10 | Y = tsne.fit_transform(vectors) 11 | 12 | # colors = cm.rainbow(np.linspace(0, 1, Y.shape[0])) 13 | colors = plt.get_cmap('tab20')(range(12)) 14 | for dataset, color, label in zip(Y, colors, words): 15 | plt.scatter(dataset[0], dataset[1], color=color, label=label, s=120) 16 | 17 | plt.xlabel("X") 18 | plt.ylabel("Y") 19 | plt.legend(ncol=4, loc=(0,2/3)) 20 | plt.savefig('../MID/task_visual_4.svg') 21 | # for dataset, label in zip(Y, words): 22 | # plt.annotate(label, (dataset[0], dataset[1]), textcoords='offset points',xytext=(0,10), ha='center') 23 | plt.show() 24 | 25 | # 
plt.savefig('../MID/task_visual.png') -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | 18 | 19 | std::pair soft_nms_cpu(const at::Tensor& dets, 20 | const at::Tensor& scores, 21 | const float threshold, 22 | const float sigma); -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/duplicate_dataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import TypeVar, Optional, Iterator 3 | 4 | import torch 5 | from torch.utils.data import Sampler, Dataset 6 | import torch.distributed as dist 7 | import random 8 | import numpy as np 9 | 10 | 11 | def create_duplicate_dataset(DatasetBaseClass): 12 | class DupDataset(DatasetBaseClass): 13 | 14 | def __init__(self, copy, **kwargs): 15 | super(DupDataset, self).__init__(**kwargs) 16 | 17 | self.copy = copy 18 | self.length = super(DupDataset, self).__len__() 19 | 20 | def __len__(self): 21 | return self.copy * self.length 22 | 23 | def __getitem__(self, index): 24 | true_index = index % self.length 25 | return super(DupDataset, self).__getitem__(true_index) 26 | 27 | def get_img_info(self, index): 28 | true_index = index % self.length 29 | return super(DupDataset, self).get_img_info(true_index) 30 | 31 | return DupDataset 32 | -------------------------------------------------------------------------------- /grounding/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | build/ 4 | DATASET/ 5 | OUTPUT/ 6 | MODEL/ 7 | best_model/ 8 | all_key/ 9 | OUTPUT_org 10 | 11 | # compilation and distribution 12 | __pycache__ 13 | _ext 14 | *.so 15 | maskrcnn_benchmark.egg-info/ 16 | dist/ 17 | 18 | # pytorch/python/numpy formats 19 | *.pth 20 | *.pkl 21 | *.npy 22 | 23 | # ipython/jupyter notebooks 24 | *.ipynb 25 | **/.ipynb_checkpoints/ 26 | 27 | # Editor temporaries 28 | *.swn 29 | *.swo 30 | *.swp 31 | *~ 32 | 33 | # Pycharm editor settings 34 | .idea 35 | 36 | # vscode editor settings 37 | .vscode 38 | 39 | # MacOS 40 | .DS_Store 41 | 42 | # Custom 43 | *.custom.py 44 | 45 | # logs 46 | logs/ 47 | log_new/ 48 | 49 | # res 50 | FINAL_RES/ 51 | FINAL_RES_v2/ 52 | eval/ 53 | 54 | # visualize 55 | visualize_base/ 56 | visualize_glip/ 57 | visualize_lpi/ 58 | visualize_lpi_v2/ 59 | visualize_maple/ 60 | visualize_sprompts/ 61 | 62 | # task_div 63 | TASK_DIV/ 64 | TASK_DIV_REFCOCO/ 65 | 66 | # prompt_save 67 | prompt_save/ 68 | 69 | # embedding save 70 | embedding_save/ 71 | 72 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
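# Usage sketch (illustrative comment, not part of the original file; `output_dir` and `rank` are placeholders):
#   logger = setup_logger("maskrcnn_benchmark", save_dir=output_dir, distributed_rank=rank)
#   logger.info("handlers are only attached on rank 0, so non-master ranks stay silent")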
2 | import logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | def setup_logger(name, save_dir, distributed_rank): 8 | logger = logging.getLogger(name) 9 | logger.setLevel(logging.DEBUG) 10 | # don't log results for the non-master process 11 | if distributed_rank > 0: 12 | return logger 13 | ch = logging.StreamHandler(stream=sys.stdout) 14 | ch.setLevel(logging.DEBUG) 15 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | if save_dir: 20 | filename = f'{datetime.now().date()}-{datetime.now().hour}:{datetime.now().minute}:{datetime.now().second}-GLIP.log.txt' 21 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 22 | fh.setLevel(logging.DEBUG) 23 | fh.setFormatter(formatter) 24 | logger.addHandler(fh) 25 | 26 | return logger 27 | -------------------------------------------------------------------------------- /retrieval/configs/cddb_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "cddb", 4 | "task_name": ["gaugan", "biggan", "wild", "whichfaceisreal", "san"], 5 | "data_path": "/home/wangyabin/workspace/datasets/DeepFake_Data/CL_data/", 6 | "multiclass": [0, 0, 0, 0, 0], 7 | "class_order": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 8 | "total_sessions" : 7, 9 | "memory_size": 0, 10 | "memory_per_class": 0, 11 | "fixed_memory": true, 12 | "shuffle": false, 13 | "init_cls": 2, 14 | "increment": 2, 15 | "model_name": "sprompts", 16 | "net_type": "slip", 17 | "embd_dim" : 768, 18 | "prompt_length" : 10, 19 | "device": ["0","1"], 20 | "seed": [1993], 21 | "EPSILON" : 1e-8, 22 | "init_epoch" : 20, 23 | "init_lr" : 0.001, 24 | "init_milestones" : [20,30,40], 25 | "init_lr_decay" : 0.1, 26 | "init_weight_decay" : 0.0005, 27 | "epochs" : 50, 28 | "lrate" : 0.01, 29 | "milestones" : [20, 30], 30 | "lrate_decay" : 0.1, 31 | "batch_size" : 128, 32 | "weight_decay" : 2e-4, 33 | "num_workers" : 16 34 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
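# Usage sketch (illustrative comment, not part of the original file): the dataset factory in
# maskrcnn_benchmark/data/build.py (not shown here) looks these classes up by name; a direct
# construction, with placeholder paths, would look roughly like
#   dataset = COCODataset(ann_file="annotations/instances_train2017.json", root="train2017",
#                         remove_images_without_annotations=True, transforms=transforms)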
2 | from .coco import COCODataset 3 | from .voc import PascalVOCDataset 4 | from .concat_dataset import ConcatDataset 5 | from .background import Background 6 | from .tsv import TSVDataset, ODTSVDataset 7 | 8 | from .modulated_coco import ModulatedDataset, CocoDetection, CocoGrounding 9 | from .flickr import FlickrDataset 10 | from .refexp import RefExpDataset 11 | from .mixed import MixedDataset 12 | from .gqa import GQADataset 13 | 14 | from .coco_dt import CocoDetectionTSV 15 | from .caption import CaptionTSV 16 | from .lvis import LvisDetection 17 | from .pseudo_data import PseudoData 18 | from .phrasecut import PhrasecutDetection 19 | 20 | __all__ = ["COCODataset", "TSVDataset", "ODTSVDataset", "ConcatDataset", "PascalVOCDataset", "Background", 21 | "ModulatedDataset", "MixedDataset", "CocoDetection", "FlickrDataset", "RefExpDataset", "GQADataset", 22 | "CocoDetectionTSV", "CocoGrounding", "CaptionTSV", "LvisDetection", "PseudoData", "PhrasecutDetection" 23 | ] 24 | -------------------------------------------------------------------------------- /grounding/cmd/cmd.txt: -------------------------------------------------------------------------------- 1 | Environment parameter 2 | CUDA_VISIBLE_DEVICES=2,3;TOKENIZERS_PARALLELISM=(true | false) 3 | 4 | # finetune 5 | --nproc_per_node=2 tools/finetune.py --config-file configs/refcoco/finetune_A.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_1_1 --evaluate_only_best_on_test --push_both_val_and_test 6 | 7 | # inference 8 | python tools/testgrounding_net.py --config-file configs/refcoco/refcoco.yaml --task_config configs/refcoco/val.yaml --weight MODEL/glip_a_tiny_o365.pth OUTPUT_DIR ./ TEST.IMS_PER_BATCH 1 SOLVER.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding 9 | 10 | python -m torch.distributed.launch --nproc_per_node=2 tools/testgrounding_net.py --config-file configs/refcoco/refcoco.yaml --task_config configs/refcoco/val.yaml --weight MODEL/glip_a_tiny_o365.pth OUTPUT_DIR ./ TEST.IMS_PER_BATCH 2 SOLVER.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding 11 | 12 | python -m torch.distributed.launch --nproc_per_node=2 tools/finetune.py --config-file configs/refcoco/finetune_A.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_5_1 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/list_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
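# Usage sketch (illustrative comment, not part of the original file; paths are placeholders):
#   dataset = ListDataset(["/path/to/a.jpg", "/path/to/b.jpg"], transforms=None)
#   img, dummy_target = dataset[0]  # target is a full-image BoxList placeholder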
2 | """ 3 | Simple dataset class that wraps a list of path names 4 | """ 5 | 6 | from PIL import Image 7 | 8 | from maskrcnn_benchmark.structures.bounding_box import BoxList 9 | 10 | 11 | class ListDataset(object): 12 | def __init__(self, image_lists, transforms=None): 13 | self.image_lists = image_lists 14 | self.transforms = transforms 15 | 16 | def __getitem__(self, item): 17 | img = Image.open(self.image_lists[item]).convert("RGB") 18 | 19 | # dummy target 20 | w, h = img.size 21 | target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") 22 | 23 | if self.transforms is not None: 24 | img, target = self.transforms(img, target) 25 | 26 | return img, target 27 | 28 | def __len__(self): 29 | return len(self.image_lists) 30 | 31 | def get_img_info(self, item): 32 | """ 33 | Return the image dimensions for the image, without 34 | loading and pre-processing it 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/backbone/mixer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MixedOperationRandom(nn.Module): 5 | def __init__(self, search_ops): 6 | super(MixedOperationRandom, self).__init__() 7 | self.ops = nn.ModuleList(search_ops) 8 | self.num_ops = len(search_ops) 9 | 10 | def forward(self, x, x_path=None): 11 | if x_path is None: 12 | output = sum(op(x) for op in self.ops) / self.num_ops 13 | else: 14 | assert isinstance(x_path, (int, float)) and 0 <= x_path < self.num_ops or isinstance(x_path, torch.Tensor) 15 | if isinstance(x_path, (int, float)): 16 | x_path = int(x_path) 17 | assert 0 <= x_path < self.num_ops 18 | output = self.ops[x_path](x) 19 | elif isinstance(x_path, torch.Tensor): 20 | assert x_path.size(0) == x.size(0), 'batch_size should match length of y_idx' 21 | output = torch.cat([self.ops[int(x_path[i].item())](x.narrow(0, i, 1)) 22 | for i in range(x.size(0))], dim=0) 23 | return output -------------------------------------------------------------------------------- /retrieval/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Fu-Yun Wang. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /grounding/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_l2p.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 5, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 5, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 16, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | "vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 10, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 10, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "end", 44 | "prompt_type" : "l2p" 45 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_clip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 10, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 0, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 0, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | 
"vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 16, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 10, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "none", 44 | "prompt_type" : "clip" 45 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_lpi.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "image_root": "/home1/yanweicai/DATA/CV/coco", 5 | "annotation_train_root": "/home1/yanweicai/DATA/CV/coco/annotations/retrieval_train2014.json", 6 | "annotation_val_root": "/home1/yanweicai/DATA/CV/coco/annotations/retrieval_val2014.json", 7 | "memory_size": 0, 8 | "memory_per_class": 0, 9 | "fixed_memory": true, 10 | "shuffle": false, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "visual_dim": 768, 15 | "textual_dim": 512, 16 | "prompt_length" : 16, 17 | "total_sessions" : 12, 18 | "device": ["0"], 19 | "seed": [1993], 20 | "EPSILON" : 1e-8, 21 | "init_epoch" : 10, 22 | "init_lr" : 0.05, 23 | "init_lr_decay" : 0.1, 24 | "init_weight_decay" : 0.0005, 25 | "epochs" : 10, 26 | "lrate" : 0.05, 27 | "lrate_decay" : 0.1, 28 | "batch_size" : 64, 29 | "weight_decay" : 2e-4, 30 | "num_workers" : 8, 31 | 32 | "backbonename": "ViT-B/16", 33 | "NCTX": 16, 34 | "CTXINIT": "", 35 | "CSC": false, 36 | "CLASS_TOKEN_POSITION": "end", 37 | "prompt_depth": 3, 38 | "prompt_type" : "lpi" 39 | } -------------------------------------------------------------------------------- /retrieval/configs/lpi/coco_sprompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "data_path": "/home/wangye/wangye/data/", 5 | "memory_size": 0, 6 | "memory_per_class": 0, 7 | "fixed_memory": true, 8 | "shuffle": false, 9 | "init_cls": 345, 10 | "increment": 345, 11 | "model_name": "sprompts", 12 | "net_type": "slip", 13 | "embd_dim" : 768, 14 | "prompt_length" : 16, 15 | "total_sessions" : 12, 16 | "device": ["0"], 17 | "seed": [1993], 18 | "EPSILON" : 1e-8, 19 | "init_epoch" : 10, 20 | "init_lr" : 0.05, 21 | "init_lr_decay" : 0.1, 22 | "init_weight_decay" : 0.0005, 23 | "epochs" : 10, 24 | "lrate" : 0.05, 25 | "lrate_decay" : 0.1, 26 | "batch_size" : 128, 27 | "weight_decay" : 2e-4, 28 | "num_workers" : 8, 29 | 30 | "trainer": "AMPL", 31 | "vision_depth": 0, 32 | "language_depth": 0, 33 | "vision_ctx": 0, 34 | "language_ctx": 0, 35 | "cmpa_length": 16, 36 | "fusing": "mean", 37 | "parameter_sharing": false, 38 | 39 | "backbonename": "ViT-B/16", 40 | "NCTX": 16, 41 | "CTXINIT": "", 42 | "CSC": false, 43 | "CLASS_TOKEN_POSITION": "end", 44 | "prompt_type" : "sprompts" 45 | } -------------------------------------------------------------------------------- /retrieval/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from trainer import train 4 | 5 | 6 | def main(): 7 | args = setup_parser().parse_args() 8 | # param = load_json('configs/coco_org_sprompt.json') 9 | ''' 10 | configs: 11 | clip: configs/lpi/coco_clip.json 12 | l2p: configs/lpi/coco_l2p.json 13 | S-prompts: configs/lpi/coco_sprompts.json 14 | lpi(ours): configs/lpi/coco_sprompts.json 15 | ''' 16 | # param = load_json('configs/lpi/coco_org_sprompt.json') 17 | 18 | param = 
load_json(args.config) 19 | args = vars(args) # Converting argparse Namespace to a dict. 20 | args.update(param) # Add parameters from json 21 | train(args) 22 | 23 | 24 | def load_json(settings_path): 25 | with open(settings_path) as data_file: 26 | param = json.load(data_file) 27 | 28 | return param 29 | 30 | 31 | def setup_parser(): 32 | parser = argparse.ArgumentParser(description='Reproduce of multiple continual learning algorthms.') 33 | parser.add_argument('--config', type=str, default='./exps/finetune.json', 34 | help='Json file of settings.') 35 | parser.add_argument('--local_rank', default=-1) 36 | return parser 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /grounding/shell/cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /root/workspace/grounding/prompt_grounding 4 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task_layer_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 5 | 6 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 7 | 8 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 9 | 10 | python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 11 | 12 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_task.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 13 | # 14 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_layer.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 15 | # 16 | #python tools/finetune.py --config-file configs/refcoco/finetune_A_decompose_interact.yaml --skip-test --custom_shot_and_epoch_and_general_copy 0_10_1 -------------------------------------------------------------------------------- /retrieval/configs/coco_slip.json: -------------------------------------------------------------------------------- 1 | { 2 | "prefix": "reproduce", 3 | "dataset": "Coco", 4 | "image_root": "/root/autodl-tmp/coco", 5 | "annotation_root": "/root/autodl-tmp/coco/annotations/retrieval_train2014.json", 6 | "memory_size": 0, 7 | "memory_per_class": 0, 8 | "fixed_memory": true, 9 | "shuffle": false, 10 | "init_cls": 345, 11 | "increment": 345, 12 | "model_name": "sprompts", 13 | "net_type": "slip", 14 | "embd_dim" : 768, 15 | "prompt_length" : 16, 16 | "total_sessions" : 12, 17 | "device": ["0"], 18 | "seed": [1993], 19 | "EPSILON" : 1e-8, 20 | "init_epoch" : 5, 21 | "init_lr" : 0.05, 22 | "init_lr_decay" : 0.1, 23 | "init_weight_decay" : 0.0005, 24 | "epochs" : 5, 25 | "lrate" : 0.05, 26 | "lrate_decay" : 0.1, 27 | "batch_size" : 128, 28 | "weight_decay" : 2e-4, 29 | "num_workers" : 16, 30 | 31 | "trainer": "CMPA", 32 | "vision_depth": 0, 33 | "language_depth": 0, 34 | "vision_ctx": 0, 35 | "language_ctx": 0, 36 | "cmpa_length": 16, 37 | "fusing": "mean", 38 | "parameter_sharing": true, 39 | 40 | "backbonename": "ViT-B/16", 41 | "NCTX": 16, 42 | "CTXINIT": "", 43 | "CSC": false, 44 | "CLASS_TOKEN_POSITION": "end" 45 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.device().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /grounding/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | 4 | from maskrcnn_benchmark.utils.imports import import_file 5 | 6 | 7 | def setup_environment(): 8 | """Perform environment setup work. The default setup is a no-op, but this 9 | function allows the user to specify a Python source file that performs 10 | custom setup work that may be necessary to their computing environment. 11 | """ 12 | custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") 13 | if custom_module_path: 14 | setup_custom_environment(custom_module_path) 15 | else: 16 | # The default setup is a no-op 17 | pass 18 | 19 | 20 | def setup_custom_environment(custom_module_path): 21 | """Load custom environment setup from a Python source file and run the setup 22 | function. 23 | """ 24 | module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) 25 | assert hasattr(module, "setup_environment") and callable( 26 | module.setup_environment 27 | ), ( 28 | "Custom environment module defined in {} does not have the " 29 | "required callable attribute 'setup_environment'." 
30 | ).format( 31 | custom_module_path 32 | ) 33 | module.setup_environment() 34 | 35 | 36 | # Force environment setup when this module is imported 37 | setup_environment() 38 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark import layers 5 | 6 | 7 | class KeypointRCNNPredictor(nn.Module): 8 | def __init__(self, cfg): 9 | super(KeypointRCNNPredictor, self).__init__() 10 | input_features = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS[-1] 11 | num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES 12 | deconv_kernel = 4 13 | self.kps_score_lowres = layers.ConvTranspose2d( 14 | input_features, 15 | num_keypoints, 16 | deconv_kernel, 17 | stride=2, 18 | padding=deconv_kernel // 2 - 1, 19 | ) 20 | nn.init.kaiming_normal_( 21 | self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" 22 | ) 23 | nn.init.constant_(self.kps_score_lowres.bias, 0) 24 | self.up_scale = 2 25 | 26 | def forward(self, x): 27 | x = self.kps_score_lowres(x) 28 | x = layers.interpolate( 29 | x, scale_factor=self.up_scale, mode="bilinear", align_corners=False 30 | ) 31 | return x 32 | 33 | 34 | _ROI_KEYPOINT_PREDICTOR = {"KeypointRCNNPredictor": KeypointRCNNPredictor} 35 | 36 | 37 | def make_roi_keypoint_predictor(cfg): 38 | func = _ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] 39 | return func(cfg) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # grounding 2 | grounding/*.egg-info 3 | grounding/*.pyc 4 | grounding/build/ 5 | grounding/DATASET/ 6 | grounding/OUTPUT/ 7 | grounding/MODEL/ 8 | grounding/best_model/ 9 | grounding/all_key/ 10 | grounding/OUTPUT_org 11 | 12 | # compilation and distribution 13 | __pycache__ 14 | _ext 15 | *.so 16 | grounding/maskrcnn_benchmark.egg-info/ 17 | grounding/dist/ 18 | 19 | # pytorch/python/numpy formats 20 | grounding/*.pth 21 | grounding/*.pkl 22 | grounding/*.npy 23 | 24 | # ipython/jupyter notebooks 25 | *.ipynb 26 | **/.ipynb_checkpoints/ 27 | 28 | # Editor temporaries 29 | *.swn 30 | *.swo 31 | *.swp 32 | *~ 33 | 34 | # Pycharm editor settings 35 | .idea 36 | 37 | # vscode editor settings 38 | .vscode 39 | 40 | # MacOS 41 | .DS_Store 42 | 43 | # Custom 44 | *.custom.py 45 | 46 | # logs 47 | logs/ 48 | log_new/ 49 | 50 | # res 51 | grounding/FINAL_RES/ 52 | grounding/FINAL_RES_v2/ 53 | grounding/eval/ 54 | 55 | # visualize 56 | grounding/visualize_base/ 57 | grounding/visualize_glip/ 58 | grounding/visualize_lpi/ 59 | grounding/visualize_lpi_v2/ 60 | grounding/visualize_maple/ 61 | grounding/visualize_sprompts/ 62 | 63 | # task_div 64 | grounding/TASK_DIV/ 65 | grounding/TASK_DIV_REFCOCO/ 66 | 67 | # prompt_save 68 | grounding/prompt_save/prompt_grounding/ 69 | 70 | # embedding save 71 | grounding/embedding_save/ 72 | 73 | # retrieval 74 | __pycache__ 75 | .idea/ 76 | retrieval/logs/ 77 | retrieval/logss/ 78 | retrieval/res/ 79 | retrieval/.vscode/ 80 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.device().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | 30 | 31 | std::pair soft_nms(const at::Tensor& dets, 32 | const at::Tensor& scores, 33 | const float threshold, 34 | const float sigma) { 35 | 36 | if (dets.device().is_cuda()) { 37 | #ifdef WITH_CUDA 38 | AT_ERROR("Soft NMS Does Not have GPU support"); 39 | #endif 40 | } 41 | 42 | std::pair result = soft_nms_cpu(dets, scores, threshold, sigma); 43 | 44 | return result; 45 | } -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/evonorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class EvoNorm2d(nn.Module): 6 | __constants__ = ['num_features', 'eps', 'nonlinearity'] 7 | 8 | def __init__(self, num_features, eps=1e-5, nonlinearity=True, group=32): 9 | super(EvoNorm2d, self).__init__() 10 | 11 | self.num_features = num_features 12 | self.eps = eps 13 | self.nonlinearity = nonlinearity 14 | self.group = group 15 | 16 | self.weight = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 17 | self.bias = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 18 | if self.nonlinearity: 19 | self.v = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 20 | 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | nn.init.ones_(self.weight) 25 | nn.init.zeros_(self.bias) 26 | if self.nonlinearity: 27 | nn.init.ones_(self.v) 28 | 29 | def group_std(self, x, groups=32): 30 | N, C, H, W = x.shape 31 | x = torch.reshape(x, (N, groups, C // groups, H, W)) 32 | std = torch.std(x, (3, 4), keepdim=True) 33 | return torch.reshape(std + self.eps, (N, C, 1, 1)) 34 | 35 | def forward(self, x): 36 | if self.nonlinearity: 37 | num = x * torch.sigmoid(self.v * x) 38 | return num / self.group_std(x, self.group) * self.weight + self.bias 39 | else: 40 | return x * self.weight + self.bias -------------------------------------------------------------------------------- /grounding/test/task_visual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import spatial 3 | import matplotlib.pyplot as plt 4 | from sklearn.manifold import TSNE 5 | import matplotlib.cm as cm 6 | import numpy as np 7 | from openai import OpenAI 8 | api_key = "sk-MazpnWiEWQhrgtP8526a79F8D7254a5894296d2d81Ea6c7a" 9 | api_base = "https://oneapi.xty.app/v1" 10 | 11 | client = OpenAI(api_key=api_key, base_url=api_base) 12 | 13 | def get_embedding(text, model="text-embedding-3-large"): 14 | text = text.replace("\n", " ") 15 | return client.embeddings.create(input = [text], model=model).data[0].embedding 16 | 17 | 18 | tsne = TSNE(n_components=2, random_state=0, perplexity=1) 19 | words = ['appliance', 'sports', 'outdoor', 'electronic', 'accessory', 'indoor', 'kitchen', 'furniture', 'vehicle', 'food', 'animal', 'person'] 20 | 
vectors = [get_embedding(word) for word in words] 21 | vectors = np.array(vectors) 22 | 23 | np.savetxt('../MID/tasks_array.txt', vectors) 24 | 25 | Y = tsne.fit_transform(vectors) 26 | 27 | # colors = cm.rainbow(np.linspace(0, 1, Y.shape[0])) 28 | colors = plt.get_cmap('tab20')(range(12)) 29 | for dataset, color, label in zip(Y, colors, words): 30 | plt.scatter(dataset[0], dataset[1], color=color, label=label) 31 | 32 | plt.xlabel("X") 33 | plt.ylabel("Y") 34 | plt.legend() 35 | plt.savefig('../MID/task_visual.png') 36 | # for dataset, label in zip(Y, words): 37 | # plt.annotate(label, (dataset[0], dataset[1]), textcoords='offset points',xytext=(0,10), ha='center') 38 | plt.show() 39 | 40 | # plt.savefig('../MID/task_visual.png') -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.modeling import registry 6 | from . import bert_model 7 | from . import rnn_model 8 | from . import clip_model 9 | from . import word_utils 10 | 11 | 12 | @registry.LANGUAGE_BACKBONES.register("bert-base-uncased") 13 | def build_bert_backbone(cfg): 14 | body = bert_model.BertEncoder(cfg) 15 | model = nn.Sequential(OrderedDict([("body", body)])) 16 | return model 17 | 18 | 19 | @registry.LANGUAGE_BACKBONES.register("roberta-base") 20 | def build_bert_backbone(cfg): 21 | body = bert_model.BertEncoder(cfg) 22 | model = nn.Sequential(OrderedDict([("body", body)])) 23 | return model 24 | 25 | 26 | @registry.LANGUAGE_BACKBONES.register("rnn") 27 | def build_rnn_backbone(cfg): 28 | body = rnn_model.RNNEnoder(cfg) 29 | model = nn.Sequential(OrderedDict([("body", body)])) 30 | return model 31 | 32 | 33 | @registry.LANGUAGE_BACKBONES.register("clip") 34 | def build_clip_backbone(cfg): 35 | body = clip_model.CLIPTransformer(cfg) 36 | model = nn.Sequential(OrderedDict([("body", body)])) 37 | return model 38 | 39 | 40 | def build_backbone(cfg): 41 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in registry.LANGUAGE_BACKBONES, \ 42 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 43 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 44 | ) 45 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg) 46 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def _register_generic(module_dict, module_name, module): 5 | assert module_name not in module_dict 6 | module_dict[module_name] = module 7 | 8 | 9 | class Registry(dict): 10 | ''' 11 | A helper class for managing registering modules, it extends a dictionary 12 | and provides a register functions. 13 | 14 | Eg. creeting a registry: 15 | some_registry = Registry({"default": default_module}) 16 | 17 | There're two ways of registering new modules: 18 | 1): normal way is just calling register function: 19 | def foo(): 20 | ... 21 | some_registry.register("foo_module", foo) 22 | 2): used as decorator when declaring the module: 23 | @some_registry.register("foo_module") 24 | @some_registry.register("foo_modeul_nickname") 25 | def foo(): 26 | ... 
27 | 28 | Access of module is just like using a dictionary, eg: 29 | f = some_registry["foo_modeul"] 30 | ''' 31 | def __init__(self, *args, **kwargs): 32 | super(Registry, self).__init__(*args, **kwargs) 33 | 34 | def register(self, module_name, module=None): 35 | # used as function call 36 | if module is not None: 37 | _register_generic(self, module_name, module) 38 | return 39 | 40 | # used as decorator 41 | def register_fn(fn): 42 | _register_generic(self, module_name, fn) 43 | return fn 44 | 45 | return register_fn 46 | -------------------------------------------------------------------------------- /grounding/odinw/download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | argparser = argparse.ArgumentParser() 5 | argparser.add_argument("--dataset_names", default="all", type=str) # "all" or names joined by comma 6 | argparser.add_argument("--dataset_path", default="DATASET/odinw", type=str) 7 | args = argparser.parse_args() 8 | 9 | root = "https://huggingface.co/GLIPModel/GLIP/tree/main/odinw_35" 10 | 11 | all_datasets = ["AerialMaritimeDrone", "AmericanSignLanguageLetters", "Aquarium", "BCCD", "ChessPieces", "CottontailRabbits", "DroneControl", "EgoHands", "HardHatWorkers", "MaskWearing", "MountainDewCommercial", "NorthAmericaMushrooms", "OxfordPets", "PKLot", "Packages", "PascalVOC", "Raccoon", "ShellfishOpenImages", "ThermalCheetah", "UnoCards", "VehiclesOpenImages", "WildfireSmoke", "boggleBoards", "brackishUnderwater", "dice", "openPoetryVision", "pistols", "plantdoc", "pothole", "selfdrivingCar", "thermalDogsAndPeople", "vector", "websiteScreenshots"] 12 | 13 | datasets_to_download = [] 14 | if args.dataset_names == "all": 15 | datasets_to_download = all_datasets 16 | else: 17 | datasets_to_download = args.dataset_names.split(",") 18 | 19 | for dataset in datasets_to_download: 20 | if dataset in all_datasets: 21 | print("Downloading dataset: ", dataset) 22 | os.system("wget " + root + "/" + dataset + ".zip" + " -O " + args.dataset_path + "/" + dataset + ".zip") 23 | os.system("unzip " + args.dataset_path + "/" + dataset + ".zip -d " + args.dataset_path) 24 | os.system("rm " + args.dataset_path + "/" + dataset + ".zip") 25 | else: 26 | print("Dataset not found: ", dataset) 27 | -------------------------------------------------------------------------------- /grounding/matrix/matrix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from torch.nn.functional import binary_cross_entropy_with_logits 5 | 6 | def nt_bxent_loss(x, target, temperature=1.0): 7 | assert len(x.size()) == 2 8 | target = target.type(torch.float32).to(x.device) 9 | # Cosine similarity 10 | xcs = F.cosine_similarity(x[None, :, :], x[:, None, :], dim=-1) 11 | # Set logit of diagonal element to "inf" signifying complete 12 | # correlation. sigmoid(inf) = 1.0 so this will work out nicely 13 | # when computing the Binary cross-entropy Loss. 14 | xcs[torch.eye(x.size(0)).bool()] = float("inf") 15 | 16 | # Standard binary cross-entropy loss. We use binary_cross_entropy() here and not 17 | # binary_cross_entropy_with_logits() because of 18 | # https://github.com/pytorch/pytorch/issues/102894 19 | # The method *_with_logits() uses the log-sum-exp-trick, which causes inf and -inf values 20 | # to result in a NaN result. 
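    # Illustrative sketch (comment added for clarity, not part of the original file): for a
    # batch of four embeddings in which rows {0, 1} and rows {2, 3} are the positive pairs,
    # the expected `target` matrix (diagonal counted as a positive, matching the inf trick
    # above) would be
    #   target = torch.tensor([[1, 1, 0, 0],
    #                          [1, 1, 0, 0],
    #                          [0, 0, 1, 1],
    #                          [0, 0, 1, 1]])
    # so each row contributes num_pos = 2 positive and num_neg = 2 negative terms below.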
21 | loss = F.binary_cross_entropy(input=(xcs / temperature).sigmoid(), target=target, reduction="none") 22 | 23 | target_pos = target.bool() 24 | target_neg = ~target_pos 25 | 26 | loss_pos = torch.zeros(x.size(0), x.size(0)).to(x.device).masked_scatter(target_pos, loss[target_pos]) 27 | loss_neg = torch.zeros(x.size(0), x.size(0)).to(x.device).masked_scatter(target_neg, loss[target_neg]) 28 | loss_pos = loss_pos.sum(dim=1) 29 | loss_neg = loss_neg.sum(dim=1) 30 | num_pos = target.sum(dim=1) 31 | num_neg = x.size(0) - num_pos 32 | 33 | return ((loss_pos / num_pos) + (loss_neg / num_neg)).mean() -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .batch_norm import FrozenBatchNorm2d, NaiveSyncBatchNorm2d 5 | from .misc import Conv2d, _NewEmptyTensorOp 6 | from .misc import ConvTranspose2d 7 | from .misc import DFConv2d 8 | from .misc import interpolate 9 | from .misc import Scale 10 | from .nms import nms 11 | from .nms import ml_nms 12 | from .nms import soft_nms 13 | from .roi_align import ROIAlign 14 | from .roi_align import roi_align 15 | from .roi_align import ROIAlignV2 16 | from .roi_pool import ROIPool 17 | from .roi_pool import roi_pool 18 | from .smooth_l1_loss import smooth_l1_loss 19 | from .sigmoid_focal_loss import SigmoidFocalLoss, TokenSigmoidFocalLoss 20 | from .iou_loss import IOULoss, IOUWHLoss 21 | from .deform_conv import DeformConv, ModulatedDeformConv 22 | from .dropblock import DropBlock2D, DropBlock3D 23 | from .evonorm import EvoNorm2d 24 | from .dyrelu import DYReLU, swish 25 | from .se import SELayer, SEBlock 26 | from .dyhead import DyHead 27 | from .set_loss import HungarianMatcher, SetCriterion 28 | 29 | __all__ = ["nms", "ml_nms", "soft_nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", 30 | "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "swish", 31 | "FrozenBatchNorm2d", "NaiveSyncBatchNorm2d", "SigmoidFocalLoss", "TokenSigmoidFocalLoss", "IOULoss", 32 | "IOUWHLoss", "Scale", "DeformConv", "ModulatedDeformConv", "DyHead", 33 | "DropBlock2D", "DropBlock3D", "EvoNorm2d", "DYReLU", "SELayer", "SEBlock", 34 | "HungarianMatcher", "SetCriterion", "ROIAlignV2", "_NewEmptyTensorOp"] 35 | -------------------------------------------------------------------------------- /grounding/configs/odinw_35/_all.json: -------------------------------------------------------------------------------- 1 |
["configs/odinw_35/AerialMaritimeDrone_large.yaml","configs/odinw_35/AerialMaritimeDrone_tiled.yaml","configs/odinw_35/AmericanSignLanguageLetters_American_Sign_Language_Letters.v1-v1.coco.yaml","configs/odinw_35/Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml","configs/odinw_35/BCCD_BCCD.v3-raw.coco.yaml","configs/odinw_35/ChessPieces_Chess_Pieces.v23-raw.coco.yaml","configs/odinw_35/CottontailRabbits.yaml","configs/odinw_35/DroneControl_Drone_Control.v3-raw.coco.yaml","configs/odinw_35/EgoHands_generic.yaml","configs/odinw_35/EgoHands_specific.yaml","configs/odinw_35/HardHatWorkers_raw.yaml","configs/odinw_35/MaskWearing_raw.yaml","configs/odinw_35/MountainDewCommercial.yaml","configs/odinw_35/NorthAmericaMushrooms_North_American_Mushrooms.v1-416x416.coco.yaml","configs/odinw_35/OxfordPets_by-breed.yaml","configs/odinw_35/OxfordPets_by-species.yaml","configs/odinw_35/PKLot_640.yaml","configs/odinw_35/Packages_Raw.yaml","configs/odinw_35/PascalVOC.yaml","configs/odinw_35/Raccoon_Raccoon.v2-raw.coco.yaml","configs/odinw_35/ShellfishOpenImages_raw.yaml","configs/odinw_35/ThermalCheetah.yaml","configs/odinw_35/UnoCards_raw.yaml","configs/odinw_35/VehiclesOpenImages_416x416.yaml","configs/odinw_35/WildfireSmoke.yaml","configs/odinw_35/boggleBoards_416x416AutoOrient_export_.yaml","configs/odinw_35/brackishUnderwater_960x540.yaml","configs/odinw_35/dice_mediumColor_export.yaml","configs/odinw_35/openPoetryVision_512x512.yaml","configs/odinw_35/pistols_export.yaml","configs/odinw_35/plantdoc_416x416.yaml","configs/odinw_35/pothole.yaml","configs/odinw_35/selfdrivingCar_fixedLarge_export_.yaml","configs/odinw_35/thermalDogsAndPeople.yaml","configs/odinw_35/websiteScreenshots.yaml"] -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #include "nms.h" 3 | #include "ml_nms.h" 4 | #include "ROIAlign.h" 5 | #include "ROIPool.h" 6 | #include "SigmoidFocalLoss.h" 7 | #include "deform_conv.h" 8 | #include "deform_pool.h" 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("nms", &nms, "non-maximum suppression"); 12 | m.def("ml_nms", &ml_nms, "multi-label non-maximum suppression"); 13 | m.def("soft_nms", &soft_nms, "soft non-maximum suppression"); 14 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 15 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 16 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 17 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 18 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 19 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 20 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); 21 | m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); 22 | m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); 23 | m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); 24 | m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); 25 | m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); 26 | m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); 27 | } 28 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/background.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch 7 | import torchvision 8 | import torch.utils.data as data 9 | from maskrcnn_benchmark.structures.bounding_box import BoxList 10 | 11 | class Background(data.Dataset): 12 | """ Background 13 | 14 | Args: 15 | root (string): Root directory where images are downloaded to. 16 | annFile (string): Path to json annotation file. 17 | transform (callable, optional): A function/transform that takes in an PIL image 18 | and returns a transformed version. E.g, ``transforms.ToTensor`` 19 | """ 20 | 21 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 22 | self.root = root 23 | 24 | with open(ann_file, 'r') as f: 25 | self.ids = json.load(f)['images'] 26 | self.transform = transforms 27 | 28 | def __getitem__(self, index): 29 | """ 30 | Args: 31 | index (int): Index 32 | 33 | Returns: 34 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
35 | """ 36 | im_info = self.ids[index] 37 | path = im_info['file_name'] 38 | fp = os.path.join(self.root, path) 39 | 40 | img = Image.open(fp).convert('RGB') 41 | if self.transform is not None: 42 | img, _ = self.transform(img, None) 43 | null_target = BoxList(torch.zeros((0,4)), (img.shape[-1], img.shape[-2])) 44 | null_target.add_field('labels', torch.zeros(0)) 45 | 46 | return img, null_target, index 47 | 48 | def __len__(self): 49 | return len(self.ids) 50 | 51 | def get_img_info(self, index): 52 | im_info = self.ids[index] 53 | return im_info -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/ema.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from collections import OrderedDict 3 | import torch 4 | 5 | 6 | class ModelEma: 7 | def __init__(self, model, decay=0.9999, device=''): 8 | self.ema = deepcopy(model) 9 | self.ema.eval() 10 | self.decay = decay 11 | self.device = device 12 | if device: 13 | self.ema.to(device=device) 14 | self.ema_is_dp = hasattr(self.ema, 'module') 15 | for p in self.ema.parameters(): 16 | p.requires_grad_(False) 17 | 18 | def load_checkpoint(self, checkpoint): 19 | if isinstance(checkpoint, str): 20 | checkpoint = torch.load(checkpoint) 21 | 22 | assert isinstance(checkpoint, dict) 23 | if 'model_ema' in checkpoint: 24 | new_state_dict = OrderedDict() 25 | for k, v in checkpoint['model_ema'].items(): 26 | if self.ema_is_dp: 27 | name = k if k.startswith('module') else 'module.' + k 28 | else: 29 | name = k.replace('module.', '') if k.startswith('module') else k 30 | new_state_dict[name] = v 31 | self.ema.load_state_dict(new_state_dict) 32 | 33 | def state_dict(self): 34 | return self.ema.state_dict() 35 | 36 | def update(self, model): 37 | pre_module = hasattr(model, 'module') and not self.ema_is_dp 38 | with torch.no_grad(): 39 | curr_msd = model.state_dict() 40 | for k, ema_v in self.ema.state_dict().items(): 41 | k = 'module.' + k if pre_module else k 42 | model_v = curr_msd[k].detach() 43 | if self.device: 44 | model_v = model_v.to(device=self.device) 45 | ema_v.copy_(ema_v * self.decay + (1. 
- self.decay) * model_v) 46 | 47 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/pretrain_model_loading.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from collections import OrderedDict 6 | 7 | def _remove_bn_statics(state_dict): 8 | layer_keys = sorted(state_dict.keys()) 9 | remove_list = [] 10 | for key in layer_keys: 11 | if 'running_mean' in key or 'running_var' in key or 'num_batches_tracked' in key: 12 | remove_list.append(key) 13 | for key in remove_list: 14 | del state_dict[key] 15 | return state_dict 16 | 17 | def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): 18 | import re 19 | layer_keys = sorted(state_dict.keys()) 20 | for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): 21 | if not stage_with_dcn: 22 | continue 23 | for old_key in layer_keys: 24 | pattern = ".*layer{}.*conv2.*".format(ix) 25 | r = re.match(pattern, old_key) 26 | if r is None: 27 | continue 28 | for param in ["weight", "bias"]: 29 | if old_key.find(param) == -1: 30 | continue 31 | if 'unit01' in old_key: 32 | continue 33 | new_key = old_key.replace( 34 | "conv2.{}".format(param), "conv2.conv.{}".format(param) 35 | ) 36 | print("pattern: {}, old_key: {}, new_key: {}".format( 37 | pattern, old_key, new_key 38 | )) 39 | state_dict[new_key] = state_dict[old_key] 40 | del state_dict[old_key] 41 | return state_dict 42 | 43 | 44 | def load_pretrain_format(cfg, f): 45 | model = torch.load(f) 46 | model = _remove_bn_statics(model) 47 | model = _rename_conv_weights_for_deformable_conv_layers(model, cfg) 48 | 49 | return dict(model=model) 50 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.device().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from . 
import transforms as T 3 | from torchvision import transforms 4 | 5 | def build_transforms(cfg, is_train=True): 6 | if is_train: 7 | if len(cfg.AUGMENT.MULT_MIN_SIZE_TRAIN)>0: 8 | min_size = cfg.AUGMENT.MULT_MIN_SIZE_TRAIN 9 | else: 10 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 11 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 12 | flip_horizontal_prob = cfg.AUGMENT.FLIP_PROB_TRAIN 13 | flip_vertical_prob = cfg.AUGMENT.VERTICAL_FLIP_PROB_TRAIN 14 | brightness = cfg.AUGMENT.BRIGHTNESS 15 | contrast = cfg.AUGMENT.CONTRAST 16 | saturation = cfg.AUGMENT.SATURATION 17 | hue = cfg.AUGMENT.HUE 18 | 19 | crop_prob = cfg.AUGMENT.CROP_PROB 20 | min_ious = cfg.AUGMENT.CROP_MIN_IOUS 21 | min_crop_size = cfg.AUGMENT.CROP_MIN_SIZE 22 | 23 | else: 24 | min_size = cfg.INPUT.MIN_SIZE_TEST 25 | max_size = cfg.INPUT.MAX_SIZE_TEST 26 | flip_horizontal_prob = 0.0 27 | 28 | fix_res = cfg.INPUT.FIX_RES 29 | if cfg.INPUT.FORMAT is not '': 30 | input_format = cfg.INPUT.FORMAT 31 | elif cfg.INPUT.TO_BGR255: 32 | input_format = 'bgr255' 33 | normalize_transform = T.Normalize( 34 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format 35 | ) 36 | min_size = 448 37 | max_size = 448 38 | transform = T.Compose( 39 | [ 40 | # T.Resize(min_size, max_size, restrict=fix_res), 41 | T.Resize(min_size=min_size, max_size=max_size, restrict=True), 42 | # transforms.Resize(320), 43 | # T.Resize(320), 44 | T.RandomHorizontalFlip(flip_horizontal_prob), 45 | T.ToTensor(), 46 | normalize_transform, 47 | ] 48 | ) 49 | return transform 50 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/se.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SELayer(nn.Module): 5 | def __init__(self, channel, reduction=16): 6 | super(SELayer, self).__init__() 7 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 8 | self.fc = nn.Sequential( 9 | nn.Linear(channel, channel // reduction, bias=False), 10 | nn.ReLU(inplace=True), 11 | nn.Linear(channel // reduction, channel, bias=False), 12 | nn.Sigmoid() 13 | ) 14 | 15 | def forward(self, x): 16 | b, c, _, _ = x.size() 17 | y = self.avg_pool(x).view(b, c) 18 | y = self.fc(y).view(b, c, 1, 1) 19 | return x * y.expand_as(x) 20 | 21 | 22 | class SEBlock(nn.Module): 23 | def __init__(self, channels, reduction=16, 24 | use_conv=True, mid_activation=nn.ReLU(inplace=True), out_activation=nn.Sigmoid()): 25 | super(SEBlock, self).__init__() 26 | self.use_conv = use_conv 27 | mid_channels = channels // reduction 28 | 29 | self.pool = nn.AdaptiveAvgPool2d(output_size=1) 30 | if use_conv: 31 | self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, bias=True) 32 | else: 33 | self.fc1 = nn.Linear(channels, mid_channels) 34 | self.activ = mid_activation 35 | if use_conv: 36 | self.conv2 = nn.Conv2d(mid_channels, channels, kernel_size=1, bias=True) 37 | else: 38 | self.fc2 = nn.Linear(mid_channels, channels) 39 | self.sigmoid = out_activation 40 | 41 | def forward(self, x): 42 | w = self.pool(x) 43 | if not self.use_conv: 44 | w = w.view(x.size(0), -1) 45 | w = self.conv1(w) if self.use_conv else self.fc1(w) 46 | w = self.activ(w) 47 | w = self.conv2(w) if self.use_conv else self.fc2(w) 48 | w = self.sigmoid(w) 49 | if not self.use_conv: 50 | w = w.unsqueeze(2).unsqueeze(3) 51 | x = x * w 52 | return x -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/rpn/transformer.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | 5 | import copy 6 | from typing import Optional, List 7 | 8 | 9 | def _get_clones(module, N): 10 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 11 | 12 | 13 | def _get_activation_fn(activation): 14 | """Return an activation function given a string""" 15 | if activation == "relu": 16 | return F.relu 17 | if activation == "gelu": 18 | return F.gelu 19 | if activation == "glu": 20 | return F.glu 21 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 22 | 23 | 24 | class TransformerEncoderLayer(nn.Module): 25 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 26 | activation="relu", normalize_before=False): 27 | super(TransformerEncoderLayer, self).__init__() 28 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 29 | # Implementation of Feedforward model 30 | self.linear1 = nn.Linear(d_model, dim_feedforward) 31 | self.dropout = nn.Dropout(dropout) 32 | self.linear2 = nn.Linear(dim_feedforward, d_model) 33 | 34 | self.norm1 = nn.LayerNorm(d_model) 35 | self.norm2 = nn.LayerNorm(d_model) 36 | self.dropout1 = nn.Dropout(dropout) 37 | self.dropout2 = nn.Dropout(dropout) 38 | 39 | self.activation = _get_activation_fn(activation) 40 | self.normalize_before = normalize_before 41 | 42 | def forward(self, src, 43 | src_mask: Optional[Tensor] = None, 44 | src_key_padding_mask: Optional[Tensor] = None): 45 | src2 = self.self_attn(src, src, src, attn_mask=src_mask, 46 | key_padding_mask=src_key_padding_mask)[0] 47 | src = src + self.dropout1(src2) 48 | src = self.norm1(src) 49 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 50 | src = src + self.dropout2(src2) 51 | src = self.norm2(src) 52 | return src 53 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
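# Usage sketch (illustrative; the shapes and numbers below are assumptions for
# demonstration, not part of the original file). ROIPool wraps the compiled
# _C.roi_pool_forward/backward kernels and, per the csrc/ROIPool.h shim, only runs
# on a CUDA build. ROIs follow the maskrcnn-benchmark layout
# (batch_index, x1, y1, x2, y2) in image coordinates, rescaled internally by
# spatial_scale:
#   pooler = ROIPool(output_size=(7, 7), spatial_scale=1.0 / 16)
#   rois = torch.tensor([[0.0, 10.0, 20.0, 120.0, 200.0]], device="cuda")
#   pooled = pooler(feature_map, rois)  # -> (num_rois, channels, 7, 7)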
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = _C.roi_pool_forward( 18 | input, roi, spatial_scale, output_size[0], output_size[1] 19 | ) 20 | ctx.save_for_backward(input, roi, argmax) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | input, rois, argmax = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = _C.roi_pool_backward( 31 | grad_output, 32 | input, 33 | rois, 34 | argmax, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | ) 43 | return grad_input, None, None, None 44 | 45 | 46 | roi_pool = _ROIPool.apply 47 | 48 | 49 | class ROIPool(nn.Module): 50 | def __init__(self, output_size, spatial_scale): 51 | super(ROIPool, self).__init__() 52 | self.output_size = output_size 53 | self.spatial_scale = spatial_scale 54 | 55 | def forward(self, input, rois): 56 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 57 | 58 | def __repr__(self): 59 | tmpstr = self.__class__.__name__ + "(" 60 | tmpstr += "output_size=" + str(self.output_size) 61 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 62 | tmpstr += ")" 63 | return tmpstr 64 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/deform_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
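// Dispatch note (descriptive only): like the other headers in csrc/, the functions
// below are thin Python-facing shims. Each routes to its CUDA kernel when the input
// tensor lives on a GPU and the extension was built with WITH_CUDA, and otherwise
// raises, e.g.
//   if (input.device().is_cuda()) { /* deform_psroi_pooling_cuda_forward(...) */ }
//   AT_ERROR("Not implemented on the CPU");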
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | // Interface for Python 11 | void deform_psroi_pooling_forward( 12 | at::Tensor input, 13 | at::Tensor bbox, 14 | at::Tensor trans, 15 | at::Tensor out, 16 | at::Tensor top_count, 17 | const int no_trans, 18 | const float spatial_scale, 19 | const int output_dim, 20 | const int group_size, 21 | const int pooled_size, 22 | const int part_size, 23 | const int sample_per_part, 24 | const float trans_std) 25 | { 26 | if (input.device().is_cuda()) { 27 | #ifdef WITH_CUDA 28 | return deform_psroi_pooling_cuda_forward( 29 | input, bbox, trans, out, top_count, 30 | no_trans, spatial_scale, output_dim, group_size, 31 | pooled_size, part_size, sample_per_part, trans_std 32 | ); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | 41 | void deform_psroi_pooling_backward( 42 | at::Tensor out_grad, 43 | at::Tensor input, 44 | at::Tensor bbox, 45 | at::Tensor trans, 46 | at::Tensor top_count, 47 | at::Tensor input_grad, 48 | at::Tensor trans_grad, 49 | const int no_trans, 50 | const float spatial_scale, 51 | const int output_dim, 52 | const int group_size, 53 | const int pooled_size, 54 | const int part_size, 55 | const int sample_per_part, 56 | const float trans_std) 57 | { 58 | if (input.device().is_cuda()) { 59 | #ifdef WITH_CUDA 60 | return deform_psroi_pooling_cuda_backward( 61 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, 62 | no_trans, spatial_scale, output_dim, group_size, pooled_size, 63 | part_size, sample_per_part, trans_std 64 | ); 65 | #else 66 | AT_ERROR("Not compiled with GPU support"); 67 | #endif 68 | } 69 | AT_ERROR("Not implemented on the CPU"); 70 | } 71 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor 4 | from .roi_keypoint_predictors import make_roi_keypoint_predictor 5 | from .inference import make_roi_keypoint_post_processor 6 | from .loss import make_roi_keypoint_loss_evaluator 7 | 8 | 9 | class ROIKeypointHead(torch.nn.Module): 10 | def __init__(self, cfg): 11 | super(ROIKeypointHead, self).__init__() 12 | self.cfg = cfg.clone() 13 | self.feature_extractor = make_roi_keypoint_feature_extractor(cfg) 14 | self.predictor = make_roi_keypoint_predictor(cfg) 15 | self.post_processor = make_roi_keypoint_post_processor(cfg) 16 | self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) 17 | 18 | def forward(self, features, proposals, targets=None): 19 | """ 20 | Arguments: 21 | features (list[Tensor]): feature-maps from possibly several levels 22 | proposals (list[BoxList]): proposal boxes 23 | targets (list[BoxList], optional): the ground-truth targets. 24 | 25 | Returns: 26 | x (Tensor): the result of the feature extractor 27 | proposals (list[BoxList]): during training, the original proposals 28 | are returned. During testing, the predicted boxlists are returned 29 | with the `mask` field set 30 | losses (dict[Tensor]): During training, returns the losses for the 31 | head. During testing, returns an empty dict. 
32 | """ 33 | if self.training: 34 | with torch.no_grad(): 35 | proposals = self.loss_evaluator.subsample(proposals, targets) 36 | 37 | x = self.feature_extractor(features, proposals) 38 | kp_logits = self.predictor(x) 39 | 40 | if not self.training: 41 | result = self.post_processor(kp_logits, proposals) 42 | return x, result, {} 43 | 44 | loss_kp = self.loss_evaluator(proposals, kp_logits) 45 | 46 | return x, proposals, dict(loss_kp=loss_kp) 47 | 48 | 49 | def build_roi_keypoint_head(cfg): 50 | return ROIKeypointHead(cfg) -------------------------------------------------------------------------------- /grounding/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "maskrcnn_benchmark._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="maskrcnn_benchmark", 61 | description="object detection in pytorch", 62 | packages=find_packages(exclude=("configs", "tests",)), 63 | # install_requires=requirements, 64 | ext_modules=get_extensions(), 65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension.with_options(use_ninja=False)}, 66 | ) 67 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch.utils.data as data 7 | 8 | def pil_loader(path): 9 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 10 | with open(path, 'rb') as f: 11 | img = Image.open(f) 12 | return img.convert('RGB') 13 | 14 | class ImageNet(data.Dataset): 15 | """ ImageNet 16 | 17 | Args: 18 | root (string): Root directory where images are downloaded to. 19 | annFile (string): Path to json annotation file. 
20 | transform (callable, optional): A function/transform that takes in an PIL image 21 | and returns a transformed version. E.g, ``transforms.ToTensor`` 22 | """ 23 | 24 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 25 | 26 | 27 | self.root = root 28 | self.transform = transforms 29 | 30 | meta_file = os.path.join(root, ann_file) 31 | assert os.path.exists(meta_file), 'meta file %s under root %s not found' % (os.path.basename(meta_file), root) 32 | 33 | with open(meta_file, 'r') as f: 34 | meta = json.load(f) 35 | 36 | self.classes = meta['classes'] 37 | self.class_to_idx = meta['class_to_idx'] 38 | self.samples = meta['samples'] 39 | self.num_sample = len(self.samples) 40 | self.allsamples = self.samples 41 | 42 | def select_class(self, cls): 43 | new_samples = [sample for sample in self.allsamples if sample[-1] in cls] 44 | self.samples = new_samples 45 | self.num_sample = len(self.samples) 46 | 47 | def __getitem__(self, index): 48 | """ 49 | Args: 50 | index (int): Index 51 | 52 | Returns: 53 | tuple: (sample, target) where target is class_index of the target class. 54 | """ 55 | img_path, target = self.samples[index] 56 | sample = pil_loader(self.root + '/' + img_path) 57 | if self.transform is not None: 58 | sample = self.transform(sample) 59 | 60 | return sample, target, index 61 | 62 | def __len__(self): 63 | return len(self.samples) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | 4 | 5 | class FastRCNNPredictor(nn.Module): 6 | def __init__(self, config, pretrained=None): 7 | super(FastRCNNPredictor, self).__init__() 8 | 9 | stage_index = 4 10 | stage2_relative_factor = 2 ** (stage_index - 1) 11 | res2_out_channels = config.MODEL.RESNETS.RES2_OUT_CHANNELS 12 | num_inputs = res2_out_channels * stage2_relative_factor 13 | 14 | num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 15 | self.avgpool = nn.AvgPool2d(kernel_size=7, stride=7) 16 | self.cls_score = nn.Linear(num_inputs, num_classes) 17 | self.bbox_pred = nn.Linear(num_inputs, num_classes * 4) 18 | 19 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 20 | nn.init.constant_(self.cls_score.bias, 0) 21 | 22 | nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) 23 | nn.init.constant_(self.bbox_pred.bias, 0) 24 | 25 | def forward(self, x): 26 | x = self.avgpool(x) 27 | x = x.view(x.size(0), -1) 28 | cls_logit = self.cls_score(x) 29 | bbox_pred = self.bbox_pred(x) 30 | return cls_logit, bbox_pred 31 | 32 | 33 | class FPNPredictor(nn.Module): 34 | def __init__(self, cfg): 35 | super(FPNPredictor, self).__init__() 36 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 37 | representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM 38 | 39 | self.cls_score = nn.Linear(representation_size, num_classes) 40 | self.bbox_pred = nn.Linear(representation_size, num_classes * 4) 41 | 42 | nn.init.normal_(self.cls_score.weight, std=0.01) 43 | nn.init.normal_(self.bbox_pred.weight, std=0.001) 44 | for l in [self.cls_score, self.bbox_pred]: 45 | nn.init.constant_(l.bias, 0) 46 | 47 | def forward(self, x): 48 | scores = self.cls_score(x) 49 | bbox_deltas = self.bbox_pred(x) 50 | 51 | return scores, bbox_deltas 52 | 53 | 54 | _ROI_BOX_PREDICTOR = { 55 | "FastRCNNPredictor": FastRCNNPredictor, 
56 | "FPNPredictor": FPNPredictor, 57 | } 58 | 59 | 60 | def make_roi_box_predictor(cfg): 61 | func = _ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] 62 | return func(cfg) 63 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/shallow_contrastive_loss_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import maskrcnn_benchmark.utils.dist as dist 3 | 4 | 5 | def normalized_positive_map(positive_map): 6 | positive_map = positive_map.float() 7 | positive_map_num_pos = positive_map.sum(2) 8 | positive_map_num_pos[positive_map_num_pos == 0] = 1e-6 9 | positive_map = positive_map / positive_map_num_pos.unsqueeze(-1) 10 | return positive_map 11 | 12 | 13 | def pad_tensor_given_dim_length(tensor, dim, length, padding_value=0, batch_first=True): 14 | new_size = list(tensor.size()[:dim]) + [length] + list(tensor.size()[dim + 1:]) 15 | out_tensor = tensor.data.new(*new_size).fill_(padding_value) 16 | if batch_first: 17 | out_tensor[:, :tensor.size(1), ...] = tensor 18 | else: 19 | out_tensor[:tensor.size(0), ...] = tensor 20 | return out_tensor 21 | 22 | 23 | def pad_random_negative_tensor_given_length(positive_tensor, negative_padding_tensor, length=None): 24 | assert positive_tensor.shape[0] + negative_padding_tensor.shape[0] == length 25 | return torch.cat((positive_tensor, negative_padding_tensor), dim=0) 26 | 27 | 28 | def gather_tensors(tensor): 29 | """ 30 | Performs all_gather operation on the provided tensors. 31 | *** Warning ***: torch.distributed.all_gather has no gradient. 32 | """ 33 | if not dist.is_dist_avail_and_initialized(): 34 | return torch.stack([tensor], dim=0) 35 | 36 | total = dist.get_world_size() 37 | rank = torch.distributed.get_rank() 38 | # gathered_normalized_img_emb = [torch.zeros_like(normalized_img_emb) for _ in range(total)] 39 | # torch.distributed.all_gather(gathered_normalized_img_emb, normalized_img_emb) 40 | 41 | tensors_gather = [ 42 | torch.zeros_like(tensor) 43 | for _ in range(total) 44 | ] 45 | torch.distributed.all_gather(tensors_gather, tensor, async_op=False) 46 | 47 | # need to do this to restore propagation of the gradients 48 | tensors_gather[rank] = tensor 49 | output = torch.stack(tensors_gather, dim=0) 50 | return output 51 | 52 | 53 | def convert_to_roi_format(boxes): 54 | concat_boxes = boxes.bbox 55 | device, dtype = concat_boxes.device, concat_boxes.dtype 56 | ids = torch.full((len(boxes), 1), 0, dtype=dtype, device=device) 57 | rois = torch.cat([ids, concat_boxes], dim=1) 58 | return rois -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/mask_head/hourglass.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 4 | 5 | 6 | class Residual(nn.Module): 7 | def __init__(self, inp_dim, out_dim, use_gn=False): 8 | super(Residual, self).__init__() 9 | self.relu = nn.ReLU() 10 | # self.bn1 = nn.BatchNorm2d(inp_dim) 11 | self.conv1 = make_conv3x3(inp_dim, int(out_dim / 2), 1, use_relu=False, use_gn=use_gn) 12 | # self.bn2 = nn.BatchNorm2d(int(out_dim / 2)) 13 | self.conv2 = make_conv3x3(int(out_dim / 2), int(out_dim / 2), 3, use_relu=False, use_gn=use_gn) 14 | # self.bn3 = nn.BatchNorm2d(int(out_dim / 2)) 15 | self.conv3 = make_conv3x3(int(out_dim / 2), out_dim, 1, use_relu=False, use_gn=use_gn) 16 | if inp_dim == 
out_dim: 17 | self.need_skip = False 18 | else: 19 | self.need_skip = True 20 | self.skip_layer = make_conv3x3(inp_dim, out_dim, 1, use_relu=False, use_gn=False) 21 | 22 | def forward(self, x): 23 | if self.need_skip: 24 | residual = self.skip_layer(x) 25 | else: 26 | residual = x 27 | out = x 28 | # out = self.bn1(out) 29 | out = self.relu(out) 30 | out = self.conv1(out) 31 | # out = self.bn2(out) 32 | out = self.relu(out) 33 | out = self.conv2(out) 34 | # out = self.bn3(out) 35 | out = self.relu(out) 36 | out = self.conv3(out) 37 | out += residual 38 | return out 39 | 40 | 41 | class Hourglass(nn.Module): 42 | def __init__(self, n, f, gn=False, increase=0): 43 | super(Hourglass, self).__init__() 44 | nf = f + increase 45 | self.up1 = Residual(f, f) 46 | # Lower branch 47 | self.pool1 = nn.MaxPool2d(2, 2) 48 | self.low1 = Residual(f, nf) 49 | self.n = n 50 | # Recursive hourglass 51 | if self.n > 1: 52 | self.low2 = Hourglass(n-1, nf, gn=gn) 53 | else: 54 | self.low2 = Residual(nf, nf, gn) 55 | self.low3 = Residual(nf, f, gn) 56 | self.up2 = nn.Upsample(scale_factor=2, mode='nearest') 57 | 58 | def forward(self, x): 59 | up1 = self.up1(x) 60 | pool1 = self.pool1(x) 61 | low1 = self.low1(pool1) 62 | low2 = self.low2(low1) 63 | low3 = self.low3(low2) 64 | up2 = self.up2(low3) 65 | return up1 + up2 -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.data import datasets 2 | 3 | from .coco import coco_evaluation 4 | from .voc import voc_evaluation 5 | from .vg import vg_evaluation 6 | from .box_aug import im_detect_bbox_aug 7 | from .od_to_grounding import od_to_grounding_evaluation 8 | 9 | 10 | def evaluate(dataset, predictions, output_folder, **kwargs): 11 | """evaluate dataset using different methods based on dataset type. 12 | Args: 13 | dataset: Dataset object 14 | predictions(list[BoxList]): each item in the list represents the 15 | prediction results for one image. 16 | output_folder: output folder, to save evaluation files or results. 17 | **kwargs: other args. 
18 | Returns: 19 | evaluation result 20 | """ 21 | args = dict( 22 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 23 | ) 24 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 25 | return coco_evaluation(**args) 26 | # elif isinstance(dataset, datasets.VGTSVDataset): 27 | # return vg_evaluation(**args) 28 | elif isinstance(dataset, datasets.PascalVOCDataset): 29 | return voc_evaluation(**args) 30 | elif isinstance(dataset, datasets.CocoDetectionTSV): 31 | return od_to_grounding_evaluation(**args) 32 | elif isinstance(dataset, datasets.LvisDetection): 33 | pass 34 | else: 35 | dataset_name = dataset.__class__.__name__ 36 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 37 | 38 | 39 | def evaluate_mdetr(dataset, predictions, output_folder, cfg=None, **kwargs): 40 | 41 | args = dict( 42 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 43 | ) 44 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 45 | return coco_evaluation(**args) 46 | # elif isinstance(dataset, datasets.VGTSVDataset): 47 | # return vg_evaluation(**args) 48 | elif isinstance(dataset, datasets.PascalVOCDataset): 49 | return voc_evaluation(**args) 50 | elif isinstance(dataset, datasets.CocoDetectionTSV): 51 | return od_to_grounding_evaluation(**args) 52 | elif isinstance(dataset, datasets.LvisDetection): 53 | pass 54 | else: 55 | dataset_name = dataset.__class__.__name__ 56 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 57 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from __future__ import division 3 | 4 | import torch 5 | 6 | 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | """ 17 | Arguments: 18 | tensors (tensor) 19 | image_sizes (list[tuple[int, int]]) 20 | """ 21 | self.tensors = tensors 22 | self.image_sizes = image_sizes 23 | 24 | def to(self, *args, **kwargs): 25 | cast_tensor = self.tensors.to(*args, **kwargs) 26 | return ImageList(cast_tensor, self.image_sizes) 27 | 28 | 29 | def to_image_list(tensors, size_divisible=0): 30 | """ 31 | tensors can be an ImageList, a torch.Tensor or 32 | an iterable of Tensors. It can't be a numpy array.
33 | When tensors is an iterable of Tensors, it pads 34 | the Tensors with zeros so that they have the same 35 | shape 36 | """ 37 | if isinstance(tensors, torch.Tensor) and size_divisible > 0: 38 | tensors = [tensors] 39 | 40 | if isinstance(tensors, ImageList): 41 | return tensors 42 | elif isinstance(tensors, torch.Tensor): 43 | # single tensor shape can be inferred 44 | assert tensors.dim() == 4 45 | image_sizes = [tensor.shape[-2:] for tensor in tensors] 46 | return ImageList(tensors, image_sizes) 47 | elif isinstance(tensors, (tuple, list)): 48 | max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) 49 | 50 | # TODO Ideally, just remove this and let me model handle arbitrary 51 | # input sizs 52 | if size_divisible > 0: 53 | import math 54 | 55 | stride = size_divisible 56 | max_size = list(max_size) 57 | max_size[1] = int(math.ceil(max_size[1] / stride) * stride) 58 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 59 | max_size = tuple(max_size) 60 | 61 | batch_shape = (len(tensors),) + max_size 62 | batched_imgs = tensors[0].new(*batch_shape).zero_() 63 | for img, pad_img in zip(tensors, batched_imgs): 64 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 65 | 66 | image_sizes = [im.shape[-2:] for im in tensors] 67 | 68 | return ImageList(batched_imgs, image_sizes) 69 | else: 70 | raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) 71 | -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_A_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/pretrain/glip_Swin_T_O365_GoldG.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/tools/cityscapes/instances2dict_with_polygons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Convert instances from png files to a dictionary 4 | # This files is created according to https://github.com/facebookresearch/Detectron/issues/111 5 | 6 | from __future__ import print_function, absolute_import, division 7 | import os, sys 8 | 9 | sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' , 'helpers' ) ) ) 10 | from csHelpers import * 11 | 12 | # Cityscapes imports 13 | from cityscapesscripts.evaluation.instance import * 14 | from cityscapesscripts.helpers.csHelpers import * 15 | import cv2 16 | from maskrcnn_benchmark.utils import cv2_util 17 | 18 | 19 | def instances2dict_with_polygons(imageFileList, verbose=False): 20 | imgCount = 0 21 | instanceDict = {} 22 | 23 | if not isinstance(imageFileList, list): 24 | imageFileList = [imageFileList] 25 | 26 | if verbose: 27 | print("Processing {} images...".format(len(imageFileList))) 28 | 29 | for imageFileName in imageFileList: 30 | # Load image 31 | img = Image.open(imageFileName) 32 | 33 | # Image as numpy array 34 | imgNp = np.array(img) 35 | 36 | # Initialize label categories 37 | instances = {} 38 | for label in labels: 39 | instances[label.name] = [] 40 | 41 | # Loop through all instance ids in instance image 42 | for instanceId in np.unique(imgNp): 43 | if instanceId < 1000: 44 | continue 45 | instanceObj = Instance(imgNp, instanceId) 46 | instanceObj_dict = instanceObj.toDict() 47 | 48 | #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) 49 | if id2label[instanceObj.labelID].hasInstances: 50 | mask = (imgNp == instanceId).astype(np.uint8) 51 | contour, hier = cv2_util.findContours( 52 | mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) 53 | 54 | polygons = [c.reshape(-1).tolist() for c in contour] 55 | instanceObj_dict['contours'] = polygons 56 | 57 | instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) 58 | 59 | imgKey = os.path.abspath(imageFileName) 60 | instanceDict[imgKey] = instances 61 | imgCount += 1 62 | 63 | if verbose: 64 | print("\rImages Processed: {}".format(imgCount), end=' ') 65 | sys.stdout.flush() 66 | 67 | if verbose: 68 | print("") 69 | 70 | return instanceDict 71 | 72 | def main(argv): 73 | fileList = [] 74 | if (len(argv) > 2): 75 | for arg in argv: 76 | if ("png" in arg): 77 | fileList.append(arg) 78 | instances2dict_with_polygons(fileList, True) 79 | 80 | if __name__ == "__main__": 81 | main(sys.argv[1:]) 82 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/finetune.yaml: 
-------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_tiny_model_o365_goldg.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | # TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) 60 | # TEST: ("coco_2017_val", ) 61 | TRAIN: ("refexp_train",) 62 | TEST: ("refexp_val",) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 64 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /grounding/configs/refcoco/refcoco.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | MLM_LOSS: False 51 | 52 | USE_CHECKPOINT: True 53 | 54 | TEST: 55 | DURING_TRAINING: False 56 | IMS_PER_BATCH: 1 57 | MDETR_STYLE_AGGREGATE_CLASS_NUM: 100 58 | EVAL_TASK: grounding 59 | # use for grounding model 60 | DATASETS: 61 | TRAIN: ("refcoco_train", ) 62 | TEST: ("refcoco_val", ) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 1 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 103 | 104 | OUTPUT_DIR: OUTPUT 105 | -------------------------------------------------------------------------------- /grounding/configs/flickr/flickr.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | MLM_LOSS: False 51 | 52 | USE_CHECKPOINT: True 53 | 54 | TEST: 55 | DURING_TRAINING: False 56 | IMS_PER_BATCH: 1 57 | MDETR_STYLE_AGGREGATE_CLASS_NUM: 100 58 | EVAL_TASK: grounding 59 | # use for grounding model 60 | DATASETS: 61 | TRAIN: ("object365_dt_train", ) 62 | TEST: ("coco_2017_val", ) 63 | DISABLE_SHUFFLE: False 64 | ADD_DET_PROMPT: False 65 | RANDOM_SAMPLE_NEG: 85 66 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 67 | 68 | SEPARATION_TOKENS: ". 
" 69 | 70 | INPUT: 71 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 72 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 73 | MIN_SIZE_TRAIN: 800 74 | MAX_SIZE_TRAIN: 1333 75 | MIN_SIZE_TEST: 800 76 | MAX_SIZE_TEST: 1333 77 | 78 | AUGMENT: 79 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 80 | 81 | DATALOADER: 82 | SIZE_DIVISIBILITY: 32 83 | 84 | SOLVER: 85 | OPTIMIZER: ADAMW 86 | BASE_LR: 0.0001 87 | LANG_LR: 0.00001 88 | WEIGHT_DECAY: 0.0001 89 | STEPS: (0.67, 0.89) 90 | MAX_EPOCH: 30 91 | IMS_PER_BATCH: 1 92 | WARMUP_ITERS: 2000 93 | WARMUP_FACTOR: 0.001 94 | USE_AMP: True 95 | MODEL_EMA: 0.999 96 | FIND_UNUSED_PARAMETERS: False 97 | 98 | CLIP_GRADIENTS: 99 | ENABLED: True 100 | CLIP_TYPE: "full_model" 101 | CLIP_VALUE: 1.0 102 | NORM_TYPE: 2.0 103 | 104 | OUTPUT_DIR: OUTPUT 105 | -------------------------------------------------------------------------------- /grounding/SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template <typename scalar_t> 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data_ptr<uint8_t>(); 30 | auto order = order_t.data_ptr<int64_t>(); 31 | auto x1 = x1_t.data_ptr<scalar_t>(); 32 | auto y1 = y1_t.data_ptr<scalar_t>(); 33 | auto x2 = x2_t.data_ptr<scalar_t>(); 34 | auto y2 = y2_t.data_ptr<scalar_t>(); 35 | auto areas = areas_t.data_ptr<scalar_t>(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { 72 | result = nms_cpu_kernel<scalar_t>(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/backbone/ops.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def conv7x7(in_planes, out_planes, stride=1, groups=1, dilation=1): 8 | """7x7 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes,
kernel_size=7, stride=stride, 10 | padding=3*dilation, groups=groups, bias=False, dilation=dilation) 11 | 12 | 13 | def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1): 14 | """5x5 convolution with padding""" 15 | return nn.Conv2d(in_planes, out_planes, kernel_size=5, stride=stride, 16 | padding=2*dilation, groups=groups, bias=False, dilation=dilation) 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=dilation, groups=groups, bias=False, dilation=dilation) 23 | 24 | 25 | def conv1x1(in_planes, out_planes, stride=1): 26 | """1x1 convolution""" 27 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 28 | 29 | 30 | def maxpool(**kwargs): 31 | return nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 32 | 33 | 34 | def avgpool(**kwargs): 35 | return nn.AvgPool2d(kernel_size=3, stride=2, padding=1) 36 | 37 | def dropout(prob): 38 | return nn.Dropout(prob) 39 | 40 | 41 | conv3x3sep = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 42 | conv3x3g2 = lambda i, o, s=1: conv3x3(i, o, s, groups=2) 43 | conv3x3g4 = lambda i, o, s=1: conv3x3(i, o, s, groups=4) 44 | conv3x3g8 = lambda i, o, s=1: conv3x3(i, o, s, groups=8) 45 | conv3x3dw = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 46 | 47 | conv3x3d2 = lambda i, o, s=1: conv3x3(i, o, s, dilation=2) 48 | conv3x3d3 = lambda i, o, s=1: conv3x3(i, o, s, dilation=3) 49 | conv3x3d4 = lambda i, o, s=1: conv3x3(i, o, s, dilation=4) 50 | 51 | 52 | conv5x5sep = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 53 | conv5x5g2 = lambda i, o, s=1: conv5x5(i, o, s, groups=2) 54 | conv5x5g4 = lambda i, o, s=1: conv5x5(i, o, s, groups=4) 55 | conv5x5g8 = lambda i, o, s=1: conv5x5(i, o, s, groups=8) 56 | conv5x5dw = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 57 | 58 | 59 | conv5x5d2 = lambda i, o, s=1: conv5x5(i, o, s, dilation=2) 60 | conv5x5d3 = lambda i, o, s=1: conv5x5(i, o, s, dilation=3) 61 | conv5x5d4 = lambda i, o, s=1: conv5x5(i, o, s, dilation=4) 62 | 63 | conv7x7sep = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 64 | conv7x7g2 = lambda i, o, s=1: conv7x7(i, o, s, groups=2) 65 | conv7x7g4 = lambda i, o, s=1: conv7x7(i, o, s, groups=4) 66 | conv7x7g8 = lambda i, o, s=1: conv7x7(i, o, s, groups=8) 67 | conv7x7dw = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 68 | 69 | conv7x7d2 = lambda i, o, s=1: conv7x7(i, o, s, dilation=2) 70 | conv7x7d3 = lambda i, o, s=1: conv7x7(i, o, s, dilation=3) 71 | conv7x7d4 = lambda i, o, s=1: conv7x7(i, o, s, dilation=4) -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
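# Usage sketch (numbers are hypothetical, for illustration only): with
# batch_size_per_image=256 and positive_fraction=0.5,
#   sampler = BalancedPositiveNegativeSampler(256, 0.5)
#   pos_masks, neg_masks = sampler([matched_idxs_per_image])
# returns, for each image, boolean masks over the proposals that keep at most
# 128 randomly chosen positives (matched label >= 1) and, as far as availability
# allows, enough randomly chosen negatives (matched label == 0) to fill the
# 256-element minibatch; entries equal to -1 are ignored.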
2 | import torch 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | """ 9 | 10 | def __init__(self, batch_size_per_image, positive_fraction): 11 | """ 12 | Arguments: 13 | batch_size_per_image (int): number of elements to be selected per image 14 | positive_fraction (float): percentace of positive elements per batch 15 | """ 16 | self.batch_size_per_image = batch_size_per_image 17 | self.positive_fraction = positive_fraction 18 | 19 | def __call__(self, matched_idxs): 20 | """ 21 | Arguments: 22 | matched idxs: list of tensors containing -1, 0 or positive values. 23 | Each tensor corresponds to a specific image. 24 | -1 values are ignored, 0 are considered as negatives and > 0 as 25 | positives. 26 | 27 | Returns: 28 | pos_idx (list[tensor]) 29 | neg_idx (list[tensor]) 30 | 31 | Returns two lists of binary masks for each image. 32 | The first list contains the positive elements that were selected, 33 | and the second list the negative example. 34 | """ 35 | pos_idx = [] 36 | neg_idx = [] 37 | for matched_idxs_per_image in matched_idxs: 38 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 39 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 40 | 41 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 42 | # protect against not enough positive examples 43 | num_pos = min(positive.numel(), num_pos) 44 | num_neg = self.batch_size_per_image - num_pos 45 | # protect against not enough negative examples 46 | num_neg = min(negative.numel(), num_neg) 47 | 48 | # randomly select positive and negative examples 49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.bool 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.bool 61 | ) 62 | pos_idx_per_image_mask[pos_idx_per_image] = 1 63 | neg_idx_per_image_mask[neg_idx_per_image] = 1 64 | 65 | pos_idx.append(pos_idx_per_image_mask) 66 | neg_idx.append(neg_idx_per_image_mask) 67 | 68 | return pos_idx, neg_idx 69 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
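# Shape sketch for permute_and_flatten (sizes are hypothetical): a per-level box_cls
# map of shape (N, A*C, H, W), e.g. (2, 9*81, 50, 68), is viewed as (N, A, C, H, W),
# permuted to (N, H, W, A, C) and reshaped to (N, H*W*A, C), so each anchor at each
# spatial location becomes one row of class logits; concat_box_prediction_layers then
# concatenates these rows across feature levels before the final (-1, C) reshape.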
2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | 19 | def permute_and_flatten(layer, N, A, C, H, W): 20 | layer = layer.view(N, -1, C, H, W) 21 | layer = layer.permute(0, 3, 4, 1, 2) 22 | layer = layer.reshape(N, -1, C) 23 | return layer 24 | 25 | 26 | def concat_box_prediction_layers(box_regression, box_cls=None, token_logits=None): 27 | box_regression_flattened = [] 28 | box_cls_flattened = [] 29 | token_logit_flattened = [] 30 | 31 | # for each feature level, permute the outputs to make them be in the 32 | # same format as the labels. Note that the labels are computed for 33 | # all feature levels concatenated, so we keep the same representation 34 | # for the objectness and the box_regression 35 | for box_cls_per_level, box_regression_per_level in zip( 36 | box_cls, box_regression 37 | ): 38 | N, AxC, H, W = box_cls_per_level.shape 39 | Ax4 = box_regression_per_level.shape[1] 40 | A = Ax4 // 4 41 | C = AxC // A 42 | box_cls_per_level = permute_and_flatten( 43 | box_cls_per_level, N, A, C, H, W 44 | ) 45 | box_cls_flattened.append(box_cls_per_level) 46 | 47 | box_regression_per_level = permute_and_flatten( 48 | box_regression_per_level, N, A, 4, H, W 49 | ) 50 | box_regression_flattened.append(box_regression_per_level) 51 | 52 | if token_logits is not None: 53 | for token_logit_per_level in token_logits: 54 | N, AXT, H, W = token_logit_per_level.shape 55 | T = AXT // A 56 | token_logit_per_level = permute_and_flatten( 57 | token_logit_per_level, N, A, T, H, W 58 | ) 59 | token_logit_flattened.append(token_logit_per_level) 60 | 61 | # concatenate on the first dimension (representing the feature levels), to 62 | # take into account the way the labels were generated (with all feature maps 63 | # being concatenated as well) 64 | box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) 65 | box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) 66 | 67 | token_logits_stacked = None 68 | if token_logits is not None: 69 | # stacked 70 | token_logits_stacked = cat(token_logit_flattened, dim=1) 71 | 72 | return box_regression, box_cls, token_logits_stacked 73 | 74 | 75 | def round_channels(channels, divisor=8): 76 | rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor) 77 | if float(rounded_channels) < 0.9 * channels: 78 | rounded_channels += divisor 79 | return rounded_channels 80 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | from maskrcnn_benchmark.utils.comm import shared_random_seed 10 | 11 | 12 | class DistributedSampler(Sampler): 13 | """Sampler that restricts data loading to a subset of the dataset. 14 | It is especially useful in conjunction with 15 | :class:`torch.nn.parallel.DistributedDataParallel`. 
In such case, each 16 | process can pass a DistributedSampler instance as a DataLoader sampler, 17 | and load a subset of the original dataset that is exclusive to it. 18 | .. note:: 19 | Dataset is assumed to be of constant size. 20 | Arguments: 21 | dataset: Dataset used for sampling. 22 | num_replicas (optional): Number of processes participating in 23 | distributed training. 24 | rank (optional): Rank of the current process within num_replicas. 25 | """ 26 | 27 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, use_random=False): 28 | if num_replicas is None: 29 | if not dist.is_available(): 30 | raise RuntimeError("Requires distributed package to be available") 31 | num_replicas = dist.get_world_size() 32 | if rank is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | rank = dist.get_rank() 36 | self.dataset = dataset 37 | self.num_replicas = num_replicas 38 | self.rank = rank 39 | self.epoch = 0 40 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 41 | self.total_size = self.num_samples * self.num_replicas 42 | self.shuffle = shuffle 43 | self.use_random = use_random 44 | 45 | def __iter__(self): 46 | if self.shuffle: 47 | # deterministically shuffle based on epoch 48 | _seed = self.epoch 49 | if self.use_random: 50 | _seed = int(shared_random_seed()) 51 | g = torch.Generator() 52 | g.manual_seed(_seed) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class IOULoss(nn.Module): 6 | def __init__(self, loss_type="iou"): 7 | super(IOULoss, self).__init__() 8 | self.loss_type = loss_type 9 | 10 | def forward(self, pred, target, weight=None): 11 | pred_left = pred[:, 0] 12 | pred_top = pred[:, 1] 13 | pred_right = pred[:, 2] 14 | pred_bottom = pred[:, 3] 15 | 16 | target_left = target[:, 0] 17 | target_top = target[:, 1] 18 | target_right = target[:, 2] 19 | target_bottom = target[:, 3] 20 | 21 | target_area = (target_left + target_right) * \ 22 | (target_top + target_bottom) 23 | pred_area = (pred_left + pred_right) * \ 24 | (pred_top + pred_bottom) 25 | 26 | w_intersect = torch.min(pred_left, target_left) + torch.min(pred_right, target_right) 27 | g_w_intersect = torch.max(pred_left, target_left) + torch.max( 28 | pred_right, target_right) 29 | h_intersect = torch.min(pred_bottom, target_bottom) + torch.min(pred_top, target_top) 30 | g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max(pred_top, target_top) 31 | ac_uion = g_w_intersect * g_h_intersect + 1e-7 32 | area_intersect = w_intersect * h_intersect 33 | area_union = target_area + pred_area - area_intersect 34 | ious = (area_intersect + 1.0) / (area_union + 1.0) 35 | gious = 
ious - (ac_uion - area_union) / ac_uion 36 | if self.loss_type == 'iou': 37 | losses = -torch.log(ious) 38 | elif self.loss_type == 'linear_iou': 39 | losses = 1 - ious 40 | elif self.loss_type == 'giou': 41 | losses = 1 - gious 42 | else: 43 | raise NotImplementedError 44 | 45 | if weight is not None and weight.sum() > 0: 46 | return (losses * weight).sum() 47 | else: 48 | assert losses.numel() != 0 49 | return losses.sum() 50 | 51 | 52 | class IOUWHLoss(nn.Module): # used for anchor guiding 53 | def __init__(self, reduction='none'): 54 | super(IOUWHLoss, self).__init__() 55 | self.reduction = reduction 56 | 57 | def forward(self, pred, target): 58 | orig_shape = pred.shape 59 | pred = pred.view(-1, 4) 60 | target = target.view(-1, 4) 61 | target[:, :2] = 0 62 | tl = torch.max((target[:, :2] - pred[:, 2:] / 2), 63 | (target[:, :2] - target[:, 2:] / 2)) 64 | 65 | br = torch.min((target[:, :2] + pred[:, 2:] / 2), 66 | (target[:, :2] + target[:, 2:] / 2)) 67 | 68 | area_p = torch.prod(pred[:, 2:], 1) 69 | area_g = torch.prod(target[:, 2:], 1) 70 | 71 | en = (tl < br).type(tl.type()).prod(dim=1) 72 | area_i = torch.prod(br - tl, 1) * en 73 | U = area_p + area_g - area_i + 1e-16 74 | iou = area_i / U 75 | 76 | loss = 1 - iou ** 2 77 | if self.reduction == 'mean': 78 | loss = loss.mean() 79 | elif self.reduction == 'sum': 80 | loss = loss.sum() 81 | 82 | return loss 83 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from .roi_box_feature_extractors import make_roi_box_feature_extractor 6 | from .roi_box_predictors import make_roi_box_predictor 7 | from .inference import make_roi_box_post_processor 8 | from .loss import make_roi_box_loss_evaluator 9 | from maskrcnn_benchmark.utils.amp import custom_fwd, custom_bwd 10 | 11 | class ROIBoxHead(torch.nn.Module): 12 | """ 13 | Generic Box Head class. 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(ROIBoxHead, self).__init__() 18 | self.feature_extractor = make_roi_box_feature_extractor(cfg) 19 | self.predictor = make_roi_box_predictor(cfg) 20 | self.post_processor = make_roi_box_post_processor(cfg) 21 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 22 | self.onnx = cfg.MODEL.ONNX 23 | 24 | @custom_fwd(cast_inputs=torch.float32) 25 | def forward(self, features, proposals, targets=None): 26 | """ 27 | Arguments: 28 | features (list[Tensor]): feature-maps from possibly several levels 29 | proposals (list[BoxList]): proposal boxes 30 | targets (list[BoxList], optional): the ground-truth targets. 31 | 32 | Returns: 33 | x (Tensor): the result of the feature extractor 34 | proposals (list[BoxList]): during training, the subsampled proposals 35 | are returned. During testing, the predicted boxlists are returned 36 | losses (dict[Tensor]): During training, returns the losses for the 37 | head. During testing, returns an empty dict. 38 | """ 39 | 40 | if self.training: 41 | # Faster R-CNN subsamples during training the proposals with a fixed 42 | # positive / negative ratio 43 | with torch.no_grad(): 44 | proposals = self.loss_evaluator.subsample(proposals, targets) 45 | 46 | # extract features that will be fed to the final classifier. 
The 47 | # feature_extractor generally corresponds to the pooler + heads 48 | x = self.feature_extractor(features, proposals) 49 | # final classifier that converts the features into predictions 50 | class_logits, box_regression = self.predictor(x) 51 | 52 | if self.onnx: 53 | return x, (class_logits, box_regression, [box.bbox for box in proposals]), {} 54 | 55 | if not self.training: 56 | result = self.post_processor((class_logits, box_regression), proposals) 57 | return x, result, {} 58 | 59 | loss_classifier, loss_box_reg = self.loss_evaluator( 60 | [class_logits], [box_regression] 61 | ) 62 | return ( 63 | x, 64 | proposals, 65 | dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), 66 | ) 67 | 68 | 69 | def build_roi_box_head(cfg): 70 | """ 71 | Constructs a new box head. 72 | By default, uses ROIBoxHead; if that turns out not to be enough, register a new class 73 | and expose it as a config parameter. 74 | """ 75 | return ROIBoxHead(cfg) 76 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | try: 6 | from torch.hub import download_url_to_file as _download_url_to_file  # aliased so the call below works on this import path 7 | from torch.hub import urlparse 8 | from torch.hub import HASH_REGEX 9 | except ImportError: 10 | from torch.utils.model_zoo import _download_url_to_file 11 | from torch.utils.model_zoo import urlparse 12 | from torch.utils.model_zoo import HASH_REGEX 13 | 14 | from maskrcnn_benchmark.utils.comm import is_main_process 15 | from maskrcnn_benchmark.utils.comm import synchronize 16 | 17 | 18 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 19 | # but with a few improvements and modifications 20 | def cache_url(url, model_dir='model', progress=True): 21 | r"""Downloads the Torch serialized object at the given URL and returns the 22 | path to the cached file. If the object is already present in `model_dir`, the 23 | cached path is returned without re-downloading. The filename part of the URL should follow the naming convention 24 | ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more 25 | digits of the SHA256 hash of the contents of the file. The hash is used to 26 | ensure unique names and to verify the contents of the file. 27 | If `model_dir` is ``None``, it falls back to ``$TORCH_HOME/models`` where 28 | ``$TORCH_HOME`` defaults to ``~/.torch``; that fallback directory can be 29 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
30 | Args: 31 | url (string): URL of the object to download 32 | model_dir (string, optional): directory in which to save the object 33 | progress (bool, optional): whether or not to display a progress bar to stderr 34 | Example: 35 | >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') 36 | """ 37 | if model_dir is None: 38 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) 39 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) 40 | if not os.path.exists(model_dir): 41 | os.makedirs(model_dir, exist_ok=True) 42 | parts = urlparse(url) 43 | filename = os.path.basename(parts.path) 44 | if filename == "model_final.pkl": 45 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename 46 | # so make the full path the filename by replacing / with _ 47 | filename = parts.path.replace("/", "_") 48 | cached_file = os.path.join(model_dir, filename) 49 | if not os.path.exists(cached_file): 50 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 51 | hash_prefix = HASH_REGEX.search(filename) 52 | if hash_prefix is not None: 53 | hash_prefix = hash_prefix.group(1) 54 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, 55 | # which matches the hash PyTorch uses. So we skip the hash matching 56 | # if the hash_prefix is less than 6 characters 57 | if len(hash_prefix) < 6: 58 | hash_prefix = None 59 | _download_url_to_file(url, cached_file, hash_prefix, progress=progress) 60 | synchronize() 61 | return cached_file 62 | -------------------------------------------------------------------------------- /grounding/maskrcnn_benchmark/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
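# --- Usage sketch (illustrative only) ------------------------------------------
# A minimal example of driving the ROIAlign wrapper defined below, assuming either
# torchvision's roi_align or the compiled _C extension is importable. Boxes are
# passed as (batch_index, x1, y1, x2, y2) in input-image coordinates, and
# spatial_scale maps them onto the feature map (here a stride-8 level).
def _demo_roi_align():
    import torch
    feats = torch.randn(2, 256, 50, 50)            # (N, C, H, W) feature map
    rois = torch.tensor([
        [0.0, 10.0, 10.0, 90.0, 90.0],             # a box on image 0
        [1.0, 40.0, 20.0, 120.0, 160.0],           # a box on image 1
    ])
    pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 8, sampling_ratio=2)
    return pooler(feats, rois)                     # -> tensor of shape (2, 256, 7, 7)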
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | class _ROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ) 43 | return grad_input, None, None, None, None 44 | 45 | try: 46 | import torchvision 47 | from torchvision.ops import roi_align 48 | except: 49 | roi_align = _ROIAlign.apply 50 | 51 | class ROIAlign(nn.Module): 52 | def __init__(self, output_size, spatial_scale, sampling_ratio): 53 | super(ROIAlign, self).__init__() 54 | self.output_size = output_size 55 | self.spatial_scale = spatial_scale 56 | self.sampling_ratio = sampling_ratio 57 | 58 | def forward(self, input, rois): 59 | return roi_align( 60 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 61 | ) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + "(" 65 | tmpstr += "output_size=" + str(self.output_size) 66 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 67 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 68 | tmpstr += ")" 69 | return tmpstr 70 | 71 | class ROIAlignV2(nn.Module): 72 | def __init__(self, output_size, spatial_scale, sampling_ratio): 73 | super(ROIAlignV2, self).__init__() 74 | self.output_size = output_size 75 | self.spatial_scale = spatial_scale 76 | self.sampling_ratio = sampling_ratio 77 | 78 | def forward(self, input, rois): 79 | return torchvision.ops.roi_align( 80 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, aligned=True 81 | ) 82 | 83 | def __repr__(self): 84 | tmpstr = self.__class__.__name__ + "(" 85 | tmpstr += "output_size=" + str(self.output_size) 86 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 87 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 88 | tmpstr += ")" 89 | return tmpstr 90 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | 
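  # Note: DYHEAD below configures the dynamic detection head behind the
  # "VLDYHEAD" RPN_ARCHITECTURE selected above; these values presumably match
  # the glip_a_tiny_o365.pth checkpoint referenced in MODEL.WEIGHT.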
DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_tt.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | 
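      # Note: of the fusion losses configured here, only USE_DOT_PRODUCT_TOKEN_LOSS
      # is switched on in these refcoco configs; the classification, token and
      # contrastive-align losses all stay disabled.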
USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 16 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | 
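      # Note: the CLAMP_* switches bound the fused attention and dot-product
      # logits, presumably to keep training numerically stable under the mixed
      # precision enabled by SOLVER.USE_AMP.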
CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: False 116 | TEXTUAL_PROMPT: False 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: False 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: 
(0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". " 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | # USE_AUTOSTEP: True 101 | USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: False 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 16 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: False 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer_task.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: False 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: False 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_layer_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: False 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_decompose_task_layer_interact.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_val",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | PROMPT_LORA_D: 4 123 | INTERACT_LORA_D: 4 124 | PROMPT_LORA: True 125 | -------------------------------------------------------------------------------- /grounding/configs/refcoco/org/finetune_A_test.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "MODEL/glip_a_tiny_o365.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: False 52 | 53 | TEST: 54 | DURING_TRAINING: True 55 | IMS_PER_BATCH: 1 56 | EVAL_TASK: grounding 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("refexp_train", ) 60 | TEST: ("refexp_testA", "refexp_testB",) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | USE_OVERRIDE_CATEGORY: True 69 | SHUFFLE_SEED: 3 70 | 71 | INPUT: 72 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 73 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 74 | MIN_SIZE_TRAIN: 800 75 | MAX_SIZE_TRAIN: 1333 76 | MIN_SIZE_TEST: 800 77 | MAX_SIZE_TEST: 1333 78 | 79 | AUGMENT: 80 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 81 | 82 | DATALOADER: 83 | SIZE_DIVISIBILITY: 32 84 | 85 | SOLVER: 86 | OPTIMIZER: ADAMW 87 | BASE_LR: 0.0001 88 | LANG_LR: 0.0001 89 | WEIGHT_DECAY: 0.05 90 | STEPS: (0.67, 0.89) 91 | MAX_EPOCH: 30 92 | IMS_PER_BATCH: 32 93 | WARMUP_ITERS: 2000 94 | WARMUP_FACTOR: 0.001 95 | USE_AMP: True 96 | MODEL_EMA: 0.0 97 | FIND_UNUSED_PARAMETERS: True 98 | 99 | TEST_WITH_INFERENCE: True 100 | USE_AUTOSTEP: True 101 | # USE_COSINE: True 102 | 103 | CLIP_GRADIENTS: 104 | ENABLED: True 105 | CLIP_TYPE: "full_model" 106 | CLIP_VALUE: 1.0 107 | NORM_TYPE: 2.0 108 | 109 | SEED: 10 110 | STEP_PATIENCE: 2 111 | AUTO_TERMINATE_PATIENCE: 4 112 | TUNING_HIGHLEVEL_OVERRIDE: language_prompt_v4 113 | 114 | LPAI: 115 | VISUAL_PROMPT: True 116 | TEXTUAL_PROMPT: True 117 | TASK_ALIGNMENT: True 118 | LAYER_ALIGNMENT: True 119 | INTERACT: True 120 | PROMPT_DEPTH: 9 121 | 122 | 123 | PROMPT_LORA_D: 4 124 | INTERACT_LORA_D: 4 125 | PROMPT_LORA: True 126 | --------------------------------------------------------------------------------