├── .gitignore ├── LICENSE ├── README.md ├── README_CLIP.md ├── metadata ├── coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy ├── coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy ├── coco_panoptic_clip_hand_craft_RN50x64.npy ├── coco_panoptic_clip_hand_craft_ViTB16.npy └── coco_panoptic_clip_hand_craft_ViTL14x336.npy ├── ovdet ├── DATA.md ├── INSTALLATION.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_ovd_base.py │ │ │ ├── coco_ovd_base_lsj.py │ │ │ ├── coco_ovd_detic.py │ │ │ ├── coco_ovd_detic_clim.py │ │ │ ├── lvis_v1_ovd_base.py │ │ │ ├── lvis_v1_ovd_base_lsj.py │ │ │ └── lvis_v1_ovd_base_lsj_640.py │ │ ├── iter_based_runtime.py │ │ ├── models │ │ │ ├── faster-rcnn_r50_fpn_syncbn.py │ │ │ └── mask-rcnn_r50_fpn_syncbn.py │ │ └── schedules │ │ │ ├── schedule_180k.py │ │ │ ├── schedule_45k.py │ │ │ └── schedule_90k.py │ ├── clip_based │ │ ├── README.md │ │ ├── openai_rn50x64 │ │ │ └── mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py │ │ └── openai_vitb16 │ │ │ ├── faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py │ │ │ └── mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py │ └── detic │ │ ├── README.md │ │ ├── ov_coco │ │ ├── detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py │ │ ├── detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py │ │ └── faster_rcnn_r50_caffe_c4_90k_ovcoco.py │ │ └── ov_lvis │ │ ├── detic_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py │ │ ├── detic_centernet2_r50_fpn_4x_lvis_boxsup.py │ │ └── detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py ├── data │ └── metadata │ │ ├── coco_clip_hand_craft.npy │ │ ├── coco_openai_vitb16_hand_craft.npy │ │ ├── coco_openai_vitb16_hand_craft_with_background.npy │ │ ├── lvis_openai_rn50x64_hand_craft.npy │ │ ├── lvis_openai_rn50x64_hand_craft_with_background.npy │ │ ├── lvis_openai_vitb16_hand_craft.npy │ │ ├── lvis_openai_vitb16_hand_craft_with_background.npy │ │ ├── lvis_v1_clip_a+cname.npy │ │ ├── lvis_v1_train_cat_info.json │ │ └── lvis_v1_train_cat_norare_info.json ├── ovdet │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── cc3m_lvis_v1.py │ │ ├── coco_caption.py │ │ ├── pipelines │ │ │ ├── __init__.py │ │ │ └── mosaic.py │ │ └── samplers │ │ │ └── multi_source_sampler.py │ ├── methods │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── detic │ │ │ ├── __init__.py │ │ │ ├── detic_caption.py │ │ │ ├── detic_tags.py │ │ │ └── utils.py │ │ └── queues.py │ ├── models │ │ ├── __init__.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── clip_resnet.py │ │ │ └── clip_vit.py │ │ ├── dense_heads │ │ │ ├── __init__.py │ │ │ ├── centernet_rpn_head.py │ │ │ ├── iou_loss.py │ │ │ └── rpn_head.py │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ ├── centernet2.py │ │ │ ├── detic.py │ │ │ ├── fvlm.py │ │ │ └── two_stage.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── cross_entropy_loss.py │ │ │ └── heatmap_focal_loss.py │ │ ├── roi_heads │ │ │ ├── __init__.py │ │ │ ├── detic_bbox_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox_head.py │ │ │ │ ├── detic_bbox_head.py │ │ │ │ └── zero_shot_classifier.py │ │ │ ├── detic_roi_head.py │ │ │ ├── fvlm_bbox_heads │ │ │ │ ├── __init__.py │ │ │ │ └── convfc_bbox_head.py │ │ │ └── standard_roi_head.py │ │ └── vlms │ │ │ ├── __init__.py │ │ │ └── clip │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── common.py │ │ │ ├── image_encoder.py │ │ │ ├── model.py │ │ │ ├── openai_model.py │ │ │ ├── simple_tokenizer.py │ │ │ ├── text_encoder.py │ │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py └── tools 
│ ├── dist_test.sh │ ├── dist_train.sh │ ├── generate_text_embeddings.py │ ├── pre_processors │ ├── keep_coco_base.py │ ├── keep_coco_novel.py │ └── keep_lvis_base.py │ ├── slurm_test.sh │ ├── slurm_train.sh │ ├── test.py │ └── train.py ├── requirements-training.txt ├── requirements.txt ├── scripts ├── test_openai_vitb16_macc_boxes_masks.sh ├── train_clim_cc3m_3e_openai_vitb16.sh └── train_clim_coco_100e_openai_vitb16.sh ├── setup.py ├── src ├── open_clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── coca_model.py │ ├── constants.py │ ├── customs.py │ ├── eva_clip │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── constants.py │ │ ├── eva_vit_model.py │ │ ├── factory.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ └── EVA02-CLIP-bigE-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pretrained.py │ │ ├── rope.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ └── utils.py │ ├── factory.py │ ├── generation_utils.py │ ├── hf_configs.py │ ├── hf_model.py │ ├── loss.py │ ├── model.py │ ├── model_configs │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── RN50x64.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-M-16-alt.json │ │ ├── ViT-M-16.json │ │ ├── ViT-M-32-alt.json │ │ ├── ViT-M-32.json │ │ ├── ViT-S-16-alt.json │ │ ├── ViT-S-16.json │ │ ├── ViT-S-32-alt.json │ │ ├── ViT-S-32.json │ │ ├── ViT-bigG-14.json │ │ ├── ViT-e-14.json │ │ ├── ViT-g-14.json │ │ ├── coca_ViT-B-32.json │ │ ├── coca_ViT-L-14.json │ │ ├── coca_base.json │ │ ├── coca_roberta-ViT-B-32.json │ │ ├── convnext_base.json │ │ ├── convnext_base_w.json │ │ ├── convnext_base_w_320.json │ │ ├── convnext_large.json │ │ ├── convnext_large_d.json │ │ ├── convnext_large_d_320.json │ │ ├── convnext_small.json │ │ ├── convnext_tiny.json │ │ ├── convnext_xlarge.json │ │ ├── convnext_xxlarge.json │ │ ├── convnext_xxlarge_320.json │ │ ├── mt5-base-ViT-B-32.json │ │ ├── mt5-xl-ViT-H-14.json │ │ ├── roberta-ViT-B-32.json │ │ ├── swin_base_patch4_window7_224.json │ │ ├── vit_medium_patch16_gap_256.json │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ └── xlm-roberta-large-ViT-H-14.json │ ├── modified_resnet.py │ ├── openai.py │ ├── pretrained.py │ ├── push_to_hf_hub.py │ ├── timm_model.py │ ├── tokenizer.py │ ├── transform.py │ ├── transformer.py │ ├── utils.py │ └── version.py └── training │ ├── .gitignore │ ├── __init__.py │ ├── clim.py │ ├── coco_api.py │ ├── custom_transforms.py │ ├── data.py │ ├── dist_utils.py │ ├── distributed.py │ ├── file_utils.py │ ├── logger.py │ ├── main.py │ ├── params.py │ ├── precision.py │ ├── profile.py │ ├── region_clip.py │ ├── scheduler.py │ ├── train.py │ ├── utils.py │ └── zero_shot.py └── tools └── generate_text_embeddings.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | logs/ 2 | wandb/ 3 | features/ 4 | results/ 5 | 6 | tests/data/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | sync.sh 138 | gpu1sync.sh 139 | .idea 140 | *.pdf 141 | **/._* 142 | **/*DS_* 143 | **.jsonl 144 | src/sbatch 145 | src/misc 146 | .vscode 147 | src/debug 148 | core.* 149 | 150 | # Allow 151 | !src/evaluation/misc/results_dbs/* 152 | data/coco 153 | data/lvis 154 | checkpoints/ 155 | logs 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | S-Lab License 1.0 2 | 3 | Copyright 2022 S-Lab 4 | 5 | Redistribution and use for non-commercial purpose in source and 6 | binary forms, with or without modification, are permitted provided 7 | that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in 14 | the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | 3. Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | In the event that redistribution and/or use for commercial purpose in 34 | source or binary forms, with or without modification is required, 35 | please contact the contributor(s) of the work. 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLIM: Contrastive Language-Image Mosaic for Region Representation 2 | ## Introduction 3 | 4 | This is an official release of the paper 5 | **CLIM: Contrastive Language-Image Mosaic for Region Representation**. 6 | 7 | > [**CLIM: Contrastive Language-Image Mosaic for Region Representation**](https://arxiv.org/abs/2312.11376), 8 | > Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Wentao Liu, Chen Change Loy 9 | > [BibTeX](https://github.com/wusize/CLIM#citation) 10 | 11 | 12 | ## Application to CLIP 13 | 14 | Please refer to the instructions in this [README](README_CLIP.md). 15 | 16 | ## Application to Detic 17 | Please refer to the instructions in this [README](ovdet/configs/detic/README.md). 18 | 19 | ## License 20 | This project is licensed under [NTU S-Lab License 1.0](LICENSE). 21 | 22 | ## Citation 23 | 24 | ```bibtex 25 | @article{wu2023clim, 26 | title={CLIM: Contrastive Language-Image Mosaic for Region Representation}, 27 | author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Wentao Liu and Chen Change Loy}, 28 | journal={arXiv preprint arXiv:2312.11376}, 29 | year={2023} 30 | } 31 | ``` 32 | 33 | 34 | ## Acknowledgement 35 | 36 | We thank [OpenCLIP](https://github.com/mlfoundations/open_clip/tree/v2.16.0) and [MMDetection](https://github.com/open-mmlab/mmdetection) for their valuable code bases. 37 | -------------------------------------------------------------------------------- /README_CLIP.md: -------------------------------------------------------------------------------- 1 | # Application to CLIP 2 | 3 | ## Installation 4 | The code for applying CLIM to the CLIP model is adapted from [OpenCLIP-v2.16.0](https://github.com/mlfoundations/open_clip/tree/v2.16.0). Run the 5 | following command to install the package: 6 | 7 | ```bash 8 | cd CLIM/ 9 | pip install -e . 
-v 10 | ``` 11 | 12 | ## Data Preparation 13 | The main experiments are conducted using images from [COCO](https://cocodataset.org/#home) and 14 | [CC3M](https://ai.google.com/research/ConceptualCaptions/download). 15 | Please prepare the datasets and organize them as follows: 16 | 17 | ```text 18 | CLIM/ 19 | ├── data 20 | ├── coco 21 | ├── annotations 22 | ├── panoptic_val2017.json 23 | ├── panoptic_val2017 # panoptic masks 24 | ├── wusize 25 | ├── captions_train2017_tags_allcaps.json 26 | ├── train2017 27 | ├── val2017 28 | ├── cc3m 29 | ├── cc3m_captions_train.json 30 | ├── train 31 | ``` 32 | The json file `captions_train2017_tags_allcaps.json` for COCO captions can be obtained from 33 | [GoogleDrive](https://drive.google.com/drive/folders/1O6rt6WN2ePPg6j-wVgF89T7ql2HiuRIG?usp=sharing). 34 | For the CC3M dataset, please download the images using the csv file from the official 35 | [website](https://ai.google.com/research/ConceptualCaptions/download), and then generate the json file 36 | following the COCO format (a minimal conversion sketch is given at the end of this document). The json file `cc3m_captions_train.json` might look like: 37 | 38 | ```json 39 | {"images": 40 | [ 41 | {"id": 1, "file_name": "train/0/0.jpg", "captions": ["a very typical bus station"]}, 42 | {"id": 4, "file_name": "train/3/3.jpg", "captions": ["interior design of modern living room with fireplace in a new house"]} 43 | ] 44 | } 45 | ``` 46 | 47 | ## Run 48 | ### Original Models 49 | To run CLIM, first obtain the original models using these 50 | [links](https://github.com/openai/CLIP/blob/a1d071733d7111c9c014f024669f959182114e33/clip/clip.py#L30), 51 | and put them under 52 | `checkpoints/` as follows: 53 | 54 | ```text 55 | CLIM/ 56 | ├── checkpoints 57 | ├── ViT-B-16.pt 58 | ├── RN50x64.pt 59 | 60 | ``` 61 | 62 | ### Applying CLIM 63 | We provide the [scripts](scripts) to run CLIM. For example, to refine ViT-B/16 on the COCO dataset, simply run: 64 | ```bash 65 | bash scripts/train_clim_coco_100e_openai_vitb16.sh 66 | ``` 67 | We also provide the checkpoints of the models trained by CLIM in 68 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). 69 | 70 | ### Open-Vocabulary Object Detection 71 | 72 | To build open-vocabulary detectors using the models trained by CLIM, 73 | please refer to the instructions in this [README](ovdet/configs/clip_based/README.md). 
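### Generating `cc3m_captions_train.json` (example)

The following is a minimal, unofficial sketch of the conversion described in the Data Preparation section above. It assumes the downloaded CC3M annotation file is a tab-separated file with one `caption<TAB>url` pair per line, and that the image from row `i` was saved as `train/{i}/{i}.jpg` (mirroring the example entries shown earlier); the tsv name and the file-name pattern are assumptions and need to be adapted to the downloader that was actually used.

```python
# Unofficial helper sketch: build the COCO-style caption json described above
# from the CC3M tsv. The tsv name, column order and local file-name pattern are
# assumptions; adjust them to match how the images were downloaded.
import csv
import json
import os


def cc3m_tsv_to_json(tsv_path, image_root, out_path):
    images = []
    with open(tsv_path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for idx, row in enumerate(reader):
            caption = row[0]
            file_name = f'train/{idx}/{idx}.jpg'  # mirrors the example entries above
            if not os.path.exists(os.path.join(image_root, file_name)):
                continue  # skip images that failed to download
            images.append(dict(id=idx + 1, file_name=file_name, captions=[caption]))
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(dict(images=images), f)


if __name__ == '__main__':
    # Example paths following the layout in the Data Preparation section;
    # 'cc3m_train.tsv' is a placeholder for the downloaded annotation file.
    cc3m_tsv_to_json('cc3m_train.tsv', 'data/cc3m', 'data/cc3m/cc3m_captions_train.json')
```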
74 | -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_RN50x64.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_RN50x64.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_ViTB16.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_ViTB16.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_ViTL14x336.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_ViTL14x336.npy -------------------------------------------------------------------------------- /ovdet/DATA.md: -------------------------------------------------------------------------------- 1 | # Data preparation 2 | ## Open-Vocabulary COCO 3 | Prepare data following [MMDetection](https://mmdetection.readthedocs.io/en/latest/user_guides/useful_tools.html#dataset-download). 4 | Obtain the json files for OV-COCO from [GoogleDrive](https://drive.google.com/drive/folders/1O6rt6WN2ePPg6j-wVgF89T7ql2HiuRIG?usp=sharing) and put them 5 | under `data/coco/wusize`. 6 | The data structure looks like: 7 | 8 | ```text 9 | CLIM/ovdet/data 10 | ├── coco 11 | ├── annotations 12 | ├── instances_{train,val}2017.json 13 | ├── wusize 14 | ├── instances_train2017_base.json 15 | ├── instances_val2017_base.json 16 | ├── instances_val2017_novel.json 17 | ├── captions_train2017_tags_allcaps.json 18 | ├── train2017 19 | ├── val2017 20 | ├── test2017 21 | ``` 22 | 23 | 24 | ## Open-Vocabulary LVIS 25 | Prepare data following [MMDetection](https://mmdetection.readthedocs.io/en/latest/user_guides/useful_tools.html#dataset-download). 26 | ```text 27 | CLIM/ovdet/data 28 | ├── lvis_v1 29 | ├── annotations 30 | ├── lvis_v1_val.json 31 | ├── lvis_v1_train.json 32 | ├── wusize 33 | ├── lvis_v1_train_base.json 34 | ├── train2017 35 | ├── val2017 36 | ├── cc3m 37 | ├── annotations 38 | ├── train_image_info_tags.json 39 | ├── images 40 | ``` 41 | We provide the json file `lvis_v1_train_base.json` that only contains annotations of base categories in 42 | [Google Drive](https://drive.google.com/file/d/1ahmCUXyFAQqnlMb-ZDDSQUMnIosYqhu5/view?usp=sharing).
To obtain cc3m, please refer 43 | to [Detic](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md). 44 | -------------------------------------------------------------------------------- /ovdet/INSTALLATION.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | This code for open-vocabulary object detection is based on [MMDetection 3.x](https://github.com/open-mmlab/mmdetection/tree/3.x) 4 | 5 | It requires the following OpenMMLab packages: 6 | 7 | - MMEngine >= 0.6.0 8 | - MMCV-full >= v2.0.0rc4 9 | - MMDetection >= v3.0.0rc6 10 | - lvisapi 11 | 12 | ```bash 13 | pip install openmim mmengine 14 | mim install "mmcv>=2.0.0rc4" 15 | pip install git+https://github.com/lvis-dataset/lvis-api.git 16 | mim install mmdet>=3.0.0rc6 17 | ``` 18 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_base.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | 4 | data_root = 'data/coco/' 5 | 6 | train_dataloader = dict( 7 | sampler=dict(type='InfiniteSampler'), 8 | dataset=dict( 9 | ann_file='wusize/instances_train2017_base.json', 10 | data_prefix=dict(img='train2017/'), 11 | ) 12 | ) 13 | val_evaluator = [ 14 | dict( 15 | type='CocoMetric', 16 | ann_file=data_root + 'wusize/instances_val2017_base.json', 17 | metric='bbox', 18 | prefix='Base', 19 | format_only=False), 20 | dict( 21 | type='CocoMetric', 22 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 23 | metric='bbox', 24 | prefix='Novel', 25 | format_only=False) 26 | ] 27 | test_evaluator = val_evaluator 28 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_base_lsj.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | 4 | data_root = 'data/coco/' 5 | image_size = (640, 640) 6 | 7 | image_backend_args = None 8 | # image_backend_args = dict( 9 | # backend='petrel', 10 | # path_mapping=dict({ 11 | # 'data/coco': 's3://openmmlab/datasets/detection/coco' 12 | # })) 13 | train_pipeline = [ 14 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 15 | dict(type="LoadAnnotations", with_bbox=True, with_mask=False), 16 | dict( 17 | type="RandomResize", 18 | scale=image_size, 19 | ratio_range=(0.1, 2.0), 20 | keep_ratio=True), 21 | dict( 22 | type="RandomCrop", 23 | crop_type='absolute_range', 24 | crop_size=image_size, 25 | recompute_bbox=True, 26 | allow_negative_crop=True), 27 | dict(type="Pad", size=image_size, 28 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 29 | dict(type="FilterAnnotations", min_gt_bbox_wh=(1e-2, 1e-2)), 30 | dict(type="RandomFlip", prob=0.5), 31 | dict(type="PackDetInputs") 32 | ] 33 | 34 | train_dataloader = dict( 35 | dataset=dict( 36 | ann_file='wusize/instances_train2017_base.json', 37 | data_prefix=dict(img='train2017/'), 38 | pipeline=train_pipeline, 39 | ) 40 | ) 41 | 42 | test_pipeline = [ 43 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 44 | dict(type="Resize", scale=image_size, keep_ratio=True), 45 | dict(type="Pad", size=image_size, 46 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 47 | # If you don't have a gt annotation, delete the pipeline 48 | 
dict(type="LoadAnnotations", with_bbox=True), 49 | dict( 50 | type="PackDetInputs", 51 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 52 | 'scale_factor')) 53 | ] 54 | 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = [ 63 | dict( 64 | type='CocoMetric', 65 | ann_file=data_root + 'wusize/instances_val2017_base.json', 66 | metric='bbox', 67 | prefix='Base', 68 | format_only=False), 69 | dict( 70 | type='CocoMetric', 71 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 72 | metric='bbox', 73 | prefix='Novel', 74 | format_only=False) 75 | ] 76 | test_evaluator = val_evaluator 77 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_detic.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | file_client_args = dict(backend='disk') 6 | branch_field = ['det_batch', 'caption_batch'] 7 | det_pipeline = [ 8 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', prob=0.5), 12 | # dict(type='PackDetInputs') 13 | dict(type='MultiBranch', 14 | branch_field=branch_field, 15 | det_batch=dict(type='PackDetInputs')) 16 | ] 17 | 18 | ovd_pipeline = [ 19 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 20 | dict(type='LoadAnnotations', with_bbox=True), 21 | dict(type='Resize', scale=(667, 400), keep_ratio=True), 22 | dict(type='RandomFlip', prob=0.5), 23 | # dict(type='PackDetInputs') 24 | dict(type='MultiBranch', 25 | branch_field=branch_field, 26 | caption_batch=dict(type='PackDetInputs', 27 | meta_keys=['img_id', 'img_path', 'ori_shape', 28 | 'img_shape', 'scale_factor', 29 | 'flip', 'flip_direction', 'captions', 30 | 'tags', 'image_ids'] 31 | ) 32 | ) 33 | ] 34 | det_dataset = dict( 35 | type='CocoDataset', 36 | data_root=data_root, 37 | ann_file='wusize/instances_train2017_base.json', 38 | data_prefix=dict(img='train2017/'), 39 | filter_cfg=dict(filter_empty_gt=True, min_size=32), 40 | pipeline=det_pipeline) 41 | 42 | ovd_dataset = dict( 43 | type='CocoCaptionOVDDataset', 44 | data_root=data_root, 45 | ann_file='wusize/captions_train2017_tags_allcaps.json', 46 | data_prefix=dict(img='train2017/'), 47 | filter_cfg=dict(filter_empty_gt=False), 48 | pipeline=ovd_pipeline 49 | ) 50 | batch_split = [2, 4] 51 | train_dataloader = dict( 52 | batch_size=sum(batch_split), 53 | num_workers=sum(batch_split), 54 | persistent_workers=True, 55 | sampler=dict(type='CustomGroupMultiSourceSampler', 56 | batch_size=sum(batch_split), 57 | source_ratio=batch_split), 58 | batch_sampler=None, 59 | dataset=dict( 60 | _delete_=True, 61 | type='ConcatDataset', 62 | datasets=[det_dataset, ovd_dataset]) 63 | ) 64 | 65 | val_evaluator = [ 66 | dict( 67 | type='CocoMetric', 68 | ann_file=data_root + 'wusize/instances_val2017_base.json', 69 | metric='bbox', 70 | prefix='Base', 71 | format_only=False), 72 | dict( 73 | type='CocoMetric', 74 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 75 | metric='bbox', 76 | prefix='Novel', 77 | format_only=False) 78 | ] 79 | test_evaluator = val_evaluator 80 | -------------------------------------------------------------------------------- 
/ovdet/configs/_base_/datasets/coco_ovd_detic_clim.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | file_client_args = dict(backend='disk') 6 | branch_field = ['det_batch', 'caption_batch', 'mosaic_batch'] 7 | det_pipeline = [ 8 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', prob=0.5), 12 | # dict(type='PackDetInputs') 13 | dict(type='MultiBranch', 14 | branch_field=branch_field, 15 | det_batch=dict(type='PackDetInputs')) 16 | ] 17 | 18 | ovd_pipeline = [ 19 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 20 | dict(type='LoadAnnotations', with_bbox=True), 21 | dict(type='Resize', scale=(667, 400), keep_ratio=True), 22 | dict(type='RandomFlip', prob=0.5), 23 | # dict(type='PackDetInputs') 24 | dict(type='MultiBranch', 25 | branch_field=branch_field, 26 | caption_batch=dict(type='PackDetInputs', 27 | meta_keys=['img_id', 'img_path', 'ori_shape', 28 | 'img_shape', 'scale_factor', 29 | 'flip', 'flip_direction', 'captions', 30 | 'tags', 'image_ids'] 31 | ) 32 | ) 33 | ] 34 | 35 | 36 | mosaic_pipeline = [ 37 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 38 | dict(type='LoadAnnotations', with_bbox=True), 39 | dict(type='Resize', scale=(400, 400), keep_ratio=True), 40 | dict(type='RandomFlip', prob=0.5), 41 | dict(type='MultiChoicesMosaic', 42 | choices=[(2, 2), (3, 3), (4, 4)], 43 | max_cached_images=1024, 44 | img_scale=(400, 400), 45 | pad_val=114.0, 46 | prob=1.0, center_ratio_range=(1.0, 1.0)), 47 | dict(type='Resize', scale=(800, 800), keep_ratio=True), # resize to a fixed value 48 | dict(type='MultiBranch', 49 | branch_field=branch_field, 50 | mosaic_batch=dict(type='PackDetInputs', 51 | meta_keys=['img_id', 'img_path', 'ori_shape', 52 | 'img_shape', 'scale_factor', 53 | 'flip', 'flip_direction', 'captions', 54 | 'tags', 'image_ids']) 55 | ) 56 | ] 57 | 58 | 59 | det_dataset = dict( 60 | type='CocoDataset', 61 | data_root=data_root, 62 | ann_file='wusize/instances_train2017_base.json', 63 | data_prefix=dict(img='train2017/'), 64 | filter_cfg=dict(filter_empty_gt=True, min_size=32), 65 | pipeline=det_pipeline) 66 | 67 | ovd_dataset = dict( 68 | type='CocoCaptionOVDDataset', 69 | data_root=data_root, 70 | ann_file='wusize/captions_train2017_tags_allcaps.json', 71 | data_prefix=dict(img='train2017/'), 72 | filter_cfg=dict(filter_empty_gt=False), 73 | pipeline=ovd_pipeline 74 | ) 75 | 76 | mosaic_dataset = dict( 77 | type='CocoCaptionOVDDataset', 78 | data_root=data_root, 79 | ann_file='wusize/captions_train2017_tags_allcaps.json', 80 | data_prefix=dict(img='train2017/'), 81 | filter_cfg=dict(filter_empty_gt=False), 82 | pipeline=mosaic_pipeline 83 | ) 84 | 85 | 86 | batch_split = [2, 2, 2] 87 | train_dataloader = dict( 88 | batch_size=sum(batch_split), 89 | num_workers=sum(batch_split), 90 | persistent_workers=True, 91 | sampler=dict(type='CustomGroupMultiSourceSampler', 92 | batch_size=sum(batch_split), 93 | source_ratio=batch_split), 94 | batch_sampler=None, 95 | dataset=dict( 96 | _delete_=True, 97 | type='ConcatDataset', 98 | datasets=[det_dataset, ovd_dataset, mosaic_dataset]) 99 | ) 100 | 101 | val_evaluator = [ 102 | dict( 103 | type='CocoMetric', 104 | ann_file=data_root + 
'wusize/instances_val2017_base.json', 105 | metric='bbox', 106 | prefix='Base', 107 | format_only=False), 108 | dict( 109 | type='CocoMetric', 110 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 111 | metric='bbox', 112 | prefix='Novel', 113 | format_only=False) 114 | ] 115 | test_evaluator = val_evaluator 116 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | train_dataloader = dict( 4 | sampler=dict(type='InfiniteSampler'), 5 | dataset=dict( 6 | dataset=dict( 7 | ann_file='wusize/lvis_v1_train_base.json') 8 | ) 9 | ) 10 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base_lsj.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | image_size = (1024, 1024) 4 | 5 | image_backend_args = None 6 | # image_backend_args = dict( 7 | # backend='petrel', 8 | # path_mapping=dict({ 9 | # 'data/lvis_v1/train2017': 's3://openmmlab/datasets/detection/coco/train2017', 10 | # 'data/lvis_v1/val2017': 's3://openmmlab/datasets/detection/coco/val2017' 11 | # }) 12 | # ) 13 | 14 | train_pipeline = [ 15 | dict(type='LoadImageFromFile', backend_args=image_backend_args, to_float32=True), 16 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 17 | dict( 18 | type='RandomResize', 19 | scale=image_size, 20 | ratio_range=(0.1, 2.0), 21 | keep_ratio=True), 22 | dict( 23 | type='RandomCrop', 24 | crop_type='absolute_range', 25 | crop_size=image_size, 26 | recompute_bbox=True, 27 | allow_negative_crop=True), 28 | dict(type="Pad", size=image_size, 29 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 30 | dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), 31 | dict(type='RandomFlip', prob=0.5), 32 | dict(type='PackDetInputs') 33 | ] 34 | train_dataloader = dict( 35 | dataset=dict( 36 | dataset=dict( 37 | ann_file='wusize/lvis_v1_train_base.json', 38 | pipeline=train_pipeline,) 39 | ) 40 | ) 41 | 42 | 43 | test_pipeline = [ 44 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 45 | dict(type="Resize", scale=image_size, keep_ratio=True), 46 | dict(type="Pad", size=image_size, 47 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 48 | # If you don't have a gt annotation, delete the pipeline 49 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 50 | dict( 51 | type="PackDetInputs", 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor')) 54 | ] 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = dict(metric=['segm']) 63 | test_evaluator = val_evaluator 64 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base_lsj_640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | image_size = (640, 640) 4 | 5 | image_backend_args = None 6 | # image_backend_args = dict( 7 | # backend='petrel', 8 | # path_mapping=dict({ 9 | # 'data/lvis_v1/train2017': 
's3://openmmlab/datasets/detection/coco/train2017', 10 | # 'data/lvis_v1/val2017': 's3://openmmlab/datasets/detection/coco/val2017' 11 | # }) 12 | # ) 13 | 14 | train_pipeline = [ 15 | dict(type='LoadImageFromFile', backend_args=image_backend_args, to_float32=True), 16 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 17 | dict( 18 | type='RandomResize', 19 | scale=image_size, 20 | ratio_range=(0.1, 2.0), 21 | keep_ratio=True), 22 | dict( 23 | type='RandomCrop', 24 | crop_type='absolute_range', 25 | crop_size=image_size, 26 | recompute_bbox=True, 27 | allow_negative_crop=True), 28 | dict(type="Pad", size=image_size, 29 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 30 | dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), 31 | dict(type='RandomFlip', prob=0.5), 32 | dict(type='PackDetInputs') 33 | ] 34 | train_dataloader = dict( 35 | dataset=dict( 36 | dataset=dict( 37 | ann_file='wusize/lvis_v1_train_base.json', 38 | pipeline=train_pipeline,) 39 | ) 40 | ) 41 | 42 | 43 | test_pipeline = [ 44 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 45 | dict(type="Resize", scale=image_size, keep_ratio=True), 46 | dict(type="Pad", size=image_size, 47 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 48 | # If you don't have a gt annotation, delete the pipeline 49 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 50 | dict( 51 | type="PackDetInputs", 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor')) 54 | ] 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = dict(metric=['segm']) 63 | test_evaluator = val_evaluator 64 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/iter_based_runtime.py: -------------------------------------------------------------------------------- 1 | _base_ = 'mmdet::_base_/default_runtime.py' 2 | default_hooks = dict( 3 | # logger=dict(type='LoggerHook', interval=5), 4 | checkpoint=dict(type='CheckpointHook', by_epoch=False, max_keep_ckpts=1, interval=5000) 5 | ) 6 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) 7 | find_unused_parameters = True 8 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/models/faster-rcnn_r50_fpn_syncbn.py: -------------------------------------------------------------------------------- 1 | _base_ = 'mmdet::_base_/models/faster-rcnn_r50_fpn.py' 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | # model settings 4 | model = dict( 5 | backbone=dict( 6 | frozen_stages=-1, 7 | norm_cfg=norm_cfg, 8 | norm_eval=False, 9 | init_cfg=None), 10 | neck=dict( 11 | norm_cfg=norm_cfg,), 12 | roi_head=dict( 13 | bbox_head=dict( 14 | type='Shared4Conv1FCBBoxHead', 15 | norm_cfg=dict(type='BN', requires_grad=False), # freeze the bn at bbox head 16 | norm_eval=True, 17 | num_classes=80, 18 | reg_class_agnostic=True, 19 | loss_cls=dict( 20 | type='CustomCrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 21 | ) 22 | ), 23 | # model training and testing settings 24 | test_cfg=dict( 25 | rcnn=dict( 26 | score_thr=0.05,) 27 | ) 28 | ) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/models/mask-rcnn_r50_fpn_syncbn.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'mmdet::_base_/models/mask-rcnn_r50_fpn.py' 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | # model settings 4 | model = dict( 5 | backbone=dict( 6 | frozen_stages=-1, 7 | norm_cfg=norm_cfg, 8 | norm_eval=False, 9 | init_cfg=None), 10 | neck=dict( 11 | norm_cfg=norm_cfg,), 12 | roi_head=dict( 13 | bbox_head=dict( 14 | type='Shared4Conv1FCBBoxHead', 15 | norm_cfg=norm_cfg, 16 | norm_eval=False, 17 | num_classes=80, 18 | reg_class_agnostic=True, 19 | loss_cls=dict( 20 | type='CustomCrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 21 | ), 22 | mask_head=dict( 23 | norm_cfg=norm_cfg, 24 | class_agnostic=True, 25 | ) 26 | ), 27 | # model training and testing settings 28 | test_cfg=dict( 29 | rcnn=dict( 30 | score_thr=0.05,) 31 | ) 32 | ) 33 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_180k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=180000, val_interval=30000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.0002, by_epoch=False, begin=0, end=5000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=180000, 14 | by_epoch=False, 15 | milestones=[120000, 160000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_45k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=45000, val_interval=5000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=1000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=45000, 14 | by_epoch=False, 15 | milestones=[30000, 40000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_90k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=10000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=1000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=90000, 14 | by_epoch=False, 15 | milestones=[60000, 80000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | First, please make sure the modified OpenCLIP has been installed as follows: 3 | ``` 4 | cd CLIM 5 | pip install -e . -v 6 | ``` 7 | Then please refer to this [README](../../INSTALLATION.md) to install the detector. 8 | 9 | ## Data Preparation 10 | Please refer to this [README](../../DATA.md). 11 | 12 | 13 | ## Usage 14 | ### Obtain Checkpoints 15 | We provide checkpoints of models that were trained by CLIM in 16 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). Put them under 17 | `CLIM/ovdet/checkpoints`. 18 | 19 | ### Training 20 | Take ViT-B/16 on OV-COCO as an example; run the following to train the detector: 21 | 22 | ``` 23 | cd CLIM/ovdet 24 | bash tools/dist_train.sh \ 25 | configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py 8 \ 26 | --work-dir your/output/directory/ovdet_openai_vitb16_ov_coco_clim 27 | ``` 28 | 29 | ### Testing 30 | We also provide the following checkpoints of the trained detectors in 31 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). Download and 32 | put them under `CLIM/ovdet/checkpoints`. 33 | 34 | Note: the released code for the ViT-based detector achieves better results than those we initially reported 35 | in the paper. 
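Before launching the commands in this README, it can help to confirm that the expected checkpoint files are actually in place. The sketch below is only a convenience check, not part of the official tooling; the file names are taken from the `pretrained` paths in the configs in this folder and from the test command further below, so adjust the list to the model being run.

```python
# Unofficial sanity-check sketch: verify that the checkpoints referenced by the
# configs in this folder exist under CLIM/ovdet/checkpoints. The file names are
# taken from the configs and commands in this README; edit the list as needed.
from pathlib import Path

expected = [
    'checkpoints/openai_vitb16_coco_clim.pt',            # CLIM ViT-B/16 backbone for OV-COCO
    'checkpoints/openai_vitb16_cc3m_clim.pt',            # CLIM ViT-B/16 backbone for OV-LVIS
    'checkpoints/openai_rn50x64_cc3m_clim.pt',           # CLIM RN50x64 backbone for OV-LVIS
    'checkpoints/ovdet_openai_vitb16_ov_coco_clim.pth',  # trained detector (testing only)
]
for path in expected:
    status = 'found' if Path(path).is_file() else 'MISSING'
    print(f'{status:8s}{path}')
```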
36 | 37 | | OV-COCO | Backbone | Novel AP50 | Config | Download | 38 | |:---------:|:--------:|:----------:|:-----------------------------------------------------------------------------:|:---------:| 39 | | Paper | ViT-B/16 | 25.7 | - | - | 40 | | This Repo | ViT-B/16 | 29.7 | [config](openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py) | [model](https://drive.google.com/file/d/1lOKpb2EiC2rcgsX9GeXUhVN1QnyUTZSM/view?usp=sharing) | 41 | 42 | | OV-LVIS | Backbone | Mask APr | Config | Download | 43 | |:---------:|:--------:|:--------:|:---------------------------------------------------------------------------------:|:---------:| 44 | | Paper | ViT-B/16 | 20.8 | - | - | 45 | | This Repo | ViT-B/16 | 24.3 | [config](openai_vitb16/mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py) | [model](https://drive.google.com/file/d/1rLEp2cL8rH0rvFduxaOG6m_Z9-s_qMwQ/view?usp=sharing) | 46 | | Paper | RN50x64 | 32.3 | - | - | 47 | | This Repo | RN50x64 | 32.4 | [config](openai_rn50x64/mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py) | [model](https://drive.google.com/file/d/1LjJo4p3vaLKoy1Vp08kt_Xg08dLdgbo5/view?usp=sharing) | 48 | 49 | Take ViT-B/16 on OV-COCO as example, run the following script to test the detector 50 | 51 | ``` 52 | cd CLIM/ovdet 53 | bash tools/dist_test.sh \ 54 | configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py \ 55 | checkpoints/ovdet_openai_vitb16_ov_coco_clim.pth \ 56 | 8 --work-dir your/output/directory/ovdet_openai_vitb16_ov_coco_clim 57 | ``` 58 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_rn50x64/mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/mask-rcnn_r50_fpn.py', 2 | '../../_base_/iter_based_runtime.py', 3 | '../../_base_/datasets/lvis_v1_ovd_base_lsj.py'] 4 | find_unused_parameters = True 5 | class_weight = 'data/metadata/lvis_v1_train_cat_norare_info.json' 6 | norm_cfg = dict(type='SyncBN', requires_grad=True) 7 | model = dict( 8 | type='FVLM', 9 | data_preprocessor=dict( 10 | mean=[122.7709383, 116.7460125, 104.09373615], 11 | std=[68.5005327, 66.6321579, 70.32316305]), 12 | backbone=dict( 13 | type='CLIPResNet', 14 | _delete_=True, 15 | model_name='RN50x64', 16 | cache_dir='checkpoints', 17 | pretrained='checkpoints/openai_rn50x64_cc3m_clim.pt', 18 | roi_extractor=dict( 19 | type='SingleRoIExtractor', 20 | roi_layer=dict(type='RoIAlign', output_size=14, 21 | sampling_ratio=0, use_torchvision=True), 22 | out_channels=4096, 23 | featmap_strides=[32]), 24 | ), 25 | neck=dict( 26 | in_channels=[512, 1024, 2048, 4096], 27 | norm_cfg=norm_cfg 28 | ), 29 | rpn_head=dict( 30 | type='CustomRPNHead', 31 | num_convs=2, 32 | norm_cfg=norm_cfg 33 | ), 34 | roi_head=dict( 35 | type='FVLMStandardRoIHead', 36 | bbox_head=dict( 37 | type='FVLMConvFCBBoxHead', 38 | num_shared_convs=4, 39 | num_shared_fcs=2, 40 | num_cls_fcs=1, 41 | num_reg_fcs=1, 42 | reg_class_agnostic=True, 43 | num_classes=1203, 44 | norm_cfg=norm_cfg, 45 | alpha=0.35, 46 | beta=0.65, 47 | clip_temp=50.0, 48 | cls_temp=50.0, 49 | learn_cls_temp=True, 50 | cls_embeddings_path="data/metadata/lvis_openai_rn50x64_hand_craft.npy", 51 | bg_embedding='learn', 52 | loss_cls=dict( 53 | type='CustomCrossEntropyLoss', 54 | use_sigmoid=False, 55 | class_weight=class_weight, 56 | bg_weight=0.9, 57 | ), 58 | ), 59 | mask_head=dict( 60 | norm_cfg=norm_cfg, 
class_agnostic=True, num_classes=1203) 61 | ), 62 | test_cfg=dict( 63 | rpn=dict(nms_pre=2000), 64 | rcnn=dict( 65 | score_thr=0.0001, 66 | nms=dict(type='nms', iou_threshold=0.5), 67 | max_per_img=300) 68 | ) 69 | ) 70 | 71 | default_hooks = dict( 72 | checkpoint=dict(interval=2880//2) 73 | ) 74 | 75 | # training schedule for 2.88k 76 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=2880, val_interval=2880) 77 | val_cfg = dict(type='ValLoop') 78 | test_cfg = dict(type='TestLoop') 79 | 80 | # learning rate 81 | param_scheduler = [ 82 | dict( 83 | type='LinearLR', start_factor=0.009, by_epoch=False, begin=0, end=250), 84 | dict( 85 | type='MultiStepLR', 86 | begin=0, 87 | end=2880, 88 | by_epoch=False, 89 | milestones=[2304, 2592, 2736], 90 | gamma=0.1) 91 | ] 92 | # optimizer 93 | optim_wrapper = dict( 94 | type='AmpOptimWrapper', 95 | optimizer=dict(type='SGD', lr=0.36, momentum=0.9, weight_decay=0.0001), 96 | clip_grad=dict(max_norm=1.0, norm_type=2), 97 | ) 98 | 99 | # Default setting for scaling LR automatically 100 | # - `enable` means enable scaling LR automatically 101 | # or not by default. 102 | # - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 103 | auto_scale_lr = dict(enable=True, base_batch_size=256) 104 | train_dataloader = dict( 105 | batch_size=32, 106 | num_workers=4, 107 | sampler=dict(type='InfiniteSampler'), 108 | ) 109 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/faster-rcnn_r50_fpn.py', 2 | 'mmdet::_base_/default_runtime.py', 3 | '../../_base_/datasets/coco_ovd_base_lsj.py'] 4 | find_unused_parameters = True 5 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 6 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 7 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 8 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 9 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 10 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 11 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 12 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0.6] 13 | # invalid_classes = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14 | # 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 15 | # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 | # 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 17 | # 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 | # 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 19 | # 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 20 | # 0, 0, 0, 0, 0, 0, 0, 1, 1, 0] + [0] 21 | invalid_classes = None 22 | 23 | norm_cfg = dict(type='SyncBN', requires_grad=True) 24 | model = dict( 25 | type='FVLM', 26 | data_preprocessor=dict( 27 | mean=[122.7709383, 116.7460125, 104.09373615], 28 | std=[68.5005327, 66.6321579, 70.32316305]), 29 | backbone=dict( 30 | _delete_=True, 31 | type='CLIPViT', 32 | model_name='ViT-B-16', 33 | cache_dir='checkpoints', 34 | pretrained='checkpoints/openai_vitb16_coco_clim.pt', 35 | out_indices=[3, 5, 7, 11], 36 | roi_extractor=dict( 37 | type='SingleRoIExtractor', 38 | roi_layer=dict(type='RoIAlign', output_size=1, 39 | sampling_ratio=0, use_torchvision=True), 40 | out_channels=512, 41 | featmap_strides=[16]), 42 | norm_cfg=norm_cfg 43 | ), 44 | neck=dict( 45 | in_channels=[768, 768, 768, 768], 46 | norm_cfg=norm_cfg 47 | ), 48 | rpn_head=dict(num_convs=2), 49 | roi_head=dict( 50 | type='FVLMStandardRoIHead', 51 | bbox_head=dict( 52 | type='FVLMConvFCBBoxHead', 53 | num_shared_convs=4, 54 | num_shared_fcs=2, 55 | num_cls_fcs=1, 56 | num_reg_fcs=1, 57 | reg_class_agnostic=True, 58 | norm_cfg=norm_cfg, 59 | alpha=0.1, 60 | beta=0.8, 61 | 
clip_temp=75.0, 62 | cls_temp=50.0, 63 | invalid_classes=invalid_classes, 64 | learn_cls_temp=True, 65 | cls_embeddings_path="data/metadata/coco_openai_vitb16_hand_craft.npy", 66 | bg_embedding='learn', 67 | loss_cls=dict( 68 | type='CustomCrossEntropyLoss', 69 | use_sigmoid=False, 70 | class_weight=class_weight 71 | ), 72 | ) 73 | ), 74 | test_cfg=dict( 75 | rcnn=dict( 76 | score_thr=0.01, 77 | nms=dict(type='nms', iou_threshold=0.4), 78 | max_per_img=100) 79 | ) 80 | ) 81 | 82 | 83 | # training schedule for 3e 84 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1) 85 | val_cfg = dict(type='ValLoop') 86 | test_cfg = dict(type='TestLoop') 87 | 88 | # learning rate 89 | param_scheduler = [ 90 | dict( 91 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=250), 92 | dict( 93 | type='MultiStepLR', 94 | begin=0, 95 | end=3, 96 | by_epoch=True, 97 | milestones=[100, ], 98 | gamma=0.1) 99 | ] 100 | 101 | # optimizer 102 | optim_wrapper = dict( 103 | type='AmpOptimWrapper', 104 | optimizer=dict(type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.1), 105 | clip_grad=dict(max_norm=1.0, norm_type=2), 106 | ) 107 | 108 | # Default setting for scaling LR automatically 109 | # - `enable` means enable scaling LR automatically 110 | # or not by default. 111 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 112 | auto_scale_lr = dict(enable=True, base_batch_size=64) 113 | train_dataloader = dict( 114 | batch_size=8, 115 | num_workers=4 116 | ) 117 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_vitb16/mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/mask-rcnn_r50_fpn.py', 2 | 'mmdet::_base_/default_runtime.py', 3 | '../../_base_/datasets/lvis_v1_ovd_base_lsj_640.py'] 4 | find_unused_parameters = True 5 | class_weight = 'data/metadata/lvis_v1_train_cat_norare_info.json' 6 | norm_cfg = dict(type='SyncBN', requires_grad=True) 7 | model = dict( 8 | type='FVLM', 9 | data_preprocessor=dict( 10 | mean=[122.7709383, 116.7460125, 104.09373615], 11 | std=[68.5005327, 66.6321579, 70.32316305]), 12 | backbone=dict( 13 | type='CLIPViT', 14 | _delete_=True, 15 | model_name='ViT-B-16', 16 | cache_dir='checkpoints', 17 | out_indices=[3, 5, 7, 11], 18 | pretrained='checkpoints/openai_vitb16_cc3m_clim.pt', 19 | roi_extractor=dict( 20 | type='SingleRoIExtractor', 21 | roi_layer=dict(type='RoIAlign', output_size=1, 22 | sampling_ratio=0, use_torchvision=True), 23 | out_channels=512, 24 | featmap_strides=[16]), 25 | norm_cfg=norm_cfg 26 | ), 27 | neck=dict( 28 | type='NASFPN', 29 | stack_times=7, 30 | in_channels=[768, 768, 768, 768], 31 | norm_cfg=norm_cfg 32 | ), 33 | rpn_head=dict(num_convs=2), 34 | roi_head=dict( 35 | type='FVLMStandardRoIHead', 36 | bbox_head=dict( 37 | type='FVLMConvFCBBoxHead', 38 | num_shared_convs=4, 39 | num_shared_fcs=2, 40 | num_cls_fcs=1, 41 | num_reg_fcs=1, 42 | reg_class_agnostic=True, 43 | num_classes=1203, 44 | norm_cfg=norm_cfg, 45 | alpha=0.35, 46 | beta=0.65, 47 | clip_temp=50.0, 48 | cls_temp=50.0, 49 | learn_cls_temp=True, 50 | cls_embeddings_path="data/metadata/lvis_openai_vitb16_hand_craft.npy", 51 | bg_embedding='learn', 52 | loss_cls=dict( 53 | type='CustomCrossEntropyLoss', 54 | use_sigmoid=False, 55 | class_weight=class_weight, 56 | bg_weight=0.9, 57 | ), 58 | ), 59 | mask_head=dict( 60 | norm_cfg=norm_cfg, class_agnostic=True, 
num_classes=1203) 61 | ), 62 | test_cfg=dict( 63 | rpn=dict(nms_pre=2000), 64 | rcnn=dict( 65 | score_thr=0.0001, 66 | nms=dict(type='nms', iou_threshold=0.5), 67 | max_per_img=300) 68 | ) 69 | ) 70 | 71 | default_hooks = dict( 72 | checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1, interval=1) 73 | ) 74 | 75 | # training schedule for 4x 76 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=48, val_interval=12) 77 | val_cfg = dict(type='ValLoop') 78 | test_cfg = dict(type='TestLoop') 79 | 80 | # learning rate 81 | param_scheduler = [ 82 | dict( 83 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=250), 84 | dict( 85 | type='MultiStepLR', 86 | begin=0, 87 | end=48, 88 | by_epoch=True, 89 | milestones=[32, 44], 90 | gamma=0.1) 91 | ] 92 | 93 | # optimizer 94 | optim_wrapper = dict( 95 | type='AmpOptimWrapper', 96 | optimizer=dict( 97 | type='AdamW', lr=0.0004, betas=(0.9, 0.999), weight_decay=0.05), 98 | clip_grad=dict(max_norm=35, norm_type=2), 99 | ) 100 | 101 | # Default setting for scaling LR automatically 102 | # - `enable` means enable scaling LR automatically 103 | # or not by default. 104 | # - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 105 | auto_scale_lr = dict(enable=True, base_batch_size=8*16) 106 | train_dataloader = dict( 107 | batch_size=16, 108 | num_workers=4 109 | ) 110 | -------------------------------------------------------------------------------- /ovdet/configs/detic/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | Please refer to this [README](../../INSTALLATION.md). 3 | ## Data Preparation 4 | Please refer to this [README](../../DATA.md). 5 | 6 | ## Usage 7 | ### Obtain CLIP Checkpoints 8 | We use CLIP's text encoder (ViT-B/32) for Detic. Obtain the state_dict 9 | of the model from [GoogleDrive](https://drive.google.com/file/d/1ilxBhjb3JXNDar8lKRQ9GA4hTmjxADfu/view?usp=sharing) and put it under `checkpoints`. 10 | ### OV-COCO 11 | #### Training 12 | 1. To pre-train the detector only on the detection data of base categories, run 13 | 14 | ``` 15 | cd CLIM/ovdet 16 | bash tools/dist_train.sh \ 17 | configs/detic/ov_coco/faster_rcnn_r50_caffe_c4_90k_ovcoco.py 8 \ 18 | --work-dir your/output/directory/detic_coco_base 19 | ``` 20 | Rename the checkpoint of the trained model as `detic_coco_base.pth` and put it under `checkpoints`. 21 | We also provide this checkpoint `detic_coco_base.pth` 22 | in [Google Drive](https://drive.google.com/file/d/1ZzR6aI-AnvSygUcJ7Ny8jOlY4v8Id7MO/view?usp=sharing). 23 | 24 | 2.1 To fine-tune the detector with caption data (no tags), run 25 | 26 | ``` 27 | cd CLIM/ovdet 28 | bash tools/dist_train.sh \ 29 | configs/detic/ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py 8 \ 30 | --work-dir your/output/directory/detic_coco_cap_no_tags_clim 31 | ``` 32 | 2.2 To fine-tune the detector using caption loss and image tag loss, run 33 | 34 | ``` 35 | cd CLIM/ovdet 36 | bash tools/dist_train.sh \ 37 | configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py 8 \ 38 | --work-dir your/output/directory/detic_coco_cap_w_tags_clim 39 | ``` 40 | 41 | 42 | #### Testing 43 | We have provided the following checkpoints in [Google Drive](https://drive.google.com/drive/folders/1f-AkMXFgDIfRMezUbVSc_BC0tr5AjRJ4?usp=sharing). 
44 | 45 | 46 | 47 | 48 | | OV-COCO | Losses | Novel AP50 | Config | Download | 49 | |:-------:|:-------------:|:----------:|:--------------------------------------------------------------------:|:---------:| 50 | | 1 | Caption | 32.3 | [config](ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py) | [model](https://drive.google.com/file/d/1TRr7Bz_EF40kUYa61cIGpScYoY8Yv7Cs/view?usp=sharing) | 51 | | 2 | Caption & Tag | 35.4 | [config](ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py) | [model](https://drive.google.com/file/d/1MQyHN7i_BP9D9S7vi213Tysnrdj7eGdG/view?usp=sharing) | 52 | 53 | 54 | 55 | For example, to evaluate the model trained with caption loss and tag loss, run 56 | 57 | ``` 58 | cd CLIM/ovdet 59 | bash tools/dist_test.sh \ 60 | configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py \ 61 | checkpoints/detic_coco_cap_w_tags_clim.pth \ 62 | 8 --work-dir your/output/directory/detic_coco_cap_w_tags_clim 63 | ``` 64 | 65 | ### OV-LVIS 66 | 67 | #### Training 68 | First obtain the 69 | [checkpoint](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638-c1685ee2.pth) 70 | trained on base categories and put it under `checkpoints/`. Then run 71 | 72 | ``` 73 | cd CLIM/ovdet 74 | bash tools/dist_train.sh \ 75 | configs/detic/ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py 8 \ 76 | --work-dir your/output/directory/detic_lvis_cap_w_tags_clim 77 | ``` 78 | 79 | #### Testing 80 | We have provided the following checkpoint. 81 | 82 | | OV-LVIS | Losses | mask APr | Config | Download | 83 | |:-------:|:-------------:|:--------:|:-------------------------------------------------------------------------:|:-----------:| 84 | | 1 | Caption & Tag | 21.8 | [config](ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py) | [model](https://drive.google.com/drive/folders/1Y_3T9jo86rJGc6AnjOoXzrNYbx63pBj-?usp=sharing) | 85 | 86 | 87 | For example, to evaluate the model trained on LVIS-base and CC3M, run 88 | 89 | ``` 90 | cd CLIM/ovdet 91 | bash tools/dist_test.sh \ 92 | configs/detic/ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py \ 93 | patch/to/the/checkpoint.pth \ 94 | 8 --work-dir your/output/directory/detic_lvis_cap_w_tags_clim 95 | ``` 96 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_detic_clim.py', 4 | '../../_base_/schedules/schedule_45k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 8 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 9 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 10 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 11 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 12 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 13 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 14 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 15 | 16 | reg_layer = [ 17 | dict(type='Linear', in_features=2048, out_features=2048), 18 | dict(type='ReLU', inplace=True), 19 | dict(type='Linear', in_features=2048, out_features=4) 20 | ] 21 | 22 | clip_cfg = dict( # ViT-B/32 23 | type='CLIP', 24 | image_encoder=None, 25 | text_encoder=dict( 26 | type='CLIPTextEncoder', 27 | embed_dim=512, 28 | context_length=77, 29 | vocab_size=49408, 30 | transformer_width=512, # also the word embedding 
dim 31 | transformer_heads=8, 32 | transformer_layers=12, 33 | init_cfg=dict( 34 | type='Pretrained', 35 | checkpoint='checkpoints/clip_vitb32.pth') 36 | ) 37 | ) 38 | 39 | model = dict( 40 | type='OVDTwoStageDetector', 41 | data_preprocessor=dict( 42 | type='MultiBranchDataPreprocessor', 43 | _delete_=True, 44 | data_preprocessor=dict( 45 | type='DetDataPreprocessor', 46 | mean=[103.530, 116.280, 123.675], 47 | std=[1.0, 1.0, 1.0], 48 | bgr_to_rgb=False, 49 | pad_size_divisor=32 50 | ), 51 | ), 52 | rpn_head=dict( 53 | type='CustomRPNHead', 54 | anchor_generator=dict( 55 | scale_major=False, # align with detectron2 56 | ) 57 | ), 58 | backbone=dict(init_cfg=None), 59 | batch2ovd=dict(caption_batch=['detic_caption'], 60 | mosaic_batch=['detic_caption']), 61 | roi_head=dict( 62 | type='OVDStandardRoIHead', 63 | shared_head=dict(init_cfg=None), 64 | clip_cfg=clip_cfg, 65 | ovd_cfg=dict(detic_caption=dict(type='DeticCaptionWithComposition', 66 | base_batch_size=4, 67 | bce_bias=-20.0, norm_temp=25.0, caption_weight=0.1, 68 | max_caps=1, 69 | queue_cfg=dict(lengths=[256], id_length=16, 70 | names=['clip_caption_features']), 71 | cap_neg_weight=0.125), 72 | ), 73 | bbox_head=dict( 74 | type='DeticBBoxHead', 75 | reg_predictor_cfg=reg_layer, 76 | reg_class_agnostic=True, 77 | cls_bias=-20.0, 78 | cls_temp=25.0, 79 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 80 | loss_cls=dict( 81 | type='CustomCrossEntropyLoss', 82 | use_sigmoid=True, 83 | class_weight=class_weight), 84 | ), 85 | ), 86 | ) 87 | 88 | # optimizer 89 | optim_wrapper = dict( 90 | type='AmpOptimWrapper', # amp training 91 | clip_grad=dict(max_norm=35, norm_type=2), 92 | ) 93 | # load_from = 'work_dirs/detic_base/iter_90000.pth' 94 | load_from = 'checkpoints/detic_coco_base.pth' 95 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_detic_clim.py', 4 | '../../_base_/schedules/schedule_45k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 8 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 9 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 10 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 11 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 12 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 13 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 14 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 15 | 16 | reg_layer = [ 17 | dict(type='Linear', in_features=2048, out_features=2048), 18 | dict(type='ReLU', inplace=True), 19 | dict(type='Linear', in_features=2048, out_features=4) 20 | ] 21 | 22 | clip_cfg = dict( # ViT-B/32 23 | type='CLIP', 24 | image_encoder=None, 25 | text_encoder=dict( 26 | type='CLIPTextEncoder', 27 | embed_dim=512, 28 | context_length=77, 29 | vocab_size=49408, 30 | transformer_width=512, # also the word embedding dim 31 | transformer_heads=8, 32 | transformer_layers=12, 33 | init_cfg=dict( 34 | type='Pretrained', 35 | checkpoint='checkpoints/clip_vitb32.pth') 36 | ) 37 | ) 38 | 39 | model = dict( 40 | type='OVDTwoStageDetector', 41 | data_preprocessor=dict( 42 | type='MultiBranchDataPreprocessor', 43 | _delete_=True, 44 | data_preprocessor=dict( 45 | type='DetDataPreprocessor', 46 | mean=[103.530, 116.280, 123.675], 47 | std=[1.0, 1.0, 1.0], 48 | bgr_to_rgb=False, 49 | pad_size_divisor=32 50 | ), 51 | ), 52 | rpn_head=dict( 53 | type='CustomRPNHead', 54 | 
anchor_generator=dict( 55 | scale_major=False, # align with detectron2 56 | ) 57 | ), 58 | backbone=dict(init_cfg=None), 59 | batch2ovd=dict(caption_batch=['detic_tags', 'detic_caption'], 60 | mosaic_batch=['detic_tags', 'detic_caption']), 61 | roi_head=dict( 62 | type='OVDStandardRoIHead', 63 | shared_head=dict(init_cfg=None), 64 | clip_cfg=clip_cfg, 65 | ovd_cfg=dict(detic_caption=dict(type='DeticCaptionWithComposition', 66 | base_batch_size=4, 67 | bce_bias=-20.0, norm_temp=25.0, caption_weight=0.1, 68 | max_caps=1, 69 | queue_cfg=dict(lengths=[256], id_length=16, 70 | names=['clip_caption_features']), 71 | cap_neg_weight=0.125), 72 | detic_tags=dict(type='DeticTagsWithComposition', 73 | tag_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 74 | sampling_cfg=dict(topk=128, iof_thr=0.3), 75 | base_batch_size=None, 76 | bce_bias=-20.0, norm_temp=25.0, tag_weight=0.1 / 3, 77 | tag_neg_weight=1.0 78 | ) 79 | ), 80 | bbox_head=dict( 81 | type='DeticBBoxHead', 82 | reg_predictor_cfg=reg_layer, 83 | reg_class_agnostic=True, 84 | cls_bias=-20.0, 85 | cls_temp=25.0, 86 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 87 | loss_cls=dict( 88 | type='CustomCrossEntropyLoss', 89 | use_sigmoid=True, 90 | class_weight=class_weight), 91 | ), 92 | ), 93 | ) 94 | 95 | # optimizer 96 | optim_wrapper = dict( 97 | type='AmpOptimWrapper', # amp training 98 | clip_grad=dict(max_norm=35, norm_type=2), 99 | ) 100 | # load_from = 'work_dirs/detic_base/iter_90000.pth' 101 | load_from = 'checkpoints/detic_coco_base.pth' 102 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/faster_rcnn_r50_caffe_c4_90k_ovcoco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_base.py', 4 | '../../_base_/schedules/schedule_90k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | 8 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 9 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 10 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 11 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 12 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 13 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 14 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 15 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 16 | 17 | reg_layer = [ 18 | dict(type='Linear', in_features=2048, out_features=2048), 19 | dict(type='ReLU', inplace=True), 20 | dict(type='Linear', in_features=2048, out_features=4) 21 | ] 22 | 23 | model = dict( 24 | type='OVDTwoStageDetector', 25 | rpn_head=dict( 26 | type='CustomRPNHead', 27 | anchor_generator=dict( 28 | scale_major=False, # align with detectron2 29 | ) 30 | ), 31 | backbone=dict( 32 | init_cfg=dict( 33 | checkpoint='checkpoints/resnet50_msra-5891d200.pth')), 34 | roi_head=dict( 35 | type='OVDStandardRoIHead', 36 | shared_head=dict( 37 | init_cfg=dict( 38 | checkpoint='checkpoints/resnet50_msra-5891d200.pth')), 39 | clip_cfg=None, 40 | bbox_head=dict( 41 | type='DeticBBoxHead', 42 | reg_predictor_cfg=reg_layer, 43 | reg_class_agnostic=True, 44 | cls_bias=-20.0, 45 | cls_temp=25.0, 46 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 47 | loss_cls=dict( 48 | type='CustomCrossEntropyLoss', 49 | use_sigmoid=True, 50 | class_weight=class_weight), 51 | ), 52 | ), 53 | ) 54 | 55 | # optimizer 56 | optim_wrapper = dict( 57 | type='AmpOptimWrapper', # amp training 58 | clip_grad=dict(max_norm=35, norm_type=2), 59 | ) 60 | -------------------------------------------------------------------------------- 
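The `data/metadata/*.npy` files that follow hold the hand-crafted text embeddings referenced above via `cls_embeddings_path` and `tag_embeddings_path`; the `ZeroShotClassifier` shown later in this dump loads such a file with `np.load` and permutes it to D x C before computing class scores. A minimal inspection sketch, assuming the array is stored as a float matrix of shape `(num_classes, embed_dim)` (verify against the actual file):

```python
import numpy as np

# Hypothetical offline check of a class-embedding file used by the configs
# above. The (num_classes, embed_dim) layout is an assumption inferred from
# ZeroShotClassifier, which np.load()s the array and permutes it to D x C.
embeddings = np.load('data/metadata/coco_clip_hand_craft.npy')
print(embeddings.shape, embeddings.dtype)  # e.g. (80, 512) for COCO with a 512-d CLIP text encoder

# Row-normalize for cosine-similarity checks; the detection heads in this
# repo normalize internally, so this is only for offline inspection.
norms = np.linalg.norm(embeddings, axis=-1, keepdims=True)
embeddings_unit = embeddings / np.clip(norms, 1e-12, None)
```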
/ovdet/data/metadata/coco_clip_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_clip_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/coco_openai_vitb16_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_openai_vitb16_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/coco_openai_vitb16_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_openai_vitb16_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_rn50x64_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_rn50x64_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_rn50x64_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_rn50x64_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_vitb16_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_vitb16_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_vitb16_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_vitb16_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_v1_clip_a+cname.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_v1_clip_a+cname.npy -------------------------------------------------------------------------------- /ovdet/ovdet/__init__.py: -------------------------------------------------------------------------------- 1 | from mmcv import * # noqa 2 | from mmdet import * # noqa 3 | from mmengine import * # noqa 4 | from .datasets import * 5 | from .methods import * 6 | from .models import * 7 | from .utils import * 8 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_caption import CocoCaptionOVDDataset 2 | from .samplers.multi_source_sampler import 
CustomGroupMultiSourceSampler 3 | from .pipelines import CachedMosaicWithCaption, MultiChoicesMosaic 4 | from .cc3m_lvis_v1 import CC3MLVISV1Dataset 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/coco_caption.py: -------------------------------------------------------------------------------- 1 | from mmdet.datasets import CocoDataset 2 | from mmdet.registry import DATASETS 3 | import os.path as osp 4 | 5 | 6 | @DATASETS.register_module() 7 | class CocoCaptionOVDDataset(CocoDataset): 8 | """ 9 | Renamed from `CocoCaptionDataset' to avoid conflicts with the mmdet 10 | """ 11 | 12 | def prepare_data(self, idx): 13 | """Get data processed by ``self.pipeline``. 14 | 15 | Args: 16 | idx (int): The index of ``data_info``. 17 | 18 | Returns: 19 | Any: Depends on ``self.pipeline``. 20 | """ 21 | data_info = self.get_data_info(idx) 22 | if data_info['has_caption']: 23 | return self.pipeline(data_info) 24 | else: 25 | return None 26 | 27 | def parse_data_info(self, raw_data_info: dict): 28 | """Parse raw annotation to target format. 29 | 30 | Args: 31 | raw_data_info (dict): Raw data information load from ``ann_file`` 32 | 33 | Returns: 34 | Union[dict, List[dict]]: Parsed annotation. 35 | """ 36 | img_info = raw_data_info['raw_img_info'] 37 | 38 | data_info = {} 39 | 40 | # TODO: need to change data_prefix['img'] to data_prefix['img_path'] 41 | img_path = osp.join(self.data_prefix['img'], img_info['file_name']) 42 | seg_map_path = None 43 | data_info['img_path'] = img_path 44 | data_info['img_id'] = img_info['img_id'] 45 | data_info['seg_map_path'] = seg_map_path 46 | data_info['height'] = img_info['height'] 47 | data_info['width'] = img_info['width'] 48 | 49 | data_info['captions'] = [img_info.get('captions', []), ] 50 | pos_cat_ids = img_info.get('pos_category_ids', []) 51 | tags = [self.cat2label[cat_id] for cat_id in pos_cat_ids] 52 | tags = list(set(tags)) 53 | data_info['tags'] = [tags, ] 54 | data_info['image_ids'] = [img_info['img_id'], ] 55 | 56 | has_caption = len(img_info.get('captions', [])) > 0 57 | data_info['has_caption'] = has_caption 58 | 59 | instance = {} 60 | bbox = [0.0, 0.0, img_info['width'], img_info['height']] 61 | instance['ignore_flag'] = 0 62 | instance['bbox'] = bbox 63 | instance['bbox_label'] = 0 64 | 65 | data_info['instances'] = [instance] 66 | return data_info 67 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .mosaic import CachedMosaicWithCaption, MultiChoicesMosaic 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/samplers/multi_source_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
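# CustomGroupMultiSourceSampler overrides _get_source_group_info so that
# aspect-ratio groups (0: width < height, 1: otherwise) are tallied per source
# dataset via get_data_info, then merged into global group sizes and a
# group_ratio that the parent GroupMultiSourceSampler uses when drawing
# multi-source batches.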
2 | import numpy as np 3 | from mmdet.registry import DATA_SAMPLERS 4 | from mmdet.datasets import GroupMultiSourceSampler 5 | 6 | 7 | @DATA_SAMPLERS.register_module() 8 | class CustomGroupMultiSourceSampler(GroupMultiSourceSampler): 9 | def _get_source_group_info(self) -> None: 10 | num_sources = len(self.num_per_source) 11 | self.group2size_per_source = [{0: 0, 1: 0} for _ in range(num_sources)] 12 | self.group2inds_per_source = [{0: [], 1: []} for _ in range(num_sources)] 13 | for source, dataset in enumerate(self.dataset.datasets): 14 | for idx in range(len(dataset)): 15 | data_info = dataset.get_data_info(idx) 16 | width, height = data_info['width'], data_info['height'] 17 | group = 0 if width < height else 1 18 | self.group2size_per_source[source][group] += 1 19 | self.group2inds_per_source[source][group].append(idx) 20 | 21 | self.group_sizes = np.zeros(2, dtype=np.int64) 22 | for group2size in self.group2size_per_source: 23 | for group, size in group2size.items(): 24 | self.group_sizes[group] += size 25 | self.group_ratio = self.group_sizes / sum(self.group_sizes) 26 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .detic import DeticTags, DeticCaption, DeticCaptionWithComposition, DeticTagsWithComposition 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/builder.py: -------------------------------------------------------------------------------- 1 | from mmengine.registry import Registry 2 | OVD = Registry('ovdet', ) 3 | QUEUE = Registry('queue', ) 4 | 5 | 6 | def build_ovd(cfg): 7 | """Build backbone.""" 8 | return OVD.build(cfg) 9 | 10 | 11 | def build_queue(cfg): 12 | """Build backbone.""" 13 | return QUEUE.build(cfg) 14 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/detic/__init__.py: -------------------------------------------------------------------------------- 1 | from .detic_caption import DeticCaption, DeticCaptionWithComposition 2 | from .detic_tags import DeticTags, DeticTagsWithComposition 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/detic/utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from six.moves import map, zip 3 | import torch 4 | 5 | 6 | def multi_apply(func, *args, **kwargs): 7 | """Apply function to a list of arguments. 8 | 9 | Note: 10 | This function applies the ``func`` to multiple inputs and 11 | map the multiple outputs of the ``func`` into different 12 | list. Each list contains the same type of outputs corresponding 13 | to different inputs. 
14 | 15 | Args: 16 | func (Function): A function that will be applied to a list of 17 | arguments 18 | 19 | Returns: 20 | tuple(list): A tuple containing multiple list, each list contains \ 21 | a kind of returned results by the function 22 | """ 23 | pfunc = partial(func, **kwargs) if kwargs else func 24 | map_results = map(pfunc, *args) 25 | return tuple(map(list, zip(*map_results))) 26 | 27 | 28 | def bboxes_area(bboxes): 29 | whs = torch.clamp(bboxes[:, 2:4] - bboxes[:, :2], min=0.0) 30 | return whs.prod(-1) 31 | 32 | 33 | def bboxes_clamp(boxes, bound): # xyxy 34 | boxes[..., 0::2] = boxes[..., 0::2].clamp(min=bound[0], max=bound[2]) # x1 x2 35 | boxes[..., 1::2] = boxes[..., 1::2].clamp(min=bound[1], max=bound[3]) # y1 y2 36 | 37 | return boxes 38 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/queues.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .builder import QUEUE 4 | 5 | 6 | @QUEUE.register_module() 7 | class Queues(nn.Module): 8 | def __init__(self, names, lengths, emb_dim=512, id_length=4): 9 | super(Queues, self).__init__() 10 | self.names = names 11 | self.lengths = lengths 12 | self.emb_dim = emb_dim 13 | self.id_length = id_length 14 | self._init_queues() 15 | 16 | def _init_queues(self): 17 | attr_names = self.names 18 | queue_lengths = self.lengths 19 | for n in attr_names: 20 | self.register_buffer(n, -torch.ones(1, self.emb_dim + self.id_length), 21 | persistent=False) 22 | self.queue_lengths = {n: queue_lengths[i] for i, n in enumerate(attr_names)} 23 | 24 | @torch.no_grad() 25 | def dequeue_and_enqueue(self, queue_update): 26 | for k, feat in queue_update.items(): 27 | queue_length = self.queue_lengths[k] 28 | valid = (feat[:, self.emb_dim:] >= 0).sum(-1) > 0 # valid label 29 | if valid.sum() == 0: 30 | continue 31 | feat = feat[valid] 32 | feat = feat[:queue_length] 33 | in_length = feat.shape[0] 34 | queue_value = getattr(self, k) 35 | current_length = queue_value.shape[0] 36 | kept_length = min(queue_length - in_length, current_length) 37 | 38 | queue_value.data = torch.cat([feat, queue_value[:kept_length]]) 39 | 40 | @torch.no_grad() 41 | def get_queue(self, key): 42 | value = getattr(self, key) 43 | valid = (value[:, self.emb_dim:] >= 0).sum(-1) > 0 # valid label 44 | return value[valid] 45 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense_heads import * 2 | from .detectors import * 3 | from .losses import * 4 | from .roi_heads import * 5 | from .vlms import * 6 | from .backbones import * 7 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_vit import CLIPViT 2 | from .clip_resnet import CLIPResNet 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/backbones/clip_resnet.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | import torch 3 | from mmdet.registry import MODELS 4 | from mmengine.model import BaseModule 5 | from torch.nn import functional as F 6 | 7 | 8 | @MODELS.register_module() 9 | class CLIPResNet(BaseModule): 10 | def __init__(self, model_name, cache_dir, 
pretrained='openai', roi_extractor=None): 11 | super().__init__() 12 | self.model_name = model_name 13 | clip_model = open_clip.create_model(model_name, 14 | pretrained=pretrained, 15 | cache_dir=cache_dir) 16 | self.visual = clip_model.visual 17 | self.roi_extractor = MODELS.build(roi_extractor) 18 | 19 | def init_weights(self): 20 | for param in self.visual.parameters(): # only freeze the CLIP model 21 | param.requires_grad = False 22 | 23 | def train(self, mode=True): 24 | self.training = mode 25 | self.visual.train(False) 26 | return self 27 | 28 | def forward(self, x): 29 | outputs = [] 30 | with torch.no_grad(): 31 | visual = self.visual 32 | x = visual.stem(x) 33 | for i in range(4): 34 | layer = getattr(visual, f'layer{i+1}') 35 | x = layer(x) 36 | outputs.append(x) 37 | 38 | return tuple(outputs) 39 | 40 | def clip_pool(self, clip_x, rois): 41 | roi_feats = self.roi_extractor([clip_x], rois) 42 | roi_feats = self.visual.attnpool(roi_feats) 43 | roi_feats = F.normalize(roi_feats, dim=-1) 44 | 45 | return roi_feats 46 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn_head import CustomRPNHead 2 | from .centernet_rpn_head import CenterNetRPNHead 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/dense_heads/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | # support calculate IOULoss with box_pred 6 | class IOULoss(nn.Module): 7 | 8 | def __init__(self, loc_loss_type='iou'): 9 | super(IOULoss, self).__init__() 10 | self.loc_loss_type = loc_loss_type 11 | 12 | def forward(self, pred, target, weight=None, reduction='sum'): 13 | pred_left = pred[:, 0] 14 | pred_top = pred[:, 1] 15 | pred_right = pred[:, 2] 16 | pred_bottom = pred[:, 3] 17 | 18 | target_left = target[:, 0] 19 | target_top = target[:, 1] 20 | target_right = target[:, 2] 21 | target_bottom = target[:, 3] 22 | 23 | target_aera = (target_left + target_right) * ( 24 | target_top + target_bottom) 25 | pred_aera = (pred_left + pred_right) * (pred_top + pred_bottom) 26 | 27 | w_intersect = torch.min(pred_left, target_left) + torch.min( 28 | pred_right, target_right) 29 | h_intersect = torch.min(pred_bottom, target_bottom) + torch.min( 30 | pred_top, target_top) 31 | 32 | g_w_intersect = torch.max(pred_left, target_left) + torch.max( 33 | pred_right, target_right) 34 | g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max( 35 | pred_top, target_top) 36 | ac_uion = g_w_intersect * g_h_intersect 37 | 38 | area_intersect = w_intersect * h_intersect 39 | area_union = target_aera + pred_aera - area_intersect 40 | 41 | ious = (area_intersect + 1.0) / (area_union + 1.0) 42 | gious = ious - (ac_uion - area_union) / ac_uion 43 | if self.loc_loss_type == 'iou': 44 | losses = -torch.log(ious) 45 | elif self.loc_loss_type == 'linear_iou': 46 | losses = 1 - ious 47 | elif self.loc_loss_type == 'giou': 48 | losses = 1 - gious 49 | else: 50 | raise NotImplementedError 51 | 52 | if weight is not None: 53 | losses = losses * weight 54 | else: 55 | losses = losses 56 | 57 | if reduction == 'sum': 58 | return losses.sum() 59 | elif reduction == 'batch': 60 | return losses.sum(dim=[1]) 61 | elif reduction == 'none': 62 | return losses 63 | else: 64 | raise NotImplementedError 65 | 66 | 67 | def giou_loss( 68 | boxes1: 
torch.Tensor, 69 | boxes2: torch.Tensor, 70 | reduction: str = 'none', 71 | eps: float = 1e-7, 72 | ) -> torch.Tensor: 73 | """Generalized Intersection over Union Loss (Hamid Rezatofighi et. 74 | 75 | al) 76 | https://arxiv.org/abs/1902.09630 77 | Gradient-friendly IoU loss with an additional penalty that is 78 | non-zero when the boxes do not overlap and scales with the size 79 | of their smallest enclosing box. This loss is symmetric, so the 80 | boxes1 and boxes2 arguments are interchangeable. 81 | Args: 82 | boxes1, boxes2 (Tensor): box locations in XYXY format, shape 83 | (N, 4) or (4,). 84 | reduction: 'none' | 'mean' | 'sum' 85 | 'none': No reduction will be applied to the output. 86 | 'mean': The output will be averaged. 87 | 'sum': The output will be summed. 88 | eps (float): small number to prevent division by zero 89 | """ 90 | 91 | x1, y1, x2, y2 = boxes1.unbind(dim=-1) 92 | x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) 93 | 94 | assert (x2 >= x1).all(), 'bad box: x1 larger than x2' 95 | assert (y2 >= y1).all(), 'bad box: y1 larger than y2' 96 | 97 | # Intersection keypoints 98 | xkis1 = torch.max(x1, x1g) 99 | ykis1 = torch.max(y1, y1g) 100 | xkis2 = torch.min(x2, x2g) 101 | ykis2 = torch.min(y2, y2g) 102 | 103 | intsctk = torch.zeros_like(x1) 104 | mask = (ykis2 > ykis1) & (xkis2 > xkis1) 105 | intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) 106 | unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk 107 | iouk = intsctk / (unionk + eps) 108 | 109 | # smallest enclosing box 110 | xc1 = torch.min(x1, x1g) 111 | yc1 = torch.min(y1, y1g) 112 | xc2 = torch.max(x2, x2g) 113 | yc2 = torch.max(y2, y2g) 114 | 115 | area_c = (xc2 - xc1) * (yc2 - yc1) 116 | miouk = iouk - ((area_c - unionk) / (area_c + eps)) 117 | 118 | loss = 1 - miouk 119 | 120 | if reduction == 'mean': 121 | loss = loss.mean() 122 | elif reduction == 'sum': 123 | loss = loss.sum() 124 | 125 | return loss 126 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .two_stage import OVDTwoStageDetector 2 | from .fvlm import FVLM 3 | from .detic import Detic 4 | from .centernet2 import CenterNet2 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/detic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
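# Detic extends CenterNet2 with open-vocabulary training: `batch2ovd` maps
# weak-supervision batch names (e.g. caption_batch, mosaic_batch in the
# configs) to OVD loss modules such as detic_caption / detic_tags. loss()
# first computes the usual detection losses on the det_batch, then calls
# run_ovd() on each remaining batch and prefixes the returned losses with
# the batch name.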
2 | from .centernet2 import CenterNet2 3 | import torch 4 | from torch import Tensor 5 | from mmdet.structures import SampleList 6 | from typing import Dict 7 | from mmdet.registry import MODELS 8 | 9 | 10 | @MODELS.register_module() 11 | class Detic(CenterNet2): 12 | 13 | def __init__(self, 14 | batch2ovd=None, 15 | *args, 16 | **kwargs) -> None: 17 | super().__init__(*args, **kwargs) 18 | self.batch2ovd = dict() if batch2ovd is None else batch2ovd 19 | 20 | def run_ovd(self, x, inputs, data_samples, ovd_name): 21 | losses = dict() 22 | if self.with_rpn: 23 | with torch.no_grad(): 24 | rpn_results_list = self.rpn_head_predict(x, data_samples) 25 | else: 26 | assert data_samples[0].get('proposals', None) is not None 27 | rpn_results_list = [ 28 | data_sample.proposals for data_sample in data_samples 29 | ] 30 | if isinstance(ovd_name, str): 31 | ovd_name = [ovd_name] 32 | for _ovd_name in ovd_name: 33 | losses.update(self.roi_head.run_ovd(x, data_samples, rpn_results_list, 34 | _ovd_name, inputs)) 35 | return losses 36 | 37 | def rpn_head_predict(self, x, batch_data_samples): 38 | batch_img_metas = [ 39 | data_samples.metainfo for data_samples in batch_data_samples 40 | ] 41 | outs = self.rpn_head(x) 42 | proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) 43 | predictions = self.rpn_head.predict_by_feat( 44 | *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg, rescale=False) 45 | return predictions 46 | 47 | def loss(self, multi_batch_inputs: Dict[str, Tensor], 48 | multi_batch_data_samples: Dict[str, SampleList]) -> dict: 49 | if not isinstance(multi_batch_inputs, dict): 50 | multi_batch_inputs = dict(det_batch=multi_batch_inputs) 51 | multi_batch_data_samples = dict(det_batch=multi_batch_data_samples) 52 | 53 | # detection losses 54 | losses = super().loss(multi_batch_inputs.pop('det_batch'), 55 | multi_batch_data_samples.pop('det_batch')) 56 | 57 | multi_batch_features = {k: self.extract_feat(v) 58 | for k, v in multi_batch_inputs.items()} 59 | 60 | for batch_name, ovd_name in self.batch2ovd.items(): 61 | batch_inputs = multi_batch_inputs.get(batch_name) 62 | batch_data_samples = multi_batch_data_samples.get(batch_name) 63 | batch_features = multi_batch_features.get(batch_name) 64 | loss_ovd = self.run_ovd(batch_features, 65 | batch_inputs, 66 | batch_data_samples, 67 | ovd_name) 68 | for k, v in loss_ovd.items(): 69 | losses.update({f'{batch_name}.{k}': v}) 70 | return losses 71 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/fvlm.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from torch import Tensor 3 | from mmdet.registry import MODELS 4 | from mmdet.models.detectors import TwoStageDetector 5 | from mmdet.structures import SampleList 6 | 7 | 8 | @MODELS.register_module() 9 | class FVLM(TwoStageDetector): 10 | def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: 11 | """Extract features. 12 | 13 | Args: 14 | batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). 15 | 16 | Returns: 17 | tuple[Tensor]: Multi-level features that may have 18 | different resolutions. 19 | """ 20 | x = self.backbone(batch_inputs) 21 | if self.with_neck: 22 | x = self.neck(x[:self.neck.num_ins]) 23 | 24 | return x 25 | 26 | def predict(self, 27 | batch_inputs: Tensor, 28 | batch_data_samples: SampleList, 29 | rescale: bool = True) -> SampleList: 30 | assert self.with_bbox, 'Bbox head must be implemented.' 
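        # F-VLM inference: keep the last backbone stage as the frozen CLIP
        # feature map (clip_x) and pass the earlier stages through the neck
        # for the detector branch; the RoI head later applies
        # backbone.clip_pool to clip_x to get CLIP region features for
        # scoring the proposals.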
31 | x = self.backbone(batch_inputs) 32 | clip_x = x[-1] 33 | if self.with_neck: 34 | x = self.neck(x[:self.neck.num_ins]) 35 | 36 | # If there are no pre-defined proposals, use RPN to get proposals 37 | if batch_data_samples[0].get('proposals', None) is None: 38 | rpn_results_list = self.rpn_head.predict( 39 | x, batch_data_samples, rescale=False) 40 | else: 41 | rpn_results_list = [ 42 | data_sample.proposals for data_sample in batch_data_samples 43 | ] 44 | 45 | results_list = self.roi_head.predict( 46 | x, rpn_results_list, batch_data_samples, rescale=rescale, 47 | clip_x=clip_x, clip_pool=self.backbone.clip_pool 48 | ) 49 | 50 | batch_data_samples = self.add_pred_to_datasample( 51 | batch_data_samples, results_list) 52 | return batch_data_samples 53 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/two_stage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | import torch 4 | from torch import Tensor 5 | from mmdet.structures import SampleList 6 | from mmdet.models.detectors.two_stage import TwoStageDetector 7 | from mmdet.registry import MODELS 8 | from typing import Dict 9 | 10 | 11 | @MODELS.register_module() 12 | class OVDTwoStageDetector(TwoStageDetector): 13 | def __init__(self, batch2ovd=None, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.batch2ovd = dict() if batch2ovd is None else batch2ovd 16 | # mapping from batch name to ovd name 17 | 18 | def run_ovd(self, x, inputs, data_samples, ovd_name): 19 | losses = dict() 20 | if self.with_rpn: 21 | with torch.no_grad(): 22 | rpn_results_list = self.rpn_head_predict(x, data_samples) 23 | else: 24 | assert data_samples[0].get('proposals', None) is not None 25 | rpn_results_list = [ 26 | data_sample.proposals for data_sample in data_samples 27 | ] 28 | if isinstance(ovd_name, str): 29 | ovd_name = [ovd_name] 30 | for _ovd_name in ovd_name: 31 | losses.update(self.roi_head.run_ovd(x, data_samples, rpn_results_list, 32 | _ovd_name, inputs)) 33 | return losses 34 | 35 | def rpn_head_predict(self, x, batch_data_samples): 36 | batch_img_metas = [ 37 | data_samples.metainfo for data_samples in batch_data_samples 38 | ] 39 | outs = self.rpn_head(x) 40 | proposal_cfg = self.train_cfg.get('rpn_proposal', 41 | self.test_cfg.rpn) 42 | predictions = self.rpn_head.predict_by_feat( 43 | *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg, rescale=False) 44 | return predictions 45 | 46 | def loss(self, multi_batch_inputs: Dict[str, Tensor], 47 | multi_batch_data_samples: Dict[str, SampleList]) -> dict: 48 | if not isinstance(multi_batch_inputs, dict): 49 | multi_batch_inputs = dict(det_batch=multi_batch_inputs) 50 | multi_batch_data_samples = dict(det_batch=multi_batch_data_samples) 51 | 52 | multi_batch_features = {k: self.extract_feat(v) 53 | for k, v in multi_batch_inputs.items()} 54 | losses = self.det_loss(multi_batch_features.get('det_batch'), 55 | multi_batch_data_samples.get('det_batch')) 56 | 57 | for batch_name, ovd_name in self.batch2ovd.items(): 58 | batch_inputs = multi_batch_inputs.get(batch_name) 59 | batch_data_samples = multi_batch_data_samples.get(batch_name) 60 | batch_features = multi_batch_features.get(batch_name) 61 | loss_ovd = self.run_ovd(batch_features, 62 | batch_inputs, 63 | batch_data_samples, 64 | ovd_name) 65 | for k, v in loss_ovd.items(): 66 | losses.update({k + f'_{batch_name}': v}) 67 | return losses 68 | 69 | def 
det_loss(self, x, batch_data_samples): 70 | losses = dict() 71 | 72 | # RPN forward and loss 73 | if self.with_rpn: 74 | proposal_cfg = self.train_cfg.get('rpn_proposal', 75 | self.test_cfg.rpn) 76 | rpn_data_samples = copy.deepcopy(batch_data_samples) 77 | # set cat_id of gt_labels to 0 in RPN 78 | for data_sample in rpn_data_samples: 79 | data_sample.gt_instances.labels = \ 80 | torch.zeros_like(data_sample.gt_instances.labels) 81 | 82 | rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( 83 | x, rpn_data_samples, proposal_cfg=proposal_cfg) 84 | # avoid get same name with roi_head loss 85 | keys = rpn_losses.keys() 86 | for key in list(keys): 87 | if 'loss' in key and 'rpn' not in key: 88 | rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) 89 | losses.update(rpn_losses) 90 | else: 91 | assert batch_data_samples[0].get('proposals', None) is not None 92 | # use pre-defined proposals in InstanceData for the second stage 93 | # to extract ROI features. 94 | rpn_results_list = [ 95 | data_sample.proposals for data_sample in batch_data_samples 96 | ] 97 | 98 | roi_losses = self.roi_head.loss(x, rpn_results_list, 99 | batch_data_samples) 100 | losses.update(roi_losses) 101 | 102 | return losses 103 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import CustomCrossEntropyLoss 2 | from .heatmap_focal_loss import HeatmapFocalLoss 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/cross_entropy_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn.functional as F 3 | from mmdet.registry import MODELS 4 | from mmdet.models.losses.utils import weight_reduce_loss 5 | from mmdet.models import CrossEntropyLoss 6 | from mmdet.models.losses.cross_entropy_loss import _expand_onehot_labels 7 | from ovdet.utils import load_class_freq 8 | 9 | 10 | def binary_cross_entropy(pred, 11 | label, 12 | weight=None, 13 | reduction='mean', 14 | avg_factor=None, 15 | class_weight=None, 16 | ignore_index=-100, 17 | avg_non_ignore=False, **kwargs): 18 | ignore_index = -100 if ignore_index is None else ignore_index 19 | 20 | if pred.dim() != label.dim(): 21 | label, weight, valid_mask = _expand_onehot_labels( 22 | label, weight, pred.size(-1), ignore_index) 23 | else: 24 | # should mask out the ignored elements 25 | valid_mask = ((label >= 0) & (label != ignore_index)).float() 26 | if weight is not None: 27 | # The inplace writing method will have a mismatched broadcast 28 | # shape error if the weight and valid_mask dimensions 29 | # are inconsistent such as (B,N,1) and (B,N,C). 
30 | weight = weight * valid_mask 31 | else: 32 | weight = valid_mask 33 | 34 | # average loss over non-ignored elements 35 | if (avg_factor is None) and avg_non_ignore and reduction == 'mean': 36 | avg_factor = valid_mask.sum().item() 37 | 38 | # weighted element-wise losses 39 | weight = weight.float() 40 | loss = F.binary_cross_entropy_with_logits(pred, label.float(), reduction='none') 41 | if class_weight is not None: 42 | loss = loss * class_weight[None] 43 | # do the reduction for the weighted loss 44 | loss = weight_reduce_loss( 45 | loss, weight, reduction=reduction, avg_factor=avg_factor) 46 | 47 | return loss 48 | 49 | 50 | def cross_entropy(pred, 51 | label, 52 | weight=None, 53 | reduction='mean', 54 | avg_factor=None, 55 | class_weight=None, 56 | ignore_index=-100, 57 | avg_non_ignore=False): 58 | # The default value of ignore_index is the same as F.cross_entropy 59 | ignore_index = -100 if ignore_index is None else ignore_index 60 | # element-wise losses 61 | if class_weight is not None: 62 | mask_out = class_weight < 0.00001 63 | pred[:, mask_out] = -float('inf') 64 | loss = F.cross_entropy( 65 | pred, 66 | label, 67 | weight=class_weight, # still use 68 | reduction='none', 69 | ignore_index=ignore_index) 70 | 71 | # average loss over non-ignored elements 72 | # pytorch's official cross_entropy average loss over non-ignored elements 73 | # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa 74 | if (avg_factor is None) and avg_non_ignore and reduction == 'mean': 75 | avg_factor = label.numel() - (label == ignore_index).sum().item() 76 | 77 | # apply weights and do the reduction 78 | if weight is not None: 79 | weight = weight.float() 80 | loss = weight_reduce_loss( 81 | loss, weight=weight, reduction=reduction, avg_factor=avg_factor) 82 | 83 | return loss 84 | 85 | 86 | @MODELS.register_module() 87 | class CustomCrossEntropyLoss(CrossEntropyLoss): 88 | def __init__(self, bg_weight=1.0, *args, **kwargs): 89 | super().__init__(*args, **kwargs) 90 | if self.use_sigmoid: 91 | del self.cls_criterion 92 | self.cls_criterion = binary_cross_entropy 93 | elif not self.use_mask: 94 | del self.cls_criterion 95 | self.cls_criterion = cross_entropy 96 | 97 | if isinstance(self.class_weight, str): 98 | cat_freq = load_class_freq(self.class_weight, min_count=0) 99 | self.class_weight = (cat_freq > 0.0).float().tolist() + [bg_weight] 100 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/heatmap_focal_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from typing import Optional, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch import Tensor 7 | 8 | from mmdet.registry import MODELS 9 | 10 | 11 | # support class-agnostic heatmap_focal_loss 12 | def heatmap_focal_loss_with_pos_inds( 13 | pred: Tensor, 14 | targets: Tensor, 15 | pos_inds: Tensor, 16 | alpha: float = 2.0, 17 | beta: float = 4.0, 18 | gamma: float = 4.0, 19 | sigmoid_clamp: float = 1e-4, 20 | ignore_high_fp: float = -1.0, 21 | pos_weight: float = 1.0, 22 | neg_weight: float = 1.0, 23 | avg_factor: Optional[Union[int, float]] = None) -> Tensor: 24 | 25 | pred = torch.clamp( 26 | pred.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp) 27 | 28 | neg_weights = torch.pow(1 - targets, beta) 29 | 30 | pos_pred = pred[pos_inds] 31 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 32 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 33 | if ignore_high_fp > 0: 34 | not_high_fp = (pred < ignore_high_fp).float() 35 | neg_loss = not_high_fp * neg_loss 36 | 37 | pos_loss = -pos_loss.sum() 38 | neg_loss = -neg_loss.sum() 39 | if alpha >= 0: 40 | pos_loss = alpha * pos_loss 41 | neg_loss = (1 - alpha) * neg_loss 42 | 43 | pos_loss = pos_weight * pos_loss / avg_factor 44 | neg_loss = neg_weight * neg_loss / avg_factor 45 | 46 | return pos_loss, neg_loss 47 | 48 | 49 | @MODELS.register_module() 50 | class HeatmapFocalLoss(nn.Module): 51 | """GaussianFocalLoss is a variant of focal loss. 52 | 53 | More details can be found in the `paper 54 | `_ 55 | Code is modified from `kp_utils.py 56 | `_ # noqa: E501 57 | Please notice that the target in GaussianFocalLoss is a gaussian heatmap, 58 | not 0/1 binary target. 59 | 60 | Args: 61 | alpha (float): Power of prediction. 62 | gamma (float): Power of target for negative samples. 63 | reduction (str): Options are "none", "mean" and "sum". 64 | loss_weight (float): Loss weight of current loss. 65 | pos_weight(float): Positive sample loss weight. Defaults to 1.0. 66 | neg_weight(float): Negative sample loss weight. Defaults to 1.0. 67 | """ 68 | 69 | def __init__( 70 | self, 71 | alpha: float = 2.0, 72 | beta: float = 4.0, 73 | gamma: float = 4.0, 74 | sigmoid_clamp: float = 1e-4, 75 | ignore_high_fp: float = -1.0, 76 | loss_weight: float = 1.0, 77 | pos_weight: float = 1.0, 78 | neg_weight: float = 1.0, 79 | ) -> None: 80 | super().__init__() 81 | self.alpha = alpha 82 | self.beta = beta 83 | self.gamma = gamma 84 | self.sigmoid_clamp = sigmoid_clamp 85 | self.ignore_high_fp = ignore_high_fp 86 | self.loss_weight = loss_weight 87 | self.pos_weight = pos_weight 88 | self.neg_weight = neg_weight 89 | 90 | def forward(self, 91 | pred: Tensor, 92 | target: Tensor, 93 | pos_inds: Optional[Tensor] = None, 94 | avg_factor: Optional[Union[int, float]] = None) -> Tensor: 95 | """Forward function. 96 | 97 | If you want to manually determine which positions are 98 | positive samples, you can set the pos_index and pos_label 99 | parameter. Currently, only the CenterNet update version uses 100 | the parameter. 101 | 102 | Args: 103 | pred (torch.Tensor): The prediction. The shape is (N, num_classes). 104 | target (torch.Tensor): The learning target of the prediction 105 | in gaussian distribution. The shape is (N, num_classes). 106 | pos_inds (torch.Tensor): The positive sample index. 107 | Defaults to None. 108 | pos_labels (torch.Tensor): The label corresponding to the positive 109 | sample index. Defaults to None. 110 | weight (torch.Tensor, optional): The weight of loss for each 111 | prediction. 
Defaults to None. 112 | avg_factor (int, float, optional): Average factor that is used to 113 | average the loss. Defaults to None. 114 | reduction_override (str, optional): The reduction method used to 115 | override the original reduction method of the loss. 116 | Defaults to None. 117 | """ 118 | 119 | pos_loss, neg_loss = heatmap_focal_loss_with_pos_inds( 120 | pred, 121 | target, 122 | pos_inds, 123 | alpha=self.alpha, 124 | beta=self.beta, 125 | gamma=self.gamma, 126 | sigmoid_clamp=self.sigmoid_clamp, 127 | ignore_high_fp=self.ignore_high_fp, 128 | pos_weight=self.pos_weight, 129 | neg_weight=self.neg_weight, 130 | avg_factor=avg_factor) 131 | return pos_loss, neg_loss 132 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .standard_roi_head import * 2 | from .detic_bbox_heads import * 3 | from .fvlm_bbox_heads import * 4 | from .detic_roi_head import * 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/detic_bbox_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bbox_head import DeticBBoxHead 2 | from .zero_shot_classifier import ZeroShotClassifier 3 | from .detic_bbox_head import OriginalDeticBBoxHead 4 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/detic_bbox_heads/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from mmdet.registry import MODELS 7 | 8 | 9 | @MODELS.register_module() 10 | class ZeroShotClassifier(nn.Module): 11 | 12 | def __init__( 13 | self, 14 | in_features: int, 15 | out_features: int, # num_classes 16 | zs_weight_path: str, 17 | zs_weight_dim: int = 512, 18 | use_bias: float = 0.0, 19 | norm_weight: bool = True, 20 | norm_temperature: float = 50.0, 21 | ): 22 | super().__init__() 23 | num_classes = out_features 24 | self.norm_weight = norm_weight 25 | self.norm_temperature = norm_temperature 26 | 27 | self.use_bias = use_bias < 0 28 | if self.use_bias: 29 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 30 | 31 | self.linear = nn.Linear(in_features, zs_weight_dim) 32 | 33 | if zs_weight_path == 'rand': 34 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 35 | nn.init.normal_(zs_weight, std=0.01) 36 | else: 37 | if zs_weight_path.endswith('npy'): 38 | zs_weight = torch.tensor( 39 | np.load(zs_weight_path), 40 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 41 | else: 42 | zs_weight = torch.load( 43 | zs_weight_path).float().permute(1, 0).contiguous() # D x C 44 | zs_weight = torch.cat( 45 | [zs_weight, zs_weight.new_zeros( 46 | (zs_weight_dim, 1))], dim=1) # D x (C + 1) 47 | 48 | if self.norm_weight: 49 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 50 | 51 | if zs_weight_path == 'rand': 52 | self.zs_weight = nn.Parameter(zs_weight) 53 | else: 54 | self.register_buffer('zs_weight', zs_weight) 55 | 56 | assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape 57 | 58 | def forward(self, x, classifier=None): 59 | ''' 60 | Inputs: 61 | x: B x D' 62 | classifier_info: (C', C' x D) 63 | ''' 64 | x = self.linear(x) 65 | if classifier is not None: 66 | 
zs_weight = classifier.permute(1, 0).contiguous() # D x C' 67 | zs_weight = F.normalize(zs_weight, p=2, dim=0) \ 68 | if self.norm_weight else zs_weight 69 | else: 70 | zs_weight = self.zs_weight 71 | if self.norm_weight: 72 | x = self.norm_temperature * F.normalize(x, p=2, dim=1) 73 | x = torch.mm(x, zs_weight) 74 | if self.use_bias: 75 | x = x + self.cls_bias 76 | return x 77 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/fvlm_bbox_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .convfc_bbox_head import * 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/README.md: -------------------------------------------------------------------------------- 1 | # CLIP models 2 | 3 | ## RN50 4 | ```python 5 | clip_cfg=dict( 6 | type='CLIP', 7 | image_encoder=dict( 8 | type='CLIPResNet', 9 | layers=[3, 4, 6, 3], 10 | output_dim=1024, 11 | heads=32, 12 | input_resolution=224, 13 | width=64, 14 | init_cfg=dict( 15 | type='Pretrained', 16 | prefix='visual', 17 | checkpoint='checkpoints/clip_r50.pth') 18 | ), 19 | text_encoder=dict( 20 | type='CLIPTextEncoder', 21 | embed_dim=1024, 22 | context_length=77, 23 | vocab_size=49408, 24 | transformer_width=512, # also the word embedding dim 25 | transformer_heads=8, 26 | transformer_layers=12, 27 | init_cfg=dict( 28 | type='Pretrained', 29 | checkpoint='checkpoints/clip_r50.pth') 30 | ) 31 | ) 32 | ``` 33 | 34 | ## ViT-B/32 35 | ```python 36 | clip_cfg=dict( 37 | type='CLIP', 38 | image_encoder=dict( 39 | type='CLIPViT', 40 | input_resolution=224, 41 | patch_size=32, 42 | width=768, 43 | layers=12, 44 | heads=12, 45 | output_dim=512, 46 | init_cfg=dict( 47 | type='Pretrained', 48 | prefix='visual', 49 | checkpoint='checkpoints/clip_vitb32.pth') 50 | ), 51 | text_encoder=dict( 52 | type='CLIPTextEncoder', 53 | embed_dim=512, 54 | context_length=77, 55 | vocab_size=49408, 56 | transformer_width=512, # also the word embedding dim 57 | transformer_heads=8, 58 | transformer_layers=12, 59 | init_cfg=dict( 60 | type='Pretrained', 61 | checkpoint='checkpoints/clip_vitb32.pth') 62 | ) 63 | ) 64 | ``` -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .text_encoder import CLIPTextEncoder 2 | # from .image_encoder import CLIPResNet, CLIPResLayer4, CLIPViT 3 | from .model import CLIP 4 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/ovdet/models/vlms/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch import Tensor 4 | from torch.nn import MultiheadAttention 5 | from torch.nn import functional as F 6 
| from typing import Optional, Tuple 7 | from collections import OrderedDict 8 | 9 | 10 | class QuickGELU(nn.Module): 11 | def forward(self, x: torch.Tensor): 12 | return x * torch.sigmoid(1.702 * x) 13 | 14 | 15 | class MultiheadSelfAttention(MultiheadAttention): 16 | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, 17 | need_weights: bool = True, attn_mask: Optional[Tensor] = None, return_tokens: bool = False) \ 18 | -> Tuple[Tensor, Tensor, Optional[Tensor]]: 19 | assert query is value and value is key # self-attention 20 | if return_tokens: 21 | # in_projection 22 | tokens = F.linear(value, self.in_proj_weight, bias=self.in_proj_bias)[..., -self.embed_dim:] 23 | # out_projection 24 | tokens = F.linear(tokens, self.out_proj.weight, bias=self.out_proj.bias) 25 | else: 26 | tokens = None 27 | 28 | attn_output, attn_output_weights = F.multi_head_attention_forward( 29 | query=query, key=key, value=value, 30 | embed_dim_to_check=self.embed_dim, 31 | num_heads=self.num_heads, 32 | in_proj_weight=self.in_proj_weight, 33 | in_proj_bias=self.in_proj_bias, 34 | bias_k=None, bias_v=None, 35 | add_zero_attn=False, 36 | dropout_p=0., 37 | out_proj_weight=self.out_proj.weight, 38 | out_proj_bias=self.out_proj.bias, 39 | training=self.training, 40 | key_padding_mask=key_padding_mask, need_weights=need_weights, 41 | attn_mask=attn_mask) 42 | 43 | return attn_output, tokens, attn_output_weights 44 | 45 | 46 | class ResidualAttentionBlock(nn.Module): 47 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): 48 | super().__init__() 49 | 50 | self.attn = MultiheadSelfAttention(d_model, n_head) 51 | self.ln_1 = LayerNorm(d_model) 52 | self.mlp = nn.Sequential(OrderedDict([ 53 | ("c_fc", nn.Linear(d_model, d_model * 4)), 54 | ("gelu", QuickGELU()), 55 | ("c_proj", nn.Linear(d_model * 4, d_model)) 56 | ])) 57 | self.ln_2 = LayerNorm(d_model) 58 | self.attn_mask = attn_mask 59 | 60 | def attention(self, x: torch.Tensor, return_tokens: bool, attn_masks=None): 61 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 62 | length = x.shape[0] 63 | if attn_masks is None: 64 | attn_mask = None if self.attn_mask is None else self.attn_mask[:length, :length] 65 | else: 66 | attn_mask = attn_masks 67 | return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, 68 | return_tokens=return_tokens)[:2] 69 | 70 | def forward(self, x, return_tokens=False, cls_indices=None, attn_masks=None): 71 | att, tokens = self.attention(self.ln_1(x), return_tokens, attn_masks=attn_masks) 72 | if return_tokens: 73 | assert cls_indices is not None 74 | if not isinstance(cls_indices, int): 75 | assert len(cls_indices) == x.shape[1] # x: LNC 76 | cls_tokens = x[cls_indices, torch.arange(x.shape[1])] 77 | tokens = cls_tokens[None] + tokens 78 | tokens = tokens + self.mlp(self.ln_2(tokens)) 79 | 80 | x = x + att 81 | x = x + self.mlp(self.ln_2(x)) 82 | 83 | return x, tokens 84 | else: 85 | assert tokens is None 86 | x = x + att 87 | # x = x + self.attention(self.ln_1(x)) 88 | x = x + self.mlp(self.ln_2(x)) 89 | 90 | return x, None 91 | 92 | 93 | class LayerNorm(nn.LayerNorm): 94 | """Subclass torch's LayerNorm to handle fp16.""" 95 | 96 | def forward(self, x: torch.Tensor): 97 | orig_type = x.dtype 98 | ret = super().forward(x.type(torch.float32)) 99 | return ret.type(orig_type) 100 | 101 | 102 | class Transformer(nn.Module): 103 | def __init__(self, width: int, layers: int, heads: int, attn_mask: 
torch.Tensor = None): 104 | super().__init__() 105 | self.width = width 106 | self.layers = layers 107 | self.heads = heads 108 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) 109 | 110 | def forward(self, x: torch.Tensor, return_tokens=False, cls_indices=None, attn_masks=None): 111 | for i in range(self.layers - 1): 112 | x, _ = self.resblocks[i](x, attn_masks=attn_masks) 113 | return self.resblocks[-1](x, return_tokens=return_tokens, cls_indices=cls_indices, 114 | attn_masks=attn_masks) 115 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/model.py: -------------------------------------------------------------------------------- 1 | from mmdet.registry import MODELS 2 | from mmengine.model import BaseModule 3 | 4 | 5 | @MODELS.register_module() 6 | class CLIP(BaseModule): 7 | def __init__(self, text_encoder, image_encoder): 8 | super().__init__() 9 | if text_encoder is not None: 10 | self.text_encoder = MODELS.build(text_encoder) 11 | if image_encoder is not None: 12 | self.image_encoder = MODELS.build(image_encoder) 13 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | import torch 3 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 4 | 5 | 6 | _tokenizer = _Tokenizer() 7 | 8 | 9 | def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor: 10 | """ 11 | Returns the tokenized representation of given input string(s) 
12 | 13 | Parameters 14 | ---------- 15 | texts : Union[str, List[str]] 16 | An input string or a list of input strings to tokenize 17 | 18 | context_length : int 19 | The ovd length to use; all CLIP models use 77 as the ovd length 20 | 21 | truncate: bool 22 | Whether to truncate the text in case its encoding is longer than the ovd length 23 | 24 | Returns 25 | ------- 26 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] 27 | """ 28 | if isinstance(texts, str): 29 | texts = [texts] 30 | 31 | sot_token = _tokenizer.encoder["<|startoftext|>"] 32 | eot_token = _tokenizer.encoder["<|endoftext|>"] 33 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 34 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 35 | 36 | for i, tokens in enumerate(all_tokens): 37 | if len(tokens) > context_length: 38 | if truncate: 39 | tokens = tokens[:context_length] 40 | tokens[-1] = eot_token 41 | else: 42 | raise RuntimeError(f"Input {texts[i]} is too long for ovd length {context_length}") 43 | result[i, :len(tokens)] = torch.tensor(tokens) 44 | 45 | return result 46 | 47 | 48 | def tokenize_dynamic(texts, context_length: int = 77, truncate: bool = False): 49 | if isinstance(texts, str): 50 | texts = [texts] 51 | 52 | sot_token = _tokenizer.encoder["<|startoftext|>"] 53 | eot_token = _tokenizer.encoder["<|endoftext|>"] 54 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 55 | lengths = [len(tokens) for tokens in all_tokens] 56 | context_length = min(context_length, max(lengths)) 57 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 58 | 59 | for i, tokens in enumerate(all_tokens): 60 | if len(tokens) > context_length: 61 | if truncate: 62 | tokens = tokens[:context_length] 63 | tokens[-1] = eot_token 64 | else: 65 | raise RuntimeError(f"Input {texts[i]} is too long for ovd length {context_length}") 66 | result[i, :len(tokens)] = torch.tensor(tokens) 67 | 68 | return result 69 | -------------------------------------------------------------------------------- /ovdet/ovdet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import multi_apply, load_class_freq 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/utils/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from functools import partial 4 | from six.moves import map, zip 5 | 6 | 7 | def multi_apply(func, *args, **kwargs): 8 | """Apply function to a list of arguments. 9 | Note: 10 | This function applies the ``func`` to multiple inputs and 11 | map the multiple outputs of the ``func`` into different 12 | list. Each list contains the same type of outputs corresponding 13 | to different inputs. 
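For example (illustrative), multi_apply(lambda a, b: (a + b, a * b), [1, 2], [3, 4]) would return ([4, 6], [3, 8]).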
14 | Args: 15 | func (Function): A function that will be applied to a list of 16 | arguments 17 | Returns: 18 | tuple(list): A tuple containing multiple list, each list contains \ 19 | a kind of returned results by the function 20 | """ 21 | pfunc = partial(func, **kwargs) if kwargs else func 22 | map_results = map(pfunc, *args) 23 | return tuple(map(list, zip(*map_results))) 24 | 25 | 26 | def load_class_freq( 27 | path='data/metadata/lvis_v1_train_cat_norare_info.json', 28 | freq_weight=1.0, 29 | min_count=0): 30 | cat_info = json.load(open(path, 'r')) 31 | cat_info = torch.tensor( 32 | [max(c['image_count'], min_count) for c in sorted(cat_info, key=lambda x: x['id'])]) 33 | freq_weight = cat_info.float() ** freq_weight 34 | return freq_weight 35 | -------------------------------------------------------------------------------- /ovdet/tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /ovdet/tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_coco_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | categories_seen = [ 6 | {'id': 1, 'name': 'person'}, 7 | {'id': 2, 'name': 'bicycle'}, 8 | {'id': 3, 'name': 'car'}, 9 | {'id': 4, 'name': 'motorcycle'}, 10 | {'id': 7, 'name': 'train'}, 11 | {'id': 8, 'name': 'truck'}, 12 | {'id': 9, 'name': 'boat'}, 13 | {'id': 15, 'name': 'bench'}, 14 | {'id': 16, 'name': 'bird'}, 15 | {'id': 19, 'name': 'horse'}, 16 | {'id': 20, 'name': 'sheep'}, 17 | {'id': 23, 'name': 'bear'}, 18 | {'id': 24, 'name': 'zebra'}, 19 | {'id': 25, 'name': 'giraffe'}, 20 | {'id': 27, 'name': 'backpack'}, 21 | {'id': 31, 'name': 'handbag'}, 22 | {'id': 33, 'name': 'suitcase'}, 23 | {'id': 34, 'name': 'frisbee'}, 24 | {'id': 35, 'name': 'skis'}, 25 | {'id': 38, 'name': 'kite'}, 26 | {'id': 42, 'name': 'surfboard'}, 27 | {'id': 44, 'name': 'bottle'}, 28 | {'id': 48, 'name': 'fork'}, 29 | {'id': 50, 'name': 'spoon'}, 30 | {'id': 51, 'name': 'bowl'}, 31 | {'id': 52, 'name': 'banana'}, 32 | {'id': 53, 'name': 'apple'}, 33 | {'id': 54, 'name': 'sandwich'}, 34 | {'id': 55, 'name': 'orange'}, 35 | {'id': 56, 'name': 'broccoli'}, 36 | {'id': 57, 'name': 'carrot'}, 37 | 
{'id': 59, 'name': 'pizza'}, 38 | {'id': 60, 'name': 'donut'}, 39 | {'id': 62, 'name': 'chair'}, 40 | {'id': 65, 'name': 'bed'}, 41 | {'id': 70, 'name': 'toilet'}, 42 | {'id': 72, 'name': 'tv'}, 43 | {'id': 73, 'name': 'laptop'}, 44 | {'id': 74, 'name': 'mouse'}, 45 | {'id': 75, 'name': 'remote'}, 46 | {'id': 78, 'name': 'microwave'}, 47 | {'id': 79, 'name': 'oven'}, 48 | {'id': 80, 'name': 'toaster'}, 49 | {'id': 82, 'name': 'refrigerator'}, 50 | {'id': 84, 'name': 'book'}, 51 | {'id': 85, 'name': 'clock'}, 52 | {'id': 86, 'name': 'vase'}, 53 | {'id': 90, 'name': 'toothbrush'}, 54 | ] 55 | 56 | base_cat_ids = [cat['id'] for cat in categories_seen] 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--json_path", default="data/coco/annotations/instances_train2017.json", type=str) 60 | parser.add_argument("--out_path", default="data/coco/wusize/instances_train2017_base.json") 61 | args = parser.parse_args() 62 | 63 | with open(args.json_path, 'r') as f: 64 | json_coco = json.load(f) 65 | 66 | annotations = [] 67 | 68 | for ann in tqdm(json_coco['annotations']): 69 | if ann['category_id'] in base_cat_ids: 70 | annotations.append(ann) 71 | 72 | json_coco['annotations'] = annotations 73 | 74 | with open(args.out_path, 'w') as f: 75 | json.dump(json_coco, f) 76 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_coco_novel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | categories_unseen = [ 6 | {'id': 5, 'name': 'airplane'}, 7 | {'id': 6, 'name': 'bus'}, 8 | {'id': 17, 'name': 'cat'}, 9 | {'id': 18, 'name': 'dog'}, 10 | {'id': 21, 'name': 'cow'}, 11 | {'id': 22, 'name': 'elephant'}, 12 | {'id': 28, 'name': 'umbrella'}, 13 | {'id': 32, 'name': 'tie'}, 14 | {'id': 36, 'name': 'snowboard'}, 15 | {'id': 41, 'name': 'skateboard'}, 16 | {'id': 47, 'name': 'cup'}, 17 | {'id': 49, 'name': 'knife'}, 18 | {'id': 61, 'name': 'cake'}, 19 | {'id': 63, 'name': 'couch'}, 20 | {'id': 76, 'name': 'keyboard'}, 21 | {'id': 81, 'name': 'sink'}, 22 | {'id': 87, 'name': 'scissors'}, 23 | ] 24 | 25 | novel_cat_ids = [cat['id'] for cat in categories_unseen] 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--json_path", default="data/coco/annotations/instances_val2017.json") 29 | parser.add_argument("--out_path", default="data/coco/wusize/instances_val2017_novel.json") 30 | args = parser.parse_args() 31 | 32 | with open(args.json_path, 'r') as f: 33 | json_coco = json.load(f) 34 | 35 | annotations = [] 36 | 37 | for ann in tqdm(json_coco['annotations']): 38 | if ann['category_id'] in novel_cat_ids: 39 | annotations.append(ann) 40 | 41 | json_coco['annotations'] = annotations 42 | 43 | with open(args.out_path, 'w') as f: 44 | json.dump(json_coco, f) 45 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_lvis_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--json_path", default="data/lvis_v1/annotations/lvis_v1_train.json") 7 | parser.add_argument("--out_path", default="data/lvis_v1/wusize/lvis_v1_train_base.json") 8 | args = parser.parse_args() 9 | 10 | with open(args.json_path, 'r') as f: 11 | json_coco = json.load(f) 12 | 13 | annotations = [] 14 | 15 | 16 | cat_id2cat_info = 
{cat_info['id']: cat_info for cat_info in json_coco['categories']} 17 | for ann in tqdm(json_coco['annotations']): 18 | cat_id = ann['category_id'] 19 | cat_info = cat_id2cat_info[cat_id] 20 | frequency = cat_info['frequency'] 21 | if frequency in ['f', 'c']: 22 | annotations.append(ann) 23 | 24 | json_coco['annotations'] = annotations 25 | 26 | with open(args.out_path, 'w') as f: 27 | json.dump(json_coco, f) 28 | -------------------------------------------------------------------------------- /ovdet/tools/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /ovdet/tools/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /ovdet/tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import os.path as osp 5 | 6 | from mmengine.config import Config, DictAction 7 | from mmengine.evaluator import DumpResults 8 | from mmengine.runner import Runner 9 | 10 | from mmdet.engine.hooks.utils import trigger_visualization_hook 11 | from mmdet.registry import RUNNERS 12 | from mmdet.utils import register_all_modules 13 | import ovdet # noqa 14 | 15 | 16 | # TODO: support fuse_conv_bn and format_only 17 | def parse_args(): 18 | parser = argparse.ArgumentParser( 19 | description='MMDet test (and eval) a model') 20 | parser.add_argument('config', help='test config file path') 21 | parser.add_argument('checkpoint', help='checkpoint file') 22 | parser.add_argument( 23 | '--work-dir', 24 | help='the directory to save the file containing evaluation metrics') 25 | parser.add_argument( 26 | '--out', 27 | type=str, 28 | help='dump predictions to a pickle file for offline evaluation') 29 | parser.add_argument( 30 | '--show', action='store_true', help='show prediction results') 31 | parser.add_argument( 32 | '--show-dir', 33 | help='directory where painted images will be saved. 
' 34 | 'If specified, it will be automatically saved ' 35 | 'to the work_dir/timestamp/show_dir') 36 | parser.add_argument( 37 | '--wait-time', type=float, default=2, help='the interval of show (s)') 38 | parser.add_argument( 39 | '--cfg-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='override some settings in the used config, the key-value pair ' 43 | 'in xxx=yyy format will be merged into config file. If the value to ' 44 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 45 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 46 | 'Note that the quotation marks are necessary and that no white space ' 47 | 'is allowed.') 48 | parser.add_argument( 49 | '--launcher', 50 | choices=['none', 'pytorch', 'slurm', 'mpi'], 51 | default='none', 52 | help='job launcher') 53 | parser.add_argument('--local_rank', type=int, default=0) 54 | args = parser.parse_args() 55 | if 'LOCAL_RANK' not in os.environ: 56 | os.environ['LOCAL_RANK'] = str(args.local_rank) 57 | return args 58 | 59 | 60 | def main(): 61 | args = parse_args() 62 | 63 | # register all modules in mmdet into the registries 64 | # do not init the default scope here because it will be init in the runner 65 | register_all_modules(init_default_scope=False) 66 | 67 | # load config 68 | cfg = Config.fromfile(args.config) 69 | cfg.launcher = args.launcher 70 | if args.cfg_options is not None: 71 | cfg.merge_from_dict(args.cfg_options) 72 | 73 | # work_dir is determined in this priority: CLI > segment in file > filename 74 | if args.work_dir is not None: 75 | # update configs according to CLI args if args.work_dir is not None 76 | cfg.work_dir = args.work_dir 77 | elif cfg.get('work_dir', None) is None: 78 | # use config filename as default work_dir if cfg.work_dir is None 79 | cfg.work_dir = osp.join('./work_dirs', 80 | osp.splitext(osp.basename(args.config))[0]) 81 | 82 | cfg.load_from = args.checkpoint 83 | 84 | if args.show or args.show_dir: 85 | cfg = trigger_visualization_hook(cfg, args) 86 | 87 | # build the runner from config 88 | if 'runner_type' not in cfg: 89 | # build the default runner 90 | runner = Runner.from_cfg(cfg) 91 | else: 92 | # build customized runner from the registry 93 | # if 'runner_type' is set in the cfg 94 | runner = RUNNERS.build(cfg) 95 | 96 | # add `DumpResults` dummy metric 97 | if args.out is not None: 98 | assert args.out.endswith(('.pkl', '.pickle')), \ 99 | 'The dump file must be a pkl file.' 100 | runner.test_evaluator.metrics.append( 101 | DumpResults(out_file_path=args.out)) 102 | 103 | # start testing 104 | runner.test() 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /ovdet/tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
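# Illustrative single-GPU launch (the config path below is only a placeholder, not a file guaranteed to exist in this repo):
#     python tools/train.py path/to/your_config.py --work-dir work_dirs/demo --amp
# The companion scripts tools/dist_train.sh and tools/slurm_train.sh wrap this entry point for multi-GPU and Slurm launches.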
2 | import argparse 3 | import logging 4 | import os 5 | import os.path as osp 6 | 7 | from mmengine.config import Config, DictAction 8 | from mmengine.logging import print_log 9 | from mmengine.registry import RUNNERS 10 | from mmengine.runner import Runner 11 | 12 | from mmdet.utils import register_all_modules 13 | 14 | import ovdet # noqa 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser(description='Train a detector') 19 | parser.add_argument('config', help='train config file path') 20 | parser.add_argument('--work-dir', help='the dir to save logs and models') 21 | parser.add_argument( 22 | '--amp', 23 | action='store_true', 24 | default=False, 25 | help='enable automatic-mixed-precision training') 26 | parser.add_argument( 27 | '--auto-scale-lr', 28 | action='store_true', 29 | help='enable automatically scaling LR.') 30 | parser.add_argument( 31 | '--resume', 32 | nargs='?', 33 | type=str, 34 | const='auto', 35 | help='If specify checkpoint path, resume from it, while if not ' 36 | 'specify, try to auto resume from the latest checkpoint ' 37 | 'in the work directory.') 38 | parser.add_argument( 39 | '--cfg-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='override some settings in the used config, the key-value pair ' 43 | 'in xxx=yyy format will be merged into config file. If the value to ' 44 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 45 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 46 | 'Note that the quotation marks are necessary and that no white space ' 47 | 'is allowed.') 48 | parser.add_argument( 49 | '--launcher', 50 | choices=['none', 'pytorch', 'slurm', 'mpi'], 51 | default='none', 52 | help='job launcher') 53 | parser.add_argument('--local_rank', type=int, default=0) 54 | args = parser.parse_args() 55 | if 'LOCAL_RANK' not in os.environ: 56 | os.environ['LOCAL_RANK'] = str(args.local_rank) 57 | 58 | return args 59 | 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # register all modules in mmdet into the registries 65 | # do not init the default scope here because it will be init in the runner 66 | register_all_modules(init_default_scope=False) 67 | 68 | # load config 69 | cfg = Config.fromfile(args.config) 70 | cfg.launcher = args.launcher 71 | if args.cfg_options is not None: 72 | cfg.merge_from_dict(args.cfg_options) 73 | 74 | # work_dir is determined in this priority: CLI > segment in file > filename 75 | if args.work_dir is not None: 76 | # update configs according to CLI args if args.work_dir is not None 77 | cfg.work_dir = args.work_dir 78 | elif cfg.get('work_dir', None) is None: 79 | # use config filename as default work_dir if cfg.work_dir is None 80 | cfg.work_dir = osp.join('./work_dirs', 81 | osp.splitext(osp.basename(args.config))[0]) 82 | 83 | # enable automatic-mixed-precision training 84 | if args.amp is True: 85 | optim_wrapper = cfg.optim_wrapper.type 86 | if optim_wrapper == 'AmpOptimWrapper': 87 | print_log( 88 | 'AMP training is already enabled in your config.', 89 | logger='current', 90 | level=logging.WARNING) 91 | else: 92 | assert optim_wrapper == 'OptimWrapper', ( 93 | '`--amp` is only supported when the optimizer wrapper type is ' 94 | f'`OptimWrapper` but got {optim_wrapper}.') 95 | cfg.optim_wrapper.type = 'AmpOptimWrapper' 96 | cfg.optim_wrapper.loss_scale = 'dynamic' 97 | 98 | # enable automatically scaling LR 99 | if args.auto_scale_lr: 100 | if 'auto_scale_lr' in cfg and \ 101 | 'enable' in cfg.auto_scale_lr and \ 102 | 'base_batch_size' in 
cfg.auto_scale_lr: 103 | cfg.auto_scale_lr.enable = True 104 | else: 105 | raise RuntimeError('Can not find "auto_scale_lr" or ' 106 | '"auto_scale_lr.enable" or ' 107 | '"auto_scale_lr.base_batch_size" in your' 108 | ' configuration file.') 109 | 110 | # resume is determined in this priority: resume from > auto_resume 111 | if args.resume == 'auto': 112 | cfg.resume = True 113 | cfg.load_from = None 114 | elif args.resume is not None: 115 | cfg.resume = True 116 | cfg.load_from = args.resume 117 | 118 | # build the runner from config 119 | if 'runner_type' not in cfg: 120 | # build the default runner 121 | runner = Runner.from_cfg(cfg) 122 | else: 123 | # build customized runner from the registry 124 | # if 'runner_type' is set in the cfg 125 | runner = RUNNERS.build(cfg) 126 | 127 | # start training 128 | runner.train() 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /requirements-training.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | webdataset>=0.2.5 4 | regex 5 | ftfy 6 | tqdm 7 | pandas 8 | braceexpand 9 | huggingface_hub 10 | transformers 11 | timm 12 | fsspec 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | regex 4 | ftfy 5 | tqdm 6 | huggingface_hub 7 | sentencepiece 8 | protobuf<4 9 | timm 10 | panopticapi@git+https://github.com/cocodataset/panopticapi.git -------------------------------------------------------------------------------- /scripts/test_openai_vitb16_macc_boxes_masks.sh: -------------------------------------------------------------------------------- 1 | NAME=$1 2 | CHECKPOINT=$2 3 | torchrun --nproc_per_node 4 -m training.main --batch-size=1 \ 4 | --model ViT-B-16 --pretrained openai --test-type coco_panoptic --train-data="" \ 5 | --val-data data/coco/annotations/panoptic_val2017.json \ 6 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy \ 7 | --val-image-root data/coco/val2017 --cache-dir $CHECKPOINT --extract-type="v2" \ 8 | --name $NAME --downsample-factor 16 --det-image-size 1024 9 | -------------------------------------------------------------------------------- /scripts/train_clim_cc3m_3e_openai_vitb16.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 4 -m training.main --batch-size=32 --lr=1e-5 --wd=0.1 --epochs=3 --workers=4 \ 2 | --model ViT-B-16 --pretrained openai --warmup 1000 --zeroshot-frequency 1 --dataset-type coco_caption \ 3 | --test-type coco_panoptic --train-data data/cc3m/cc3m_train_original_size_filtered.json \ 4 | --val-data data/coco/annotations/panoptic_val2017.json \ 5 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy --train-image-root="data/cc3m" \ 6 | --val-image-root data/coco/val2017 --cache-dir checkpoints --log-every-n-steps 50 \ 7 | --lock-image --save-frequency 3 --lock-image-unlocked-groups 6 --extract-type="v2" \ 8 | --name clim_cc3m_3_save3_test1_openai_vitb16_6layers --downsample-factor 16 --det-image-size 1024 \ 9 | --alpha 0.7 --train-image-size 1024 10 | -------------------------------------------------------------------------------- /scripts/train_clim_coco_100e_openai_vitb16.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 
4 -m training.main --batch-size=32 --lr=1e-5 --wd=0.1 --epochs=100 --workers=4 \ 2 | --model ViT-B-16 --pretrained openai --warmup 1000 --zeroshot-frequency 10 --dataset-type coco_caption \ 3 | --test-type coco_panoptic --train-data data/coco/wusize/captions_train2017_tags_allcaps.json \ 4 | --val-data data/coco/annotations/panoptic_val2017.json \ 5 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy --train-image-root="data/coco/train2017" \ 6 | --val-image-root data/coco/val2017 --cache-dir checkpoints --log-every-n-steps 50 \ 7 | --lock-image --save-frequency 100 --lock-image-unlocked-groups 6 --extract-type="v2" \ 8 | --name clim_coco_100_save100_test10_openai_vitb16_6layers --downsample-factor 16 --det-image-size 1024 \ 9 | --alpha 0.7 --train-image-size 1024 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ Setup 2 | """ 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | def _read_reqs(relpath): 14 | fullpath = path.join(path.dirname(__file__), relpath) 15 | with open(fullpath) as f: 16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))] 17 | 18 | REQUIREMENTS = _read_reqs("requirements.txt") 19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt") 20 | 21 | exec(open('src/open_clip/version.py').read()) 22 | setup( 23 | name='open_clip_torch', 24 | version=__version__, 25 | description='OpenCLIP', 26 | long_description=long_description, 27 | long_description_content_type='text/markdown', 28 | url='https://github.com/mlfoundations/open_clip', 29 | author='', 30 | author_email='', 31 | classifiers=[ 32 | # How mature is this project? Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Education', 38 | 'Intended Audience :: Science/Research', 39 | 'License :: OSI Approved :: Apache Software License', 40 | 'Programming Language :: Python :: 3.7', 41 | 'Programming Language :: Python :: 3.8', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Topic :: Scientific/Engineering', 45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 46 | 'Topic :: Software Development', 47 | 'Topic :: Software Development :: Libraries', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | 51 | # Note that this is a string of words separated by whitespace, not a list. 
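# Rough install note (an assumption based on the fields below, not an official instruction): `pip install -e .`
# would install the runtime deps from requirements.txt, while `pip install -e .[training]` would additionally
# pull requirements-training.txt through the "training" extra declared in extras_require.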
52 | keywords='CLIP pretrained', 53 | package_dir={'': 'src'}, 54 | packages=find_packages(where='src'), 55 | include_package_data=True, 56 | install_requires=REQUIREMENTS, 57 | extras_require={ 58 | "training": TRAINING_REQUIREMENTS, 59 | }, 60 | python_requires='>=3.7', 61 | ) 62 | -------------------------------------------------------------------------------- /src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | -------------------------------------------------------------------------------- /src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /src/open_clip/customs.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.nn import MultiheadAttention 3 | from torch.nn import functional as F 4 | from typing import Optional, Tuple 5 | 6 | 7 | class MultiheadSelfAttention(MultiheadAttention): 8 | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, 9 | need_weights: bool = True, attn_mask: Optional[Tensor] = None, return_tokens: bool = False) \ 10 | -> Tuple[Tensor, Tensor]: 11 | assert query is value and value is key # self-attention 12 | if return_tokens: 13 | # in_projection 14 | tokens = F.linear(value, self.in_proj_weight, bias=self.in_proj_bias)[..., -self.embed_dim:] 15 | # out_projection 16 | tokens = F.linear(tokens, self.out_proj.weight, bias=self.out_proj.bias) 17 | else: 18 | tokens = None 19 | 20 | attn_output, attn_output_weights = F.multi_head_attention_forward( 21 | query=query, key=key, value=value, 22 | embed_dim_to_check=self.embed_dim, 23 | num_heads=self.num_heads, 24 | in_proj_weight=self.in_proj_weight, 25 | in_proj_bias=self.in_proj_bias, 26 | bias_k=None, bias_v=None, 27 | add_zero_attn=False, 28 | dropout_p=0., 29 | out_proj_weight=self.out_proj.weight, 30 | out_proj_bias=self.out_proj.bias, 31 | 
training=self.training, 32 | key_padding_mask=key_padding_mask, need_weights=need_weights, 33 | attn_mask=attn_mask) 34 | 35 | return attn_output, tokens # , attn_output_weights 36 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform -------------------------------------------------------------------------------- /src/open_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/open_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | 
"pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings" 54 | }, 55 | "pooler": "mean_pooler", 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 
0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 8 | CenterCrop 9 | 10 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 11 | 12 | 13 | class ResizeMaxSize(nn.Module): 14 | 15 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 16 | super().__init__() 17 | if not isinstance(max_size, int): 18 | raise 
TypeError(f"Size should be int. Got {type(max_size)}") 19 | self.max_size = max_size 20 | self.interpolation = interpolation 21 | self.fn = min if fn == 'min' else min 22 | self.fill = fill 23 | 24 | def forward(self, img): 25 | if isinstance(img, torch.Tensor): 26 | height, width = img.shape[:2] 27 | else: 28 | width, height = img.size 29 | scale = self.max_size / float(max(height, width)) 30 | if scale != 1.0: 31 | new_size = tuple(round(dim * scale) for dim in (height, width)) 32 | img = F.resize(img, new_size, self.interpolation) 33 | pad_h = self.max_size - new_size[0] 34 | pad_w = self.max_size - new_size[1] 35 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 36 | return img 37 | 38 | 39 | def _convert_to_rgb(image): 40 | return image.convert('RGB') 41 | 42 | 43 | # class CatGen(nn.Module): 44 | # def __init__(self, num=4): 45 | # self.num = num 46 | # def mixgen_batch(image, text): 47 | # batch_size = image.shape[0] 48 | # index = np.random.permutation(batch_size) 49 | 50 | # cat_images = [] 51 | # for i in range(batch_size): 52 | # # image mixup 53 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 54 | # # text concat 55 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 56 | # text = torch.stack(text) 57 | # return image, text 58 | 59 | 60 | def image_transform( 61 | image_size: int, 62 | is_train: bool, 63 | mean: Optional[Tuple[float, ...]] = None, 64 | std: Optional[Tuple[float, ...]] = None, 65 | resize_longest_max: bool = False, 66 | fill_color: int = 0, 67 | ): 68 | mean = mean or OPENAI_DATASET_MEAN 69 | if not isinstance(mean, (list, tuple)): 70 | mean = (mean,) * 3 71 | 72 | std = std or OPENAI_DATASET_STD 73 | if not isinstance(std, (list, tuple)): 74 | std = (std,) * 3 75 | 76 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 77 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 78 | image_size = image_size[0] 79 | 80 | normalize = Normalize(mean=mean, std=std) 81 | if is_train: 82 | return Compose([ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ]) 88 | else: 89 | if resize_longest_max: 90 | transforms = [ 91 | ResizeMaxSize(image_size, fill=fill_color) 92 | ] 93 | else: 94 | transforms = [ 95 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 96 | CenterCrop(image_size), 97 | ] 98 | transforms.extend([ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ]) 103 | return Compose(transforms) 104 | -------------------------------------------------------------------------------- /src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": 
"embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 
9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | 
"patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | 
"heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | 
"vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xlarge.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | from torch import nn as nn 5 | from torchvision.ops.misc import FrozenBatchNorm2d 6 | 7 | 8 | def freeze_batch_norm_2d(module, module_match={}, name=''): 9 | """ 10 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 11 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 12 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 13 | 14 | Args: 15 | module (torch.nn.Module): Any PyTorch module. 
16 | module_match (dict): Dictionary of full module names to freeze (all if empty) 17 | name (str): Full module name (prefix) 18 | 19 | Returns: 20 | torch.nn.Module: Resulting module 21 | 22 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 23 | """ 24 | res = module 25 | is_match = True 26 | if module_match: 27 | is_match = name in module_match 28 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 29 | res = FrozenBatchNorm2d(module.num_features) 30 | res.num_features = module.num_features 31 | res.affine = module.affine 32 | if module.affine: 33 | res.weight.data = module.weight.data.clone().detach() 34 | res.bias.data = module.bias.data.clone().detach() 35 | res.running_mean.data = module.running_mean.data 36 | res.running_var.data = module.running_var.data 37 | res.eps = module.eps 38 | else: 39 | for child_name, child in module.named_children(): 40 | full_child_name = '.'.join([name, child_name]) if name else child_name 41 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 42 | if new_child is not child: 43 | res.add_module(child_name, new_child) 44 | return res 45 | 46 | 47 | # From PyTorch internals 48 | def _ntuple(n): 49 | def parse(x): 50 | if isinstance(x, collections.abc.Iterable): 51 | return x 52 | return tuple(repeat(x, n)) 53 | return parse 54 | 55 | 56 | to_1tuple = _ntuple(1) 57 | to_2tuple = _ntuple(2) 58 | to_3tuple = _ntuple(3) 59 | to_4tuple = _ntuple(4) 60 | to_ntuple = lambda n, x: _ntuple(n)(x) 61 | -------------------------------------------------------------------------------- /src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.16.0' 2 | -------------------------------------------------------------------------------- /src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /src/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/training/__init__.py -------------------------------------------------------------------------------- /src/training/clim.py: -------------------------------------------------------------------------------- 1 | # TODO: process mosaicked image 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | class CLIM: 7 | mosaic_choices = [2, 3, 4] 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def __call__(self, batch, model, dist_model, loss, device, cast_dtype, 13 | distributed, args): 14 | if distributed: 15 | model = model.module 16 | images, texts = batch 17 | images = images.to(device=device, dtype=cast_dtype, non_blocking=True) 18 | texts = texts.to(device=device, non_blocking=True) 19 | 20 | mosaicked_images, pseudo_boxes_list, single_images \ 21 | = self.split_a_batch(images, args.train_image_size) 22 | single_image_features = model.encode_image(single_images, normalize=True) 23 | with torch.no_grad(): 24 | text_features = model.encode_text(texts, normalize=True) 25 | logit_scale = model.logit_scale.exp() 26 | 27 | pseudo_region_features = model.encode_pseudo_boxes( 28 | mosaicked_images, pseudo_boxes_list, normalize=True, extract_type=args.extract_type) 29 | image_features = 
torch.cat([pseudo_region_features, single_image_features], dim=0) 30 | 31 | contrast_loss = loss(image_features, 32 | text_features, 33 | logit_scale, 34 | output_dict=False, ) 35 | 36 | losses = dict(loss_contrast=contrast_loss * args.contrast_weight) 37 | 38 | return losses, len(images), logit_scale 39 | 40 | 41 | @staticmethod 42 | def _generate_normed_boxes(M, N): 43 | grid_x, grid_y = torch.meshgrid(torch.linspace(0, 1, N + 1), torch.linspace(0, 1, M + 1), 44 | indexing='xy') 45 | x0y0s = torch.stack([grid_x[:M, :N], grid_y[:M, :N]], dim=-1) 46 | x1y1s = torch.stack([grid_x[1:, 1:], grid_y[1:, 1:]], dim=-1) 47 | pseudo_boxes = torch.cat([x0y0s, x1y1s], 48 | dim=-1).view(-1, 4) 49 | return pseudo_boxes 50 | 51 | def split_a_batch(self, images, train_image_size): 52 | batch_size = images.shape[0] 53 | choices = self.mosaic_choices 54 | min_images = sum([c**2 for c in choices]) 55 | 56 | assert batch_size >= min_images 57 | num_single = batch_size % min_images 58 | num_groups = batch_size // min_images 59 | # assert num_single == 0 60 | split = [c for c in choices for _ in range(num_groups)] 61 | # split = [2] * num_groups + [3] * num_groups + [4] * num_groups 62 | pseudo_boxes_list = [self._generate_normed_boxes(s, s).to(images) for s in split] 63 | 64 | images_list = torch.split(images, [s**2 for s in split] + [num_single], dim=0) 65 | 66 | mosaicked_images_list = [ 67 | F.interpolate(self._mosaic_a_minibatch(imgs, s, s), size=train_image_size, mode='bicubic') 68 | for imgs, s in zip(images_list[:-1], split)] 69 | 70 | mosaicked_images = torch.cat(mosaicked_images_list) 71 | 72 | return mosaicked_images, pseudo_boxes_list, images_list[-1] 73 | 74 | @staticmethod 75 | def _mosaic_a_minibatch(images, M, N): 76 | bs, _, h, w = images.shape 77 | assert bs % (M * N) == 0 78 | num_mosaic = bs // (M*N) 79 | images_for_mosaic = images.permute(0, 2, 3, 1) 80 | images_for_mosaic = images_for_mosaic.view(num_mosaic, M, N, h, w, 3) 81 | images_for_mosaic = images_for_mosaic.permute(0, 1, 3, 2, 4, 5).contiguous() 82 | mosaicked_images = images_for_mosaic.view(num_mosaic, M * h, N * w, 3) 83 | mosaicked_images = mosaicked_images.permute(0, 3, 1, 2) 84 | 85 | return mosaicked_images 86 | -------------------------------------------------------------------------------- /src/training/custom_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.transforms.functional as F 5 | from torchvision.transforms import RandomCrop, InterpolationMode 6 | 7 | 8 | class CustomRandomResize(nn.Module): 9 | 10 | def __init__(self, scale=(0.5, 2.0), interpolation=InterpolationMode.BILINEAR): 11 | super().__init__() 12 | self.min_scale, self.max_scale = min(scale), max(scale) 13 | self.interpolation = interpolation 14 | 15 | def forward(self, img): 16 | if isinstance(img, torch.Tensor): 17 | height, width = img.shape[:2] 18 | else: 19 | width, height = img.size 20 | scale = random.uniform(self.min_scale, self.max_scale) 21 | new_size = [int(height * scale), int(width * scale)] 22 | img = F.resize(img, new_size, self.interpolation) 23 | 24 | return img 25 | 26 | 27 | class CustomRandomCrop(RandomCrop): 28 | def forward(self, img): 29 | """ 30 | Args: 31 | img (PIL Image or Tensor): Image to be cropped. 32 | 33 | Returns: 34 | PIL Image or Tensor: Cropped image. 
35 | """ 36 | 37 | width, height = F.get_image_size(img) 38 | tar_h, tar_w = self.size 39 | 40 | tar_h = min(tar_h, height) 41 | tar_w = min(tar_w, width) 42 | i, j, h, w = self.get_params(img, (tar_h, tar_w)) 43 | 44 | return F.crop(img, i, j, h, w) 45 | -------------------------------------------------------------------------------- /src/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | try: 7 | import horovod.torch as hvd 8 | except ImportError: 9 | hvd = None 10 | 11 | 12 | def is_global_master(args): 13 | return args.rank == 0 14 | 15 | 16 | def is_local_master(args): 17 | return args.local_rank == 0 18 | 19 | 20 | def is_master(args, local=False): 21 | return is_local_master(args) if local else is_global_master(args) 22 | 23 | 24 | def is_using_horovod(): 25 | # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set 26 | # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required... 27 | ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 28 | pmi_vars = ["PMI_RANK", "PMI_SIZE"] 29 | if all([var in os.environ for var in ompi_vars]) or all([var in os.environ for var in pmi_vars]): 30 | return True 31 | else: 32 | return False 33 | 34 | 35 | def is_using_distributed(): 36 | if 'WORLD_SIZE' in os.environ: 37 | return int(os.environ['WORLD_SIZE']) > 1 38 | if 'SLURM_NTASKS' in os.environ: 39 | return int(os.environ['SLURM_NTASKS']) > 1 40 | return False 41 | 42 | 43 | def world_info_from_env(): 44 | local_rank = 0 45 | for v in ('LOCAL_RANK', 'MPI_LOCALRANKID', 'SLURM_LOCALID', 'OMPI_COMM_WORLD_LOCAL_RANK'): 46 | if v in os.environ: 47 | local_rank = int(os.environ[v]) 48 | break 49 | global_rank = 0 50 | for v in ('RANK', 'PMI_RANK', 'SLURM_PROCID', 'OMPI_COMM_WORLD_RANK'): 51 | if v in os.environ: 52 | global_rank = int(os.environ[v]) 53 | break 54 | world_size = 1 55 | for v in ('WORLD_SIZE', 'PMI_SIZE', 'SLURM_NTASKS', 'OMPI_COMM_WORLD_SIZE'): 56 | if v in os.environ: 57 | world_size = int(os.environ[v]) 58 | break 59 | 60 | return local_rank, global_rank, world_size 61 | 62 | 63 | def init_distributed_device(args): 64 | # Distributed training = training on more than one GPU. 65 | # Works in both single and multi-node scenarios. 
66 | args.distributed = False 67 | args.world_size = 1 68 | args.rank = 0 # global rank 69 | args.local_rank = 0 70 | if args.horovod: 71 | assert hvd is not None, "Horovod is not installed" 72 | hvd.init() 73 | args.local_rank = int(hvd.local_rank()) 74 | args.rank = hvd.rank() 75 | args.world_size = hvd.size() 76 | args.distributed = True 77 | os.environ['LOCAL_RANK'] = str(args.local_rank) 78 | os.environ['RANK'] = str(args.rank) 79 | os.environ['WORLD_SIZE'] = str(args.world_size) 80 | elif is_using_distributed(): 81 | if 'SLURM_PROCID' in os.environ: 82 | # DDP via SLURM 83 | args.local_rank, args.rank, args.world_size = world_info_from_env() 84 | # SLURM var -> torch.distributed vars in case needed 85 | os.environ['LOCAL_RANK'] = str(args.local_rank) 86 | os.environ['RANK'] = str(args.rank) 87 | os.environ['WORLD_SIZE'] = str(args.world_size) 88 | torch.distributed.init_process_group( 89 | backend=args.dist_backend, 90 | init_method=args.dist_url, 91 | world_size=args.world_size, 92 | rank=args.rank, 93 | ) 94 | else: 95 | # DDP via torchrun, torch.distributed.launch 96 | args.local_rank, _, _ = world_info_from_env() 97 | torch.distributed.init_process_group( 98 | backend=args.dist_backend, 99 | init_method=args.dist_url) 100 | args.world_size = torch.distributed.get_world_size() 101 | args.rank = torch.distributed.get_rank() 102 | args.distributed = True 103 | 104 | if torch.cuda.is_available(): 105 | if args.distributed and not args.no_set_device_rank: 106 | device = 'cuda:%d' % args.local_rank 107 | else: 108 | device = 'cuda:0' 109 | torch.cuda.set_device(device) 110 | else: 111 | device = 'cpu' 112 | args.device = device 113 | device = torch.device(device) 114 | return device 115 | 116 | 117 | def broadcast_object(args, obj, src=0): 118 | # broadcast a pickle-able python object from rank-0 to all ranks 119 | if args.horovod: 120 | return hvd.broadcast_object(obj, root_rank=src) 121 | else: 122 | if args.rank == src: 123 | objects = [obj] 124 | else: 125 | objects = [None] 126 | dist.broadcast_object_list(objects, src=src) 127 | return objects[0] 128 | 129 | 130 | def all_gather_object(args, obj, dst=0): 131 | # gather a pickle-able python object across all ranks 132 | if args.horovod: 133 | return hvd.allgather_object(obj) 134 | else: 135 | objects = [None for _ in range(args.world_size)] 136 | dist.all_gather_object(objects, obj) 137 | return objects 138 | -------------------------------------------------------------------------------- /src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 
27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | logger.setLevel(level) 17 | 18 | stream_handler = logging.StreamHandler() 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if log_file: 23 | file_handler = logging.FileHandler(filename=log_file) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | -------------------------------------------------------------------------------- /src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- 
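`get_autocast` in `precision.py` returns a context-manager factory, so the same training step can run under fp16 AMP, bf16 AMP, or plain fp32 without branching at the call site. A minimal sketch of how it can be used (assuming `src/` is on `PYTHONPATH`; the model and batch here are placeholders):

```python
import torch
from training.precision import get_autocast  # assumes src/ is on PYTHONPATH

autocast = get_autocast("amp_bf16")  # "amp", "amp_bf16"/"amp_bfloat16", or anything else for fp32

model = torch.nn.Linear(512, 512)
x = torch.randn(4, 512)
if torch.cuda.is_available():  # CUDA autocast only has an effect on GPU tensors
    model, x = model.cuda(), x.cuda()

with autocast():  # contextlib.suppress (a no-op) when the precision string is not an AMP mode
    y = model(x)

print(y.dtype)  # torch.bfloat16 under bf16 autocast on CUDA, torch.float32 otherwise
```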
/src/training/region_clip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | import torch.nn as nn 5 | 6 | 7 | def get_fed_loss_inds(gt_classes, num_sample_cats, C): 8 | appeared = torch.unique(gt_classes) # C' 9 | prob = appeared.new_ones(C).float() 10 | if len(appeared) < num_sample_cats: 11 | prob[appeared] = 0 12 | more_appeared = torch.multinomial( 13 | prob, num_sample_cats - len(appeared), 14 | replacement=False) 15 | appeared = torch.cat([appeared, more_appeared]) 16 | return appeared 17 | 18 | 19 | class RegionCLIP(nn.Module): 20 | def __init__(self, args): 21 | super().__init__() 22 | embed_path = args.train_embed_path 23 | noun_embeddings = torch.from_numpy(np.load(embed_path)) 24 | noun_embeddings = F.normalize(noun_embeddings, dim=-1) 25 | self.register_buffer("noun_embeddings", noun_embeddings) 26 | self.place_holder = nn.Parameter(torch.ones(1)) 27 | 28 | def __call__(self, batch, model, dist_model, loss, device, cast_dtype, 29 | distributed, args): 30 | if distributed: 31 | model = model.module 32 | images, boxes = batch 33 | images = images.to(device=device, dtype=cast_dtype, non_blocking=True) 34 | boxes = boxes.to(device=device, non_blocking=True) 35 | 36 | boxes_list = [] 37 | boxes_label_list = [] 38 | 39 | for boxes_per_image in boxes: 40 | boxes_per_image = boxes_per_image[boxes_per_image[:, -1] > 0.5] 41 | boxes_label_list.append(boxes_per_image[:, 4].long()) 42 | boxes_list.append(boxes_per_image[:, :4]) 43 | boxes_labels = torch.cat(boxes_label_list) 44 | box_features = model.encode_pseudo_boxes(images, boxes_list, normalize=True, 45 | extract_type=args.extract_type) 46 | temp = model.logit_scale.exp().detach() 47 | boxes2nouns = box_features @ self.noun_embeddings.T * temp 48 | target = torch.zeros_like(boxes2nouns) 49 | target[range(len(boxes_labels)), boxes_labels] = 1.0 50 | 51 | appeared = get_fed_loss_inds(boxes_labels, 100, self.noun_embeddings.shape[0]) 52 | target = target[:, appeared] 53 | boxes2nouns = boxes2nouns[:, appeared] 54 | 55 | loss_cls = F.binary_cross_entropy_with_logits(boxes2nouns, target, reduction='none') # B x C 56 | loss_cls = loss_cls.sum(-1).mean() 57 | 58 | image_size = model.visual.image_size 59 | if isinstance(image_size, int): 60 | tar_h = tar_w = image_size 61 | else: 62 | tar_h, tar_w = image_size 63 | images = F.interpolate(images, size=(tar_h, tar_w), mode='bilinear') 64 | 65 | losses = dict(loss_contrast=loss_cls * args.contrast_weight) 66 | 67 | return losses, len(images), temp 68 | -------------------------------------------------------------------------------- /src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - 
cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /src/training/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from functools import partial 3 | from six.moves import map, zip 4 | 5 | 6 | def multi_apply(func, *args, **kwargs): 7 | """Apply function to a list of arguments. 8 | Note: 9 | This function applies the ``func`` to multiple inputs and 10 | map the multiple outputs of the ``func`` into different 11 | list. Each list contains the same type of outputs corresponding 12 | to different inputs. 13 | Args: 14 | func (Function): A function that will be applied to a list of 15 | arguments 16 | Returns: 17 | tuple(list): A tuple containing multiple list, each list contains \ 18 | a kind of returned results by the function 19 | """ 20 | pfunc = partial(func, **kwargs) if kwargs else func 21 | map_results = map(pfunc, *args) 22 | return tuple(map(list, zip(*map_results))) 23 | 24 | 25 | def mask2box(mask): 26 | ys, xs = np.where(mask) 27 | y0, y1 = ys.min(), ys.max() 28 | x0, x1 = xs.min(), xs.max() 29 | 30 | return x0, y0, x1, y1 31 | --------------------------------------------------------------------------------
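`multi_apply` and `mask2box` in `src/training/utils.py` are small helpers: the first maps a function over parallel lists and regroups the per-call tuples into per-field lists, the second shrinks a binary mask to a tight xyxy box. A quick self-contained check (the mask and the `scale_box` helper are made up for illustration; `src/` is assumed to be on `PYTHONPATH`):

```python
import numpy as np
from training.utils import mask2box, multi_apply  # assumes src/ is on PYTHONPATH

# mask2box: tight box around the nonzero region, returned as (x0, y0, x1, y1)
# with inclusive max indices.
mask = np.zeros((8, 8), dtype=bool)
mask[2:5, 3:7] = True
print(mask2box(mask))  # (3, 2, 6, 4)

# multi_apply: call a function element-wise over parallel lists and regroup
# its tuple outputs into one list per output field.
def scale_box(box, factor):
    x0, y0, x1, y1 = box
    return [c * factor for c in box], (x1 - x0) * (y1 - y0)

scaled, areas = multi_apply(scale_box, [(0, 0, 2, 2), (1, 1, 4, 3)], [2, 2])
print(scaled)  # [[0, 0, 4, 4], [2, 2, 8, 6]]
print(areas)   # [4, 6]
```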