├── .gitignore ├── LICENSE ├── README.md ├── README_CLIP.md ├── metadata ├── coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy ├── coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy ├── coco_panoptic_clip_hand_craft_RN50x64.npy ├── coco_panoptic_clip_hand_craft_ViTB16.npy └── coco_panoptic_clip_hand_craft_ViTL14x336.npy ├── ovdet ├── DATA.md ├── INSTALLATION.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_ovd_base.py │ │ │ ├── coco_ovd_base_lsj.py │ │ │ ├── coco_ovd_detic.py │ │ │ ├── coco_ovd_detic_clim.py │ │ │ ├── lvis_v1_ovd_base.py │ │ │ ├── lvis_v1_ovd_base_lsj.py │ │ │ └── lvis_v1_ovd_base_lsj_640.py │ │ ├── iter_based_runtime.py │ │ ├── models │ │ │ ├── faster-rcnn_r50_fpn_syncbn.py │ │ │ └── mask-rcnn_r50_fpn_syncbn.py │ │ └── schedules │ │ │ ├── schedule_180k.py │ │ │ ├── schedule_45k.py │ │ │ └── schedule_90k.py │ ├── clip_based │ │ ├── README.md │ │ ├── openai_rn50x64 │ │ │ └── mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py │ │ └── openai_vitb16 │ │ │ ├── faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py │ │ │ └── mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py │ └── detic │ │ ├── README.md │ │ ├── ov_coco │ │ ├── detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py │ │ ├── detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py │ │ └── faster_rcnn_r50_caffe_c4_90k_ovcoco.py │ │ └── ov_lvis │ │ ├── detic_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py │ │ ├── detic_centernet2_r50_fpn_4x_lvis_boxsup.py │ │ └── detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py ├── data │ └── metadata │ │ ├── coco_clip_hand_craft.npy │ │ ├── coco_openai_vitb16_hand_craft.npy │ │ ├── coco_openai_vitb16_hand_craft_with_background.npy │ │ ├── lvis_openai_rn50x64_hand_craft.npy │ │ ├── lvis_openai_rn50x64_hand_craft_with_background.npy │ │ ├── lvis_openai_vitb16_hand_craft.npy │ │ ├── lvis_openai_vitb16_hand_craft_with_background.npy │ │ ├── lvis_v1_clip_a+cname.npy │ │ ├── lvis_v1_train_cat_info.json │ │ └── lvis_v1_train_cat_norare_info.json ├── ovdet │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── cc3m_lvis_v1.py │ │ ├── coco_caption.py │ │ ├── pipelines │ │ │ ├── __init__.py │ │ │ └── mosaic.py │ │ └── samplers │ │ │ └── multi_source_sampler.py │ ├── methods │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── detic │ │ │ ├── __init__.py │ │ │ ├── detic_caption.py │ │ │ ├── detic_tags.py │ │ │ └── utils.py │ │ └── queues.py │ ├── models │ │ ├── __init__.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── clip_resnet.py │ │ │ └── clip_vit.py │ │ ├── dense_heads │ │ │ ├── __init__.py │ │ │ ├── centernet_rpn_head.py │ │ │ ├── iou_loss.py │ │ │ └── rpn_head.py │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ ├── centernet2.py │ │ │ ├── detic.py │ │ │ ├── fvlm.py │ │ │ └── two_stage.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── cross_entropy_loss.py │ │ │ └── heatmap_focal_loss.py │ │ ├── roi_heads │ │ │ ├── __init__.py │ │ │ ├── detic_bbox_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox_head.py │ │ │ │ ├── detic_bbox_head.py │ │ │ │ └── zero_shot_classifier.py │ │ │ ├── detic_roi_head.py │ │ │ ├── fvlm_bbox_heads │ │ │ │ ├── __init__.py │ │ │ │ └── convfc_bbox_head.py │ │ │ └── standard_roi_head.py │ │ └── vlms │ │ │ ├── __init__.py │ │ │ └── clip │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── common.py │ │ │ ├── image_encoder.py │ │ │ ├── model.py │ │ │ ├── openai_model.py │ │ │ ├── simple_tokenizer.py │ │ │ ├── text_encoder.py │ │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py └── tools 
│ ├── dist_test.sh │ ├── dist_train.sh │ ├── generate_text_embeddings.py │ ├── pre_processors │ ├── keep_coco_base.py │ ├── keep_coco_novel.py │ └── keep_lvis_base.py │ ├── slurm_test.sh │ ├── slurm_train.sh │ ├── test.py │ └── train.py ├── requirements-training.txt ├── requirements.txt ├── scripts ├── test_openai_vitb16_macc_boxes_masks.sh ├── train_clim_cc3m_3e_openai_vitb16.sh └── train_clim_coco_100e_openai_vitb16.sh ├── setup.py ├── src ├── open_clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── coca_model.py │ ├── constants.py │ ├── customs.py │ ├── eva_clip │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── constants.py │ │ ├── eva_vit_model.py │ │ ├── factory.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ └── EVA02-CLIP-bigE-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pretrained.py │ │ ├── rope.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ └── utils.py │ ├── factory.py │ ├── generation_utils.py │ ├── hf_configs.py │ ├── hf_model.py │ ├── loss.py │ ├── model.py │ ├── model_configs │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── RN50x64.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-M-16-alt.json │ │ ├── ViT-M-16.json │ │ ├── ViT-M-32-alt.json │ │ ├── ViT-M-32.json │ │ ├── ViT-S-16-alt.json │ │ ├── ViT-S-16.json │ │ ├── ViT-S-32-alt.json │ │ ├── ViT-S-32.json │ │ ├── ViT-bigG-14.json │ │ ├── ViT-e-14.json │ │ ├── ViT-g-14.json │ │ ├── coca_ViT-B-32.json │ │ ├── coca_ViT-L-14.json │ │ ├── coca_base.json │ │ ├── coca_roberta-ViT-B-32.json │ │ ├── convnext_base.json │ │ ├── convnext_base_w.json │ │ ├── convnext_base_w_320.json │ │ ├── convnext_large.json │ │ ├── convnext_large_d.json │ │ ├── convnext_large_d_320.json │ │ ├── convnext_small.json │ │ ├── convnext_tiny.json │ │ ├── convnext_xlarge.json │ │ ├── convnext_xxlarge.json │ │ ├── convnext_xxlarge_320.json │ │ ├── mt5-base-ViT-B-32.json │ │ ├── mt5-xl-ViT-H-14.json │ │ ├── roberta-ViT-B-32.json │ │ ├── swin_base_patch4_window7_224.json │ │ ├── vit_medium_patch16_gap_256.json │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ └── xlm-roberta-large-ViT-H-14.json │ ├── modified_resnet.py │ ├── openai.py │ ├── pretrained.py │ ├── push_to_hf_hub.py │ ├── timm_model.py │ ├── tokenizer.py │ ├── transform.py │ ├── transformer.py │ ├── utils.py │ └── version.py └── training │ ├── .gitignore │ ├── __init__.py │ ├── clim.py │ ├── coco_api.py │ ├── custom_transforms.py │ ├── data.py │ ├── dist_utils.py │ ├── distributed.py │ ├── file_utils.py │ ├── logger.py │ ├── main.py │ ├── params.py │ ├── precision.py │ ├── profile.py │ ├── region_clip.py │ ├── scheduler.py │ ├── train.py │ ├── utils.py │ └── zero_shot.py └── tools └── generate_text_embeddings.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | logs/ 2 | wandb/ 3 | features/ 4 | results/ 5 | 6 | tests/data/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | sync.sh 138 | gpu1sync.sh 139 | .idea 140 | *.pdf 141 | **/._* 142 | **/*DS_* 143 | **.jsonl 144 | src/sbatch 145 | src/misc 146 | .vscode 147 | src/debug 148 | core.* 149 | 150 | # Allow 151 | !src/evaluation/misc/results_dbs/* 152 | data/coco 153 | data/lvis 154 | checkpoints/ 155 | logs 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | S-Lab License 1.0 2 | 3 | Copyright 2022 S-Lab 4 | 5 | Redistribution and use for non-commercial purpose in source and 6 | binary forms, with or without modification, are permitted provided 7 | that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in 14 | the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | 3. Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | In the event that redistribution and/or use for commercial purpose in 34 | source or binary forms, with or without modification is required, 35 | please contact the contributor(s) of the work. 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLIM: Contrastive Language-Image Mosaic for Region Representation 2 | ## Introduction 3 | 4 | This is an official release of the paper 5 | **CLIM: Contrastive Language-Image Mosaic for Region Representation**. 6 | 7 | > [**CLIM: Contrastive Language-Image Mosaic for Region Representation**](https://arxiv.org/abs/2312.11376), 8 | > Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Wentao Liu, Chen Change Loy 9 | > [BibTeX](https://github.com/wusize/CLIM#citation) 10 | 11 | 12 | ## Application to CLIP 13 | 14 | Please refer to the instructions in this [README](README_CLIP.md). 15 | 16 | ## Application to Detic 17 | Please refer to the instructions in this [README](ovdet/configs/detic/README.md). 18 | 19 | ## License 20 | This project is licensed under [NTU S-Lab License 1.0](LICENSE). 21 | 22 | ## Citation 23 | 24 | ```bibtex 25 | @article{wu2023clim, 26 | title={CLIM: Contrastive Language-Image Mosaic for Region Representation}, 27 | author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Wentao Liu and Chen Change Loy}, 28 | journal={arXiv preprint arXiv:2312.11376}, 29 | year={2023} 30 | } 31 | ``` 32 | 33 | 34 | ## Acknowledgement 35 | 36 | We thank [OpenCLIP](https://github.com/mlfoundations/open_clip/tree/v2.16.0) and [MMDetection](https://github.com/open-mmlab/mmdetection) for their valuable code bases. 37 | -------------------------------------------------------------------------------- /README_CLIP.md: -------------------------------------------------------------------------------- 1 | # Application to CLIP 2 | 3 | ## Installation 4 | The code for applying CLIM to the CLIP model is adapted from [OpenCLIP-v2.16.0](https://github.com/mlfoundations/open_clip/tree/v2.16.0). Run the 5 | following command to install the package: 6 | 7 | ```bash 8 | cd CLIM/ 9 | pip install -e . 
-v 10 | ``` 11 | 12 | ## Data Preparation 13 | The main experiments are conducted using images from [COCO](https://cocodataset.org/#home) and 14 | [CC3M](https://ai.google.com/research/ConceptualCaptions/download). 15 | Please prepare the datasets and organize them as follows: 16 | 17 | ```text 18 | CLIM/ 19 | ├── data 20 | ├── coco 21 | ├── annotations 22 | ├── panoptic_val2017.json 23 | ├── panoptic_val2017 # panoptic masks 24 | ├── wusize 25 | ├── captions_train2017_tags_allcaps.json 26 | ├── train2017 27 | ├── val2017 28 | ├── cc3m 29 | ├── cc3m_captions_train.json 30 | ├── train 31 | ``` 32 | The json file `captions_train2017_tags_allcaps.json` for COCO captions can be obtained from 33 | [GoogleDrive](https://drive.google.com/drive/folders/1O6rt6WN2ePPg6j-wVgF89T7ql2HiuRIG?usp=sharing). 34 | For the CC3M dataset, please download the images using the csv file from the official 35 | [website](https://ai.google.com/research/ConceptualCaptions/download), and then generate the json file 36 | following the COCO format (a minimal conversion sketch is given at the end of this document). The json file `cc3m_captions_train.json` might look like: 37 | 38 | ```json 39 | {"images": 40 | [ 41 | {"id": 1, "file_name": "train/0/0.jpg", "captions": ["a very typical bus station"]}, 42 | {"id": 4, "file_name": "train/3/3.jpg", "captions": ["interior design of modern living room with fireplace in a new house"]} 43 | ] 44 | } 45 | ``` 46 | 47 | ## Run 48 | ### Original Models 49 | To run CLIM, first obtain the original models using these 50 | [links](https://github.com/openai/CLIP/blob/a1d071733d7111c9c014f024669f959182114e33/clip/clip.py#L30), 51 | and put them under 52 | `checkpoints/` as follows: 53 | 54 | ```text 55 | CLIM/ 56 | ├── checkpoints 57 | ├── ViT-B-16.pt 58 | ├── RN50x64.pt 59 | 60 | ``` 61 | 62 | ### Applying CLIM 63 | We provide the [scripts](scripts) to run CLIM. For example, to refine ViT-B/16 on the COCO dataset, simply run: 64 | ```bash 65 | bash scripts/train_clim_coco_100e_openai_vitb16.sh 66 | ``` 67 | We also provide the checkpoints of the models trained by CLIM in 68 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). 69 | 70 | ### Open-Vocabulary Object Detection 71 | 72 | To build open-vocabulary detectors using the models trained by CLIM, 73 | please refer to the instructions in this [README](ovdet/configs/clip_based/README.md). 
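### Generating `cc3m_captions_train.json` (example)

The following is a minimal, unofficial sketch of the conversion described in the Data Preparation section above. It assumes the downloaded CC3M annotation file is a tab-separated file with one `caption<TAB>url` pair per line, and that the image from row `i` was saved as `train/{i}/{i}.jpg` (mirroring the example entries shown earlier); the tsv name and the file-name pattern are assumptions and need to be adapted to the downloader that was actually used.

```python
# Unofficial helper sketch: build the COCO-style caption json described above
# from the CC3M tsv. The tsv name, column order and local file-name pattern are
# assumptions; adjust them to match how the images were downloaded.
import csv
import json
import os


def cc3m_tsv_to_json(tsv_path, image_root, out_path):
    images = []
    with open(tsv_path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for idx, row in enumerate(reader):
            caption = row[0]
            file_name = f'train/{idx}/{idx}.jpg'  # mirrors the example entries above
            if not os.path.exists(os.path.join(image_root, file_name)):
                continue  # skip images that failed to download
            images.append(dict(id=idx + 1, file_name=file_name, captions=[caption]))
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(dict(images=images), f)


if __name__ == '__main__':
    # Example paths following the layout in the Data Preparation section;
    # 'cc3m_train.tsv' is a placeholder for the downloaded annotation file.
    cc3m_tsv_to_json('cc3m_train.tsv', 'data/cc3m', 'data/cc3m/cc3m_captions_train.json')
```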
74 | -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTB16.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_EVACLIP_ViTL14x336.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_RN50x64.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_RN50x64.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_ViTB16.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_ViTB16.npy -------------------------------------------------------------------------------- /metadata/coco_panoptic_clip_hand_craft_ViTL14x336.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/metadata/coco_panoptic_clip_hand_craft_ViTL14x336.npy -------------------------------------------------------------------------------- /ovdet/DATA.md: -------------------------------------------------------------------------------- 1 | # Data preparation 2 | ## Open-Vocabulary COCO 3 | Prepare data following [MMDetection](https://mmdetection.readthedocs.io/en/latest/user_guides/useful_tools.html#dataset-download). 4 | Obtain the json files for OV-COCO from [GoogleDrive](https://drive.google.com/drive/folders/1O6rt6WN2ePPg6j-wVgF89T7ql2HiuRIG?usp=sharing) and put them 5 | under `data/coco/wusize`. 6 | The data structure looks like: 7 | 8 | ```text 9 | CLIM/ovdet/data 10 | ├── coco 11 | ├── annotations 12 | ├── instances_{train,val}2017.json 13 | ├── wusize 14 | ├── instances_train2017_base.json 15 | ├── instances_val2017_base.json 16 | ├── instances_val2017_novel.json 17 | ├── captions_train2017_tags_allcaps.json 18 | ├── train2017 19 | ├── val2017 20 | ├── test2017 21 | ``` 22 | 23 | 24 | ## Open-Vocabulary LVIS 25 | Prepare data following [MMDetection](https://mmdetection.readthedocs.io/en/latest/user_guides/useful_tools.html#dataset-download). 26 | ```text 27 | CLIM/ovdet/data 28 | ├── lvis_v1 29 | ├── annotations 30 | ├── lvis_v1_val.json 31 | ├── lvis_v1_train.json 32 | ├── wusize 33 | ├── lvis_v1_train_base.json 34 | ├── train2017 35 | ├── val2017 36 | ├── cc3m 37 | ├── annotations 38 | ├── train_image_info_tags.json 39 | ├── images 40 | ``` 41 | We provide the json file `lvis_v1_train_base.json` that only contains annotations of base categories in 42 | [Google Drive](https://drive.google.com/file/d/1ahmCUXyFAQqnlMb-ZDDSQUMnIosYqhu5/view?usp=sharing).
To obtain cc3m, please refer 43 | to [Detic](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md). 44 | -------------------------------------------------------------------------------- /ovdet/INSTALLATION.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | This code for open-vocabulary object detection is based on [MMDetection 3.x](https://github.com/open-mmlab/mmdetection/tree/3.x) 4 | 5 | It requires the following OpenMMLab packages: 6 | 7 | - MMEngine >= 0.6.0 8 | - MMCV-full >= v2.0.0rc4 9 | - MMDetection >= v3.0.0rc6 10 | - lvisapi 11 | 12 | ```bash 13 | pip install openmim mmengine 14 | mim install "mmcv>=2.0.0rc4" 15 | pip install git+https://github.com/lvis-dataset/lvis-api.git 16 | mim install mmdet>=3.0.0rc6 17 | ``` 18 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_base.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | 4 | data_root = 'data/coco/' 5 | 6 | train_dataloader = dict( 7 | sampler=dict(type='InfiniteSampler'), 8 | dataset=dict( 9 | ann_file='wusize/instances_train2017_base.json', 10 | data_prefix=dict(img='train2017/'), 11 | ) 12 | ) 13 | val_evaluator = [ 14 | dict( 15 | type='CocoMetric', 16 | ann_file=data_root + 'wusize/instances_val2017_base.json', 17 | metric='bbox', 18 | prefix='Base', 19 | format_only=False), 20 | dict( 21 | type='CocoMetric', 22 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 23 | metric='bbox', 24 | prefix='Novel', 25 | format_only=False) 26 | ] 27 | test_evaluator = val_evaluator 28 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_base_lsj.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | 4 | data_root = 'data/coco/' 5 | image_size = (640, 640) 6 | 7 | image_backend_args = None 8 | # image_backend_args = dict( 9 | # backend='petrel', 10 | # path_mapping=dict({ 11 | # 'data/coco': 's3://openmmlab/datasets/detection/coco' 12 | # })) 13 | train_pipeline = [ 14 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 15 | dict(type="LoadAnnotations", with_bbox=True, with_mask=False), 16 | dict( 17 | type="RandomResize", 18 | scale=image_size, 19 | ratio_range=(0.1, 2.0), 20 | keep_ratio=True), 21 | dict( 22 | type="RandomCrop", 23 | crop_type='absolute_range', 24 | crop_size=image_size, 25 | recompute_bbox=True, 26 | allow_negative_crop=True), 27 | dict(type="Pad", size=image_size, 28 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 29 | dict(type="FilterAnnotations", min_gt_bbox_wh=(1e-2, 1e-2)), 30 | dict(type="RandomFlip", prob=0.5), 31 | dict(type="PackDetInputs") 32 | ] 33 | 34 | train_dataloader = dict( 35 | dataset=dict( 36 | ann_file='wusize/instances_train2017_base.json', 37 | data_prefix=dict(img='train2017/'), 38 | pipeline=train_pipeline, 39 | ) 40 | ) 41 | 42 | test_pipeline = [ 43 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 44 | dict(type="Resize", scale=image_size, keep_ratio=True), 45 | dict(type="Pad", size=image_size, 46 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 47 | # If you don't have a gt annotation, delete the pipeline 48 | 
dict(type="LoadAnnotations", with_bbox=True), 49 | dict( 50 | type="PackDetInputs", 51 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 52 | 'scale_factor')) 53 | ] 54 | 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = [ 63 | dict( 64 | type='CocoMetric', 65 | ann_file=data_root + 'wusize/instances_val2017_base.json', 66 | metric='bbox', 67 | prefix='Base', 68 | format_only=False), 69 | dict( 70 | type='CocoMetric', 71 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 72 | metric='bbox', 73 | prefix='Novel', 74 | format_only=False) 75 | ] 76 | test_evaluator = val_evaluator 77 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/coco_ovd_detic.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | file_client_args = dict(backend='disk') 6 | branch_field = ['det_batch', 'caption_batch'] 7 | det_pipeline = [ 8 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', prob=0.5), 12 | # dict(type='PackDetInputs') 13 | dict(type='MultiBranch', 14 | branch_field=branch_field, 15 | det_batch=dict(type='PackDetInputs')) 16 | ] 17 | 18 | ovd_pipeline = [ 19 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 20 | dict(type='LoadAnnotations', with_bbox=True), 21 | dict(type='Resize', scale=(667, 400), keep_ratio=True), 22 | dict(type='RandomFlip', prob=0.5), 23 | # dict(type='PackDetInputs') 24 | dict(type='MultiBranch', 25 | branch_field=branch_field, 26 | caption_batch=dict(type='PackDetInputs', 27 | meta_keys=['img_id', 'img_path', 'ori_shape', 28 | 'img_shape', 'scale_factor', 29 | 'flip', 'flip_direction', 'captions', 30 | 'tags', 'image_ids'] 31 | ) 32 | ) 33 | ] 34 | det_dataset = dict( 35 | type='CocoDataset', 36 | data_root=data_root, 37 | ann_file='wusize/instances_train2017_base.json', 38 | data_prefix=dict(img='train2017/'), 39 | filter_cfg=dict(filter_empty_gt=True, min_size=32), 40 | pipeline=det_pipeline) 41 | 42 | ovd_dataset = dict( 43 | type='CocoCaptionOVDDataset', 44 | data_root=data_root, 45 | ann_file='wusize/captions_train2017_tags_allcaps.json', 46 | data_prefix=dict(img='train2017/'), 47 | filter_cfg=dict(filter_empty_gt=False), 48 | pipeline=ovd_pipeline 49 | ) 50 | batch_split = [2, 4] 51 | train_dataloader = dict( 52 | batch_size=sum(batch_split), 53 | num_workers=sum(batch_split), 54 | persistent_workers=True, 55 | sampler=dict(type='CustomGroupMultiSourceSampler', 56 | batch_size=sum(batch_split), 57 | source_ratio=batch_split), 58 | batch_sampler=None, 59 | dataset=dict( 60 | _delete_=True, 61 | type='ConcatDataset', 62 | datasets=[det_dataset, ovd_dataset]) 63 | ) 64 | 65 | val_evaluator = [ 66 | dict( 67 | type='CocoMetric', 68 | ann_file=data_root + 'wusize/instances_val2017_base.json', 69 | metric='bbox', 70 | prefix='Base', 71 | format_only=False), 72 | dict( 73 | type='CocoMetric', 74 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 75 | metric='bbox', 76 | prefix='Novel', 77 | format_only=False) 78 | ] 79 | test_evaluator = val_evaluator 80 | -------------------------------------------------------------------------------- 
/ovdet/configs/_base_/datasets/coco_ovd_detic_clim.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/coco_detection.py' 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | file_client_args = dict(backend='disk') 6 | branch_field = ['det_batch', 'caption_batch', 'mosaic_batch'] 7 | det_pipeline = [ 8 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', prob=0.5), 12 | # dict(type='PackDetInputs') 13 | dict(type='MultiBranch', 14 | branch_field=branch_field, 15 | det_batch=dict(type='PackDetInputs')) 16 | ] 17 | 18 | ovd_pipeline = [ 19 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 20 | dict(type='LoadAnnotations', with_bbox=True), 21 | dict(type='Resize', scale=(667, 400), keep_ratio=True), 22 | dict(type='RandomFlip', prob=0.5), 23 | # dict(type='PackDetInputs') 24 | dict(type='MultiBranch', 25 | branch_field=branch_field, 26 | caption_batch=dict(type='PackDetInputs', 27 | meta_keys=['img_id', 'img_path', 'ori_shape', 28 | 'img_shape', 'scale_factor', 29 | 'flip', 'flip_direction', 'captions', 30 | 'tags', 'image_ids'] 31 | ) 32 | ) 33 | ] 34 | 35 | 36 | mosaic_pipeline = [ 37 | dict(type='LoadImageFromFile', file_client_args=file_client_args), 38 | dict(type='LoadAnnotations', with_bbox=True), 39 | dict(type='Resize', scale=(400, 400), keep_ratio=True), 40 | dict(type='RandomFlip', prob=0.5), 41 | dict(type='MultiChoicesMosaic', 42 | choices=[(2, 2), (3, 3), (4, 4)], 43 | max_cached_images=1024, 44 | img_scale=(400, 400), 45 | pad_val=114.0, 46 | prob=1.0, center_ratio_range=(1.0, 1.0)), 47 | dict(type='Resize', scale=(800, 800), keep_ratio=True), # resize to a fixed value 48 | dict(type='MultiBranch', 49 | branch_field=branch_field, 50 | mosaic_batch=dict(type='PackDetInputs', 51 | meta_keys=['img_id', 'img_path', 'ori_shape', 52 | 'img_shape', 'scale_factor', 53 | 'flip', 'flip_direction', 'captions', 54 | 'tags', 'image_ids']) 55 | ) 56 | ] 57 | 58 | 59 | det_dataset = dict( 60 | type='CocoDataset', 61 | data_root=data_root, 62 | ann_file='wusize/instances_train2017_base.json', 63 | data_prefix=dict(img='train2017/'), 64 | filter_cfg=dict(filter_empty_gt=True, min_size=32), 65 | pipeline=det_pipeline) 66 | 67 | ovd_dataset = dict( 68 | type='CocoCaptionOVDDataset', 69 | data_root=data_root, 70 | ann_file='wusize/captions_train2017_tags_allcaps.json', 71 | data_prefix=dict(img='train2017/'), 72 | filter_cfg=dict(filter_empty_gt=False), 73 | pipeline=ovd_pipeline 74 | ) 75 | 76 | mosaic_dataset = dict( 77 | type='CocoCaptionOVDDataset', 78 | data_root=data_root, 79 | ann_file='wusize/captions_train2017_tags_allcaps.json', 80 | data_prefix=dict(img='train2017/'), 81 | filter_cfg=dict(filter_empty_gt=False), 82 | pipeline=mosaic_pipeline 83 | ) 84 | 85 | 86 | batch_split = [2, 2, 2] 87 | train_dataloader = dict( 88 | batch_size=sum(batch_split), 89 | num_workers=sum(batch_split), 90 | persistent_workers=True, 91 | sampler=dict(type='CustomGroupMultiSourceSampler', 92 | batch_size=sum(batch_split), 93 | source_ratio=batch_split), 94 | batch_sampler=None, 95 | dataset=dict( 96 | _delete_=True, 97 | type='ConcatDataset', 98 | datasets=[det_dataset, ovd_dataset, mosaic_dataset]) 99 | ) 100 | 101 | val_evaluator = [ 102 | dict( 103 | type='CocoMetric', 104 | ann_file=data_root + 
'wusize/instances_val2017_base.json', 105 | metric='bbox', 106 | prefix='Base', 107 | format_only=False), 108 | dict( 109 | type='CocoMetric', 110 | ann_file=data_root + 'wusize/instances_val2017_novel.json', 111 | metric='bbox', 112 | prefix='Novel', 113 | format_only=False) 114 | ] 115 | test_evaluator = val_evaluator 116 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | train_dataloader = dict( 4 | sampler=dict(type='InfiniteSampler'), 5 | dataset=dict( 6 | dataset=dict( 7 | ann_file='wusize/lvis_v1_train_base.json') 8 | ) 9 | ) 10 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base_lsj.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | image_size = (1024, 1024) 4 | 5 | image_backend_args = None 6 | # image_backend_args = dict( 7 | # backend='petrel', 8 | # path_mapping=dict({ 9 | # 'data/lvis_v1/train2017': 's3://openmmlab/datasets/detection/coco/train2017', 10 | # 'data/lvis_v1/val2017': 's3://openmmlab/datasets/detection/coco/val2017' 11 | # }) 12 | # ) 13 | 14 | train_pipeline = [ 15 | dict(type='LoadImageFromFile', backend_args=image_backend_args, to_float32=True), 16 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 17 | dict( 18 | type='RandomResize', 19 | scale=image_size, 20 | ratio_range=(0.1, 2.0), 21 | keep_ratio=True), 22 | dict( 23 | type='RandomCrop', 24 | crop_type='absolute_range', 25 | crop_size=image_size, 26 | recompute_bbox=True, 27 | allow_negative_crop=True), 28 | dict(type="Pad", size=image_size, 29 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 30 | dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), 31 | dict(type='RandomFlip', prob=0.5), 32 | dict(type='PackDetInputs') 33 | ] 34 | train_dataloader = dict( 35 | dataset=dict( 36 | dataset=dict( 37 | ann_file='wusize/lvis_v1_train_base.json', 38 | pipeline=train_pipeline,) 39 | ) 40 | ) 41 | 42 | 43 | test_pipeline = [ 44 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 45 | dict(type="Resize", scale=image_size, keep_ratio=True), 46 | dict(type="Pad", size=image_size, 47 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 48 | # If you don't have a gt annotation, delete the pipeline 49 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 50 | dict( 51 | type="PackDetInputs", 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor')) 54 | ] 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = dict(metric=['segm']) 63 | test_evaluator = val_evaluator 64 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/datasets/lvis_v1_ovd_base_lsj_640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'mmdet::_base_/datasets/lvis_v1_instance.py' 3 | image_size = (640, 640) 4 | 5 | image_backend_args = None 6 | # image_backend_args = dict( 7 | # backend='petrel', 8 | # path_mapping=dict({ 9 | # 'data/lvis_v1/train2017': 
's3://openmmlab/datasets/detection/coco/train2017', 10 | # 'data/lvis_v1/val2017': 's3://openmmlab/datasets/detection/coco/val2017' 11 | # }) 12 | # ) 13 | 14 | train_pipeline = [ 15 | dict(type='LoadImageFromFile', backend_args=image_backend_args, to_float32=True), 16 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 17 | dict( 18 | type='RandomResize', 19 | scale=image_size, 20 | ratio_range=(0.1, 2.0), 21 | keep_ratio=True), 22 | dict( 23 | type='RandomCrop', 24 | crop_type='absolute_range', 25 | crop_size=image_size, 26 | recompute_bbox=True, 27 | allow_negative_crop=True), 28 | dict(type="Pad", size=image_size, 29 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 30 | dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), 31 | dict(type='RandomFlip', prob=0.5), 32 | dict(type='PackDetInputs') 33 | ] 34 | train_dataloader = dict( 35 | dataset=dict( 36 | dataset=dict( 37 | ann_file='wusize/lvis_v1_train_base.json', 38 | pipeline=train_pipeline,) 39 | ) 40 | ) 41 | 42 | 43 | test_pipeline = [ 44 | dict(type="LoadImageFromFile", backend_args=image_backend_args, to_float32=True), 45 | dict(type="Resize", scale=image_size, keep_ratio=True), 46 | dict(type="Pad", size=image_size, 47 | pad_val=dict(img=(122.7709383, 116.7460125, 104.09373615), seg=255)), 48 | # If you don't have a gt annotation, delete the pipeline 49 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 50 | dict( 51 | type="PackDetInputs", 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor')) 54 | ] 55 | val_dataloader = dict( 56 | dataset=dict( 57 | pipeline=test_pipeline) 58 | ) 59 | test_dataloader = val_dataloader 60 | 61 | 62 | val_evaluator = dict(metric=['segm']) 63 | test_evaluator = val_evaluator 64 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/iter_based_runtime.py: -------------------------------------------------------------------------------- 1 | _base_ = 'mmdet::_base_/default_runtime.py' 2 | default_hooks = dict( 3 | # logger=dict(type='LoggerHook', interval=5), 4 | checkpoint=dict(type='CheckpointHook', by_epoch=False, max_keep_ckpts=1, interval=5000) 5 | ) 6 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) 7 | find_unused_parameters = True 8 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/models/faster-rcnn_r50_fpn_syncbn.py: -------------------------------------------------------------------------------- 1 | _base_ = 'mmdet::_base_/models/faster-rcnn_r50_fpn.py' 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | # model settings 4 | model = dict( 5 | backbone=dict( 6 | frozen_stages=-1, 7 | norm_cfg=norm_cfg, 8 | norm_eval=False, 9 | init_cfg=None), 10 | neck=dict( 11 | norm_cfg=norm_cfg,), 12 | roi_head=dict( 13 | bbox_head=dict( 14 | type='Shared4Conv1FCBBoxHead', 15 | norm_cfg=dict(type='BN', requires_grad=False), # freeze the bn at bbox head 16 | norm_eval=True, 17 | num_classes=80, 18 | reg_class_agnostic=True, 19 | loss_cls=dict( 20 | type='CustomCrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 21 | ) 22 | ), 23 | # model training and testing settings 24 | test_cfg=dict( 25 | rcnn=dict( 26 | score_thr=0.05,) 27 | ) 28 | ) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/models/mask-rcnn_r50_fpn_syncbn.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'mmdet::_base_/models/mask-rcnn_r50_fpn.py' 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | # model settings 4 | model = dict( 5 | backbone=dict( 6 | frozen_stages=-1, 7 | norm_cfg=norm_cfg, 8 | norm_eval=False, 9 | init_cfg=None), 10 | neck=dict( 11 | norm_cfg=norm_cfg,), 12 | roi_head=dict( 13 | bbox_head=dict( 14 | type='Shared4Conv1FCBBoxHead', 15 | norm_cfg=norm_cfg, 16 | norm_eval=False, 17 | num_classes=80, 18 | reg_class_agnostic=True, 19 | loss_cls=dict( 20 | type='CustomCrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 21 | ), 22 | mask_head=dict( 23 | norm_cfg=norm_cfg, 24 | class_agnostic=True, 25 | ) 26 | ), 27 | # model training and testing settings 28 | test_cfg=dict( 29 | rcnn=dict( 30 | score_thr=0.05,) 31 | ) 32 | ) 33 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_180k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=180000, val_interval=30000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.0002, by_epoch=False, begin=0, end=5000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=180000, 14 | by_epoch=False, 15 | milestones=[120000, 160000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_45k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=45000, val_interval=5000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=1000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=45000, 14 | by_epoch=False, 15 | milestones=[30000, 40000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/_base_/schedules/schedule_90k.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=10000) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=1000), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=90000, 14 | by_epoch=False, 15 | milestones=[60000, 80000], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | First, please make sure the modified OpenCLIP has been installed as follows: 3 | ``` 4 | cd CLIM 5 | pip install -e . -v 6 | ``` 7 | Then please refer to this [README](../../INSTALLATION.md) to install the detector. 8 | 9 | ## Data Preparation 10 | Please refer to this [README](../../DATA.md). 11 | 12 | 13 | ## Usage 14 | ### Obtain Checkpoints 15 | We provide checkpoints of models that were trained by CLIM in 16 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). Put them under 17 | `CLIM/ovdet/checkpoints`. 18 | 19 | ### Training 20 | Take ViT-B/16 on OV-COCO as an example; run the following to train the detector: 21 | 22 | ``` 23 | cd CLIM/ovdet 24 | bash tools/dist_train.sh \ 25 | configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py 8 \ 26 | --work-dir your/output/directory/ovdet_openai_vitb16_ov_coco_clim 27 | ``` 28 | 29 | ### Testing 30 | We also provide the following checkpoints of the trained detectors in 31 | [Google Drive](https://drive.google.com/drive/folders/1v91n5SSXSOtgo2SlEESj_Gquwh9KMj3J?usp=sharing). Download and 32 | put them under `CLIM/ovdet/checkpoints`. 33 | 34 | Note: the released code for the ViT-based detector achieves better results than those we initially reported 35 | in the paper. 
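Before launching the commands in this README, it can help to confirm that the expected checkpoint files are actually in place. The sketch below is only a convenience check, not part of the official tooling; the file names are taken from the `pretrained` paths in the configs in this folder and from the test command further below, so adjust the list to the model being run.

```python
# Unofficial sanity-check sketch: verify that the checkpoints referenced by the
# configs in this folder exist under CLIM/ovdet/checkpoints. The file names are
# taken from the configs and commands in this README; edit the list as needed.
from pathlib import Path

expected = [
    'checkpoints/openai_vitb16_coco_clim.pt',            # CLIM ViT-B/16 backbone for OV-COCO
    'checkpoints/openai_vitb16_cc3m_clim.pt',            # CLIM ViT-B/16 backbone for OV-LVIS
    'checkpoints/openai_rn50x64_cc3m_clim.pt',           # CLIM RN50x64 backbone for OV-LVIS
    'checkpoints/ovdet_openai_vitb16_ov_coco_clim.pth',  # trained detector (testing only)
]
for path in expected:
    status = 'found' if Path(path).is_file() else 'MISSING'
    print(f'{status:8s}{path}')
```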
36 | 37 | | OV-COCO | Backbone | Novel AP50 | Config | Download | 38 | |:---------:|:--------:|:----------:|:-----------------------------------------------------------------------------:|:---------:| 39 | | Paper | ViT-B/16 | 25.7 | - | - | 40 | | This Repo | ViT-B/16 | 29.7 | [config](openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py) | [model](https://drive.google.com/file/d/1lOKpb2EiC2rcgsX9GeXUhVN1QnyUTZSM/view?usp=sharing) | 41 | 42 | | OV-LVIS | Backbone | Mask APr | Config | Download | 43 | |:---------:|:--------:|:--------:|:---------------------------------------------------------------------------------:|:---------:| 44 | | Paper | ViT-B/16 | 20.8 | - | - | 45 | | This Repo | ViT-B/16 | 24.3 | [config](openai_vitb16/mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py) | [model](https://drive.google.com/file/d/1rLEp2cL8rH0rvFduxaOG6m_Z9-s_qMwQ/view?usp=sharing) | 46 | | Paper | RN50x64 | 32.3 | - | - | 47 | | This Repo | RN50x64 | 32.4 | [config](openai_rn50x64/mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py) | [model](https://drive.google.com/file/d/1LjJo4p3vaLKoy1Vp08kt_Xg08dLdgbo5/view?usp=sharing) | 48 | 49 | Take ViT-B/16 on OV-COCO as example, run the following script to test the detector 50 | 51 | ``` 52 | cd CLIM/ovdet 53 | bash tools/dist_test.sh \ 54 | configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py \ 55 | checkpoints/ovdet_openai_vitb16_ov_coco_clim.pth \ 56 | 8 --work-dir your/output/directory/ovdet_openai_vitb16_ov_coco_clim 57 | ``` 58 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_rn50x64/mask_rcnn_fpn_openai_rn50x64_clim_bs256_ov_lvis_2.88k.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/mask-rcnn_r50_fpn.py', 2 | '../../_base_/iter_based_runtime.py', 3 | '../../_base_/datasets/lvis_v1_ovd_base_lsj.py'] 4 | find_unused_parameters = True 5 | class_weight = 'data/metadata/lvis_v1_train_cat_norare_info.json' 6 | norm_cfg = dict(type='SyncBN', requires_grad=True) 7 | model = dict( 8 | type='FVLM', 9 | data_preprocessor=dict( 10 | mean=[122.7709383, 116.7460125, 104.09373615], 11 | std=[68.5005327, 66.6321579, 70.32316305]), 12 | backbone=dict( 13 | type='CLIPResNet', 14 | _delete_=True, 15 | model_name='RN50x64', 16 | cache_dir='checkpoints', 17 | pretrained='checkpoints/openai_rn50x64_cc3m_clim.pt', 18 | roi_extractor=dict( 19 | type='SingleRoIExtractor', 20 | roi_layer=dict(type='RoIAlign', output_size=14, 21 | sampling_ratio=0, use_torchvision=True), 22 | out_channels=4096, 23 | featmap_strides=[32]), 24 | ), 25 | neck=dict( 26 | in_channels=[512, 1024, 2048, 4096], 27 | norm_cfg=norm_cfg 28 | ), 29 | rpn_head=dict( 30 | type='CustomRPNHead', 31 | num_convs=2, 32 | norm_cfg=norm_cfg 33 | ), 34 | roi_head=dict( 35 | type='FVLMStandardRoIHead', 36 | bbox_head=dict( 37 | type='FVLMConvFCBBoxHead', 38 | num_shared_convs=4, 39 | num_shared_fcs=2, 40 | num_cls_fcs=1, 41 | num_reg_fcs=1, 42 | reg_class_agnostic=True, 43 | num_classes=1203, 44 | norm_cfg=norm_cfg, 45 | alpha=0.35, 46 | beta=0.65, 47 | clip_temp=50.0, 48 | cls_temp=50.0, 49 | learn_cls_temp=True, 50 | cls_embeddings_path="data/metadata/lvis_openai_rn50x64_hand_craft.npy", 51 | bg_embedding='learn', 52 | loss_cls=dict( 53 | type='CustomCrossEntropyLoss', 54 | use_sigmoid=False, 55 | class_weight=class_weight, 56 | bg_weight=0.9, 57 | ), 58 | ), 59 | mask_head=dict( 60 | norm_cfg=norm_cfg, 
class_agnostic=True, num_classes=1203) 61 | ), 62 | test_cfg=dict( 63 | rpn=dict(nms_pre=2000), 64 | rcnn=dict( 65 | score_thr=0.0001, 66 | nms=dict(type='nms', iou_threshold=0.5), 67 | max_per_img=300) 68 | ) 69 | ) 70 | 71 | default_hooks = dict( 72 | checkpoint=dict(interval=2880//2) 73 | ) 74 | 75 | # training schedule for 2.88k 76 | train_cfg = dict(type='IterBasedTrainLoop', max_iters=2880, val_interval=2880) 77 | val_cfg = dict(type='ValLoop') 78 | test_cfg = dict(type='TestLoop') 79 | 80 | # learning rate 81 | param_scheduler = [ 82 | dict( 83 | type='LinearLR', start_factor=0.009, by_epoch=False, begin=0, end=250), 84 | dict( 85 | type='MultiStepLR', 86 | begin=0, 87 | end=2880, 88 | by_epoch=False, 89 | milestones=[2304, 2592, 2736], 90 | gamma=0.1) 91 | ] 92 | # optimizer 93 | optim_wrapper = dict( 94 | type='AmpOptimWrapper', 95 | optimizer=dict(type='SGD', lr=0.36, momentum=0.9, weight_decay=0.0001), 96 | clip_grad=dict(max_norm=1.0, norm_type=2), 97 | ) 98 | 99 | # Default setting for scaling LR automatically 100 | # - `enable` means enable scaling LR automatically 101 | # or not by default. 102 | # - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 103 | auto_scale_lr = dict(enable=True, base_batch_size=256) 104 | train_dataloader = dict( 105 | batch_size=32, 106 | num_workers=4, 107 | sampler=dict(type='InfiniteSampler'), 108 | ) 109 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_vitb16/faster_rcnn_fpn_openai_vitb16_clim_bs64_ov_coco_3e.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/faster-rcnn_r50_fpn.py', 2 | 'mmdet::_base_/default_runtime.py', 3 | '../../_base_/datasets/coco_ovd_base_lsj.py'] 4 | find_unused_parameters = True 5 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 6 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 7 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 8 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 9 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 10 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 11 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 12 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0.6] 13 | # invalid_classes = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14 | # 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 15 | # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 | # 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 17 | # 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 | # 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 19 | # 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 20 | # 0, 0, 0, 0, 0, 0, 0, 1, 1, 0] + [0] 21 | invalid_classes = None 22 | 23 | norm_cfg = dict(type='SyncBN', requires_grad=True) 24 | model = dict( 25 | type='FVLM', 26 | data_preprocessor=dict( 27 | mean=[122.7709383, 116.7460125, 104.09373615], 28 | std=[68.5005327, 66.6321579, 70.32316305]), 29 | backbone=dict( 30 | _delete_=True, 31 | type='CLIPViT', 32 | model_name='ViT-B-16', 33 | cache_dir='checkpoints', 34 | pretrained='checkpoints/openai_vitb16_coco_clim.pt', 35 | out_indices=[3, 5, 7, 11], 36 | roi_extractor=dict( 37 | type='SingleRoIExtractor', 38 | roi_layer=dict(type='RoIAlign', output_size=1, 39 | sampling_ratio=0, use_torchvision=True), 40 | out_channels=512, 41 | featmap_strides=[16]), 42 | norm_cfg=norm_cfg 43 | ), 44 | neck=dict( 45 | in_channels=[768, 768, 768, 768], 46 | norm_cfg=norm_cfg 47 | ), 48 | rpn_head=dict(num_convs=2), 49 | roi_head=dict( 50 | type='FVLMStandardRoIHead', 51 | bbox_head=dict( 52 | type='FVLMConvFCBBoxHead', 53 | num_shared_convs=4, 54 | num_shared_fcs=2, 55 | num_cls_fcs=1, 56 | num_reg_fcs=1, 57 | reg_class_agnostic=True, 58 | norm_cfg=norm_cfg, 59 | alpha=0.1, 60 | beta=0.8, 61 | 
clip_temp=75.0, 62 | cls_temp=50.0, 63 | invalid_classes=invalid_classes, 64 | learn_cls_temp=True, 65 | cls_embeddings_path="data/metadata/coco_openai_vitb16_hand_craft.npy", 66 | bg_embedding='learn', 67 | loss_cls=dict( 68 | type='CustomCrossEntropyLoss', 69 | use_sigmoid=False, 70 | class_weight=class_weight 71 | ), 72 | ) 73 | ), 74 | test_cfg=dict( 75 | rcnn=dict( 76 | score_thr=0.01, 77 | nms=dict(type='nms', iou_threshold=0.4), 78 | max_per_img=100) 79 | ) 80 | ) 81 | 82 | 83 | # training schedule for 3e 84 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1) 85 | val_cfg = dict(type='ValLoop') 86 | test_cfg = dict(type='TestLoop') 87 | 88 | # learning rate 89 | param_scheduler = [ 90 | dict( 91 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=250), 92 | dict( 93 | type='MultiStepLR', 94 | begin=0, 95 | end=3, 96 | by_epoch=True, 97 | milestones=[100, ], 98 | gamma=0.1) 99 | ] 100 | 101 | # optimizer 102 | optim_wrapper = dict( 103 | type='AmpOptimWrapper', 104 | optimizer=dict(type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.1), 105 | clip_grad=dict(max_norm=1.0, norm_type=2), 106 | ) 107 | 108 | # Default setting for scaling LR automatically 109 | # - `enable` means enable scaling LR automatically 110 | # or not by default. 111 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 112 | auto_scale_lr = dict(enable=True, base_batch_size=64) 113 | train_dataloader = dict( 114 | batch_size=8, 115 | num_workers=4 116 | ) 117 | -------------------------------------------------------------------------------- /ovdet/configs/clip_based/openai_vitb16/mask_rcnn_nasfpn_openai_vitb16_clim_bs64_ov_lvis_4x.py: -------------------------------------------------------------------------------- 1 | _base_ = ['mmdet::_base_/models/mask-rcnn_r50_fpn.py', 2 | 'mmdet::_base_/default_runtime.py', 3 | '../../_base_/datasets/lvis_v1_ovd_base_lsj_640.py'] 4 | find_unused_parameters = True 5 | class_weight = 'data/metadata/lvis_v1_train_cat_norare_info.json' 6 | norm_cfg = dict(type='SyncBN', requires_grad=True) 7 | model = dict( 8 | type='FVLM', 9 | data_preprocessor=dict( 10 | mean=[122.7709383, 116.7460125, 104.09373615], 11 | std=[68.5005327, 66.6321579, 70.32316305]), 12 | backbone=dict( 13 | type='CLIPViT', 14 | _delete_=True, 15 | model_name='ViT-B-16', 16 | cache_dir='checkpoints', 17 | out_indices=[3, 5, 7, 11], 18 | pretrained='checkpoints/openai_vitb16_cc3m_clim.pt', 19 | roi_extractor=dict( 20 | type='SingleRoIExtractor', 21 | roi_layer=dict(type='RoIAlign', output_size=1, 22 | sampling_ratio=0, use_torchvision=True), 23 | out_channels=512, 24 | featmap_strides=[16]), 25 | norm_cfg=norm_cfg 26 | ), 27 | neck=dict( 28 | type='NASFPN', 29 | stack_times=7, 30 | in_channels=[768, 768, 768, 768], 31 | norm_cfg=norm_cfg 32 | ), 33 | rpn_head=dict(num_convs=2), 34 | roi_head=dict( 35 | type='FVLMStandardRoIHead', 36 | bbox_head=dict( 37 | type='FVLMConvFCBBoxHead', 38 | num_shared_convs=4, 39 | num_shared_fcs=2, 40 | num_cls_fcs=1, 41 | num_reg_fcs=1, 42 | reg_class_agnostic=True, 43 | num_classes=1203, 44 | norm_cfg=norm_cfg, 45 | alpha=0.35, 46 | beta=0.65, 47 | clip_temp=50.0, 48 | cls_temp=50.0, 49 | learn_cls_temp=True, 50 | cls_embeddings_path="data/metadata/lvis_openai_vitb16_hand_craft.npy", 51 | bg_embedding='learn', 52 | loss_cls=dict( 53 | type='CustomCrossEntropyLoss', 54 | use_sigmoid=False, 55 | class_weight=class_weight, 56 | bg_weight=0.9, 57 | ), 58 | ), 59 | mask_head=dict( 60 | norm_cfg=norm_cfg, class_agnostic=True, 
num_classes=1203) 61 | ), 62 | test_cfg=dict( 63 | rpn=dict(nms_pre=2000), 64 | rcnn=dict( 65 | score_thr=0.0001, 66 | nms=dict(type='nms', iou_threshold=0.5), 67 | max_per_img=300) 68 | ) 69 | ) 70 | 71 | default_hooks = dict( 72 | checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1, interval=1) 73 | ) 74 | 75 | # training schedule for 4x 76 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=48, val_interval=12) 77 | val_cfg = dict(type='ValLoop') 78 | test_cfg = dict(type='TestLoop') 79 | 80 | # learning rate 81 | param_scheduler = [ 82 | dict( 83 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=250), 84 | dict( 85 | type='MultiStepLR', 86 | begin=0, 87 | end=48, 88 | by_epoch=True, 89 | milestones=[32, 44], 90 | gamma=0.1) 91 | ] 92 | 93 | # optimizer 94 | optim_wrapper = dict( 95 | type='AmpOptimWrapper', 96 | optimizer=dict( 97 | type='AdamW', lr=0.0004, betas=(0.9, 0.999), weight_decay=0.05), 98 | clip_grad=dict(max_norm=35, norm_type=2), 99 | ) 100 | 101 | # Default setting for scaling LR automatically 102 | # - `enable` means enable scaling LR automatically 103 | # or not by default. 104 | # - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 105 | auto_scale_lr = dict(enable=True, base_batch_size=8*16) 106 | train_dataloader = dict( 107 | batch_size=16, 108 | num_workers=4 109 | ) 110 | -------------------------------------------------------------------------------- /ovdet/configs/detic/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | Please refer to this [README](../../INSTALLATION.md). 3 | ## Data Preparation 4 | Please refer to this [README](../../DATA.md). 5 | 6 | ## Usage 7 | ### Obtain CLIP Checkpoints 8 | We use CLIP's text encoder (ViT-B/32) for Detic. Obtain the state_dict 9 | of the model from [GoogleDrive](https://drive.google.com/file/d/1ilxBhjb3JXNDar8lKRQ9GA4hTmjxADfu/view?usp=sharing) and put it under `checkpoints`. 10 | ### OV-COCO 11 | #### Training 12 | 1. To pre-train the detector only on the detection data of base categories, run 13 | 14 | ``` 15 | cd CLIM/ovdet 16 | bash tools/dist_train.sh \ 17 | configs/detic/ov_coco/faster_rcnn_r50_caffe_c4_90k_ovcoco.py 8 \ 18 | --work-dir your/output/directory/detic_coco_base 19 | ``` 20 | Rename the checkpoint of the trained model as `detic_coco_base.pth` and put it under `checkpoints`. 21 | We also provide this checkpoint `detic_coco_base.pth` 22 | in [Google Drive](https://drive.google.com/file/d/1ZzR6aI-AnvSygUcJ7Ny8jOlY4v8Id7MO/view?usp=sharing). 23 | 24 | 2.1 To fine-tune the detector with caption data (no tags), run 25 | 26 | ``` 27 | cd CLIM/ovdet 28 | bash tools/dist_train.sh \ 29 | configs/detic/ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py 8 \ 30 | --work-dir your/output/directory/detic_coco_cap_no_tags_clim 31 | ``` 32 | 2.2 To fine-tune the detector using caption loss and image tag loss, run 33 | 34 | ``` 35 | cd CLIM/ovdet 36 | bash tools/dist_train.sh \ 37 | configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py 8 \ 38 | --work-dir your/output/directory/detic_coco_cap_w_tags_clim 39 | ``` 40 | 41 | 42 | #### Testing 43 | We have provided the following checkpoints in [Google Drive](https://drive.google.com/drive/folders/1f-AkMXFgDIfRMezUbVSc_BC0tr5AjRJ4?usp=sharing). 
44 | 45 | 46 | 47 | 48 | | OV-COCO | Losses | Novel AP50 | Config | Download | 49 | |:-------:|:-------------:|:----------:|:--------------------------------------------------------------------:|:---------:| 50 | | 1 | Caption | 32.3 | [config](ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py) | [model](https://drive.google.com/file/d/1TRr7Bz_EF40kUYa61cIGpScYoY8Yv7Cs/view?usp=sharing) | 51 | | 2 | Caption & Tag | 35.4 | [config](ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py) | [model](https://drive.google.com/file/d/1MQyHN7i_BP9D9S7vi213Tysnrdj7eGdG/view?usp=sharing) | 52 | 53 | 54 | 55 | For example, to evaluate the model trained with caption loss and tag loss, run 56 | 57 | ``` 58 | cd CLIM/ovdet 59 | bash tools/dist_test.sh \ 60 | configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py \ 61 | checkpoints/detic_coco_cap_w_tags_clim.pth \ 62 | 8 --work-dir your/output/directory/detic_coco_cap_w_tags_clim 63 | ``` 64 | 65 | ### OV-LVIS 66 | 67 | #### Training 68 | First obtain the 69 | [checkpoint](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638-c1685ee2.pth) 70 | trained on base categories and put it under `checkpoints/`. Then run 71 | 72 | ``` 73 | cd CLIM/ovdet 74 | bash tools/dist_train.sh \ 75 | configs/detic/ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py 8 \ 76 | --work-dir your/output/directory/detic_lvis_cap_w_tags_clim 77 | ``` 78 | 79 | #### Testing 80 | We have provided the following checkpoint. 81 | 82 | | OV-LVIS | Losses | mask APr | Config | Download | 83 | |:-------:|:-------------:|:--------:|:-------------------------------------------------------------------------:|:-----------:| 84 | | 1 | Caption & Tag | 21.8 | [config](ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py) | [model](https://drive.google.com/drive/folders/1Y_3T9jo86rJGc6AnjOoXzrNYbx63pBj-?usp=sharing) | 85 | 86 | 87 | For example, to evaluate the model trained on LVIS-base and CC3M, run 88 | 89 | ``` 90 | cd CLIM/ovdet 91 | bash tools/dist_test.sh \ 92 | configs/detic/ov_lvis/detic_clim_centernet2_r50_fpn_4x_lvis-base_cc3m-lvis.py \ 93 | patch/to/the/checkpoint.pth \ 94 | 8 --work-dir your/output/directory/detic_lvis_cap_w_tags_clim 95 | ``` 96 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/detic_no_tags_clim_faster_rcnn_r50_caffe_c4_45k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_detic_clim.py', 4 | '../../_base_/schedules/schedule_45k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 8 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 9 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 10 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 11 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 12 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 13 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 14 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 15 | 16 | reg_layer = [ 17 | dict(type='Linear', in_features=2048, out_features=2048), 18 | dict(type='ReLU', inplace=True), 19 | dict(type='Linear', in_features=2048, out_features=4) 20 | ] 21 | 22 | clip_cfg = dict( # ViT-B/32 23 | type='CLIP', 24 | image_encoder=None, 25 | text_encoder=dict( 26 | type='CLIPTextEncoder', 27 | embed_dim=512, 28 | context_length=77, 29 | vocab_size=49408, 30 | transformer_width=512, # also the word embedding 
dim 31 | transformer_heads=8, 32 | transformer_layers=12, 33 | init_cfg=dict( 34 | type='Pretrained', 35 | checkpoint='checkpoints/clip_vitb32.pth') 36 | ) 37 | ) 38 | 39 | model = dict( 40 | type='OVDTwoStageDetector', 41 | data_preprocessor=dict( 42 | type='MultiBranchDataPreprocessor', 43 | _delete_=True, 44 | data_preprocessor=dict( 45 | type='DetDataPreprocessor', 46 | mean=[103.530, 116.280, 123.675], 47 | std=[1.0, 1.0, 1.0], 48 | bgr_to_rgb=False, 49 | pad_size_divisor=32 50 | ), 51 | ), 52 | rpn_head=dict( 53 | type='CustomRPNHead', 54 | anchor_generator=dict( 55 | scale_major=False, # align with detectron2 56 | ) 57 | ), 58 | backbone=dict(init_cfg=None), 59 | batch2ovd=dict(caption_batch=['detic_caption'], 60 | mosaic_batch=['detic_caption']), 61 | roi_head=dict( 62 | type='OVDStandardRoIHead', 63 | shared_head=dict(init_cfg=None), 64 | clip_cfg=clip_cfg, 65 | ovd_cfg=dict(detic_caption=dict(type='DeticCaptionWithComposition', 66 | base_batch_size=4, 67 | bce_bias=-20.0, norm_temp=25.0, caption_weight=0.1, 68 | max_caps=1, 69 | queue_cfg=dict(lengths=[256], id_length=16, 70 | names=['clip_caption_features']), 71 | cap_neg_weight=0.125), 72 | ), 73 | bbox_head=dict( 74 | type='DeticBBoxHead', 75 | reg_predictor_cfg=reg_layer, 76 | reg_class_agnostic=True, 77 | cls_bias=-20.0, 78 | cls_temp=25.0, 79 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 80 | loss_cls=dict( 81 | type='CustomCrossEntropyLoss', 82 | use_sigmoid=True, 83 | class_weight=class_weight), 84 | ), 85 | ), 86 | ) 87 | 88 | # optimizer 89 | optim_wrapper = dict( 90 | type='AmpOptimWrapper', # amp training 91 | clip_grad=dict(max_norm=35, norm_type=2), 92 | ) 93 | # load_from = 'work_dirs/detic_base/iter_90000.pth' 94 | load_from = 'checkpoints/detic_coco_base.pth' 95 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/detic_w_tags_clim_faster_rcnn_r50_caffe_c4_45k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_detic_clim.py', 4 | '../../_base_/schedules/schedule_45k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 8 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 9 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 10 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 11 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 12 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 13 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 14 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 15 | 16 | reg_layer = [ 17 | dict(type='Linear', in_features=2048, out_features=2048), 18 | dict(type='ReLU', inplace=True), 19 | dict(type='Linear', in_features=2048, out_features=4) 20 | ] 21 | 22 | clip_cfg = dict( # ViT-B/32 23 | type='CLIP', 24 | image_encoder=None, 25 | text_encoder=dict( 26 | type='CLIPTextEncoder', 27 | embed_dim=512, 28 | context_length=77, 29 | vocab_size=49408, 30 | transformer_width=512, # also the word embedding dim 31 | transformer_heads=8, 32 | transformer_layers=12, 33 | init_cfg=dict( 34 | type='Pretrained', 35 | checkpoint='checkpoints/clip_vitb32.pth') 36 | ) 37 | ) 38 | 39 | model = dict( 40 | type='OVDTwoStageDetector', 41 | data_preprocessor=dict( 42 | type='MultiBranchDataPreprocessor', 43 | _delete_=True, 44 | data_preprocessor=dict( 45 | type='DetDataPreprocessor', 46 | mean=[103.530, 116.280, 123.675], 47 | std=[1.0, 1.0, 1.0], 48 | bgr_to_rgb=False, 49 | pad_size_divisor=32 50 | ), 51 | ), 52 | rpn_head=dict( 53 | type='CustomRPNHead', 54 | 
anchor_generator=dict( 55 | scale_major=False, # align with detectron2 56 | ) 57 | ), 58 | backbone=dict(init_cfg=None), 59 | batch2ovd=dict(caption_batch=['detic_tags', 'detic_caption'], 60 | mosaic_batch=['detic_tags', 'detic_caption']), 61 | roi_head=dict( 62 | type='OVDStandardRoIHead', 63 | shared_head=dict(init_cfg=None), 64 | clip_cfg=clip_cfg, 65 | ovd_cfg=dict(detic_caption=dict(type='DeticCaptionWithComposition', 66 | base_batch_size=4, 67 | bce_bias=-20.0, norm_temp=25.0, caption_weight=0.1, 68 | max_caps=1, 69 | queue_cfg=dict(lengths=[256], id_length=16, 70 | names=['clip_caption_features']), 71 | cap_neg_weight=0.125), 72 | detic_tags=dict(type='DeticTagsWithComposition', 73 | tag_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 74 | sampling_cfg=dict(topk=128, iof_thr=0.3), 75 | base_batch_size=None, 76 | bce_bias=-20.0, norm_temp=25.0, tag_weight=0.1 / 3, 77 | tag_neg_weight=1.0 78 | ) 79 | ), 80 | bbox_head=dict( 81 | type='DeticBBoxHead', 82 | reg_predictor_cfg=reg_layer, 83 | reg_class_agnostic=True, 84 | cls_bias=-20.0, 85 | cls_temp=25.0, 86 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 87 | loss_cls=dict( 88 | type='CustomCrossEntropyLoss', 89 | use_sigmoid=True, 90 | class_weight=class_weight), 91 | ), 92 | ), 93 | ) 94 | 95 | # optimizer 96 | optim_wrapper = dict( 97 | type='AmpOptimWrapper', # amp training 98 | clip_grad=dict(max_norm=35, norm_type=2), 99 | ) 100 | # load_from = 'work_dirs/detic_base/iter_90000.pth' 101 | load_from = 'checkpoints/detic_coco_base.pth' 102 | -------------------------------------------------------------------------------- /ovdet/configs/detic/ov_coco/faster_rcnn_r50_caffe_c4_90k_ovcoco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmdet::_base_/models/faster-rcnn_r50-caffe-c4.py', 3 | '../../_base_/datasets/coco_ovd_base.py', 4 | '../../_base_/schedules/schedule_90k.py', 5 | '../../_base_/iter_based_runtime.py' 6 | ] 7 | 8 | class_weight = [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 9 | 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 10 | 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 11 | 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 12 | 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 13 | 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 14 | 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 15 | 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] + [0] 16 | 17 | reg_layer = [ 18 | dict(type='Linear', in_features=2048, out_features=2048), 19 | dict(type='ReLU', inplace=True), 20 | dict(type='Linear', in_features=2048, out_features=4) 21 | ] 22 | 23 | model = dict( 24 | type='OVDTwoStageDetector', 25 | rpn_head=dict( 26 | type='CustomRPNHead', 27 | anchor_generator=dict( 28 | scale_major=False, # align with detectron2 29 | ) 30 | ), 31 | backbone=dict( 32 | init_cfg=dict( 33 | checkpoint='checkpoints/resnet50_msra-5891d200.pth')), 34 | roi_head=dict( 35 | type='OVDStandardRoIHead', 36 | shared_head=dict( 37 | init_cfg=dict( 38 | checkpoint='checkpoints/resnet50_msra-5891d200.pth')), 39 | clip_cfg=None, 40 | bbox_head=dict( 41 | type='DeticBBoxHead', 42 | reg_predictor_cfg=reg_layer, 43 | reg_class_agnostic=True, 44 | cls_bias=-20.0, 45 | cls_temp=25.0, 46 | cls_embeddings_path='data/metadata/coco_clip_hand_craft.npy', 47 | loss_cls=dict( 48 | type='CustomCrossEntropyLoss', 49 | use_sigmoid=True, 50 | class_weight=class_weight), 51 | ), 52 | ), 53 | ) 54 | 55 | # optimizer 56 | optim_wrapper = dict( 57 | type='AmpOptimWrapper', # amp training 58 | clip_grad=dict(max_norm=35, norm_type=2), 59 | ) 60 | -------------------------------------------------------------------------------- 
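The `data/metadata/*.npy` files that follow hold the hand-crafted text embeddings referenced above via `cls_embeddings_path` and `tag_embeddings_path`; the `ZeroShotClassifier` shown later in this dump loads such a file with `np.load` and permutes it to D x C before computing class scores. A minimal inspection sketch, assuming the array is stored as a float matrix of shape `(num_classes, embed_dim)` (verify against the actual file):

```python
import numpy as np

# Hypothetical offline check of a class-embedding file used by the configs
# above. The (num_classes, embed_dim) layout is an assumption inferred from
# ZeroShotClassifier, which np.load()s the array and permutes it to D x C.
embeddings = np.load('data/metadata/coco_clip_hand_craft.npy')
print(embeddings.shape, embeddings.dtype)  # e.g. (80, 512) for COCO with a 512-d CLIP text encoder

# Row-normalize for cosine-similarity checks; the detection heads in this
# repo normalize internally, so this is only for offline inspection.
norms = np.linalg.norm(embeddings, axis=-1, keepdims=True)
embeddings_unit = embeddings / np.clip(norms, 1e-12, None)
```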
/ovdet/data/metadata/coco_clip_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_clip_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/coco_openai_vitb16_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_openai_vitb16_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/coco_openai_vitb16_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/coco_openai_vitb16_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_rn50x64_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_rn50x64_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_rn50x64_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_rn50x64_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_vitb16_hand_craft.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_vitb16_hand_craft.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_openai_vitb16_hand_craft_with_background.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_openai_vitb16_hand_craft_with_background.npy -------------------------------------------------------------------------------- /ovdet/data/metadata/lvis_v1_clip_a+cname.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/data/metadata/lvis_v1_clip_a+cname.npy -------------------------------------------------------------------------------- /ovdet/ovdet/__init__.py: -------------------------------------------------------------------------------- 1 | from mmcv import * # noqa 2 | from mmdet import * # noqa 3 | from mmengine import * # noqa 4 | from .datasets import * 5 | from .methods import * 6 | from .models import * 7 | from .utils import * 8 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_caption import CocoCaptionOVDDataset 2 | from .samplers.multi_source_sampler import 
CustomGroupMultiSourceSampler 3 | from .pipelines import CachedMosaicWithCaption, MultiChoicesMosaic 4 | from .cc3m_lvis_v1 import CC3MLVISV1Dataset 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/coco_caption.py: -------------------------------------------------------------------------------- 1 | from mmdet.datasets import CocoDataset 2 | from mmdet.registry import DATASETS 3 | import os.path as osp 4 | 5 | 6 | @DATASETS.register_module() 7 | class CocoCaptionOVDDataset(CocoDataset): 8 | """ 9 | Renamed from `CocoCaptionDataset' to avoid conflicts with the mmdet 10 | """ 11 | 12 | def prepare_data(self, idx): 13 | """Get data processed by ``self.pipeline``. 14 | 15 | Args: 16 | idx (int): The index of ``data_info``. 17 | 18 | Returns: 19 | Any: Depends on ``self.pipeline``. 20 | """ 21 | data_info = self.get_data_info(idx) 22 | if data_info['has_caption']: 23 | return self.pipeline(data_info) 24 | else: 25 | return None 26 | 27 | def parse_data_info(self, raw_data_info: dict): 28 | """Parse raw annotation to target format. 29 | 30 | Args: 31 | raw_data_info (dict): Raw data information load from ``ann_file`` 32 | 33 | Returns: 34 | Union[dict, List[dict]]: Parsed annotation. 35 | """ 36 | img_info = raw_data_info['raw_img_info'] 37 | 38 | data_info = {} 39 | 40 | # TODO: need to change data_prefix['img'] to data_prefix['img_path'] 41 | img_path = osp.join(self.data_prefix['img'], img_info['file_name']) 42 | seg_map_path = None 43 | data_info['img_path'] = img_path 44 | data_info['img_id'] = img_info['img_id'] 45 | data_info['seg_map_path'] = seg_map_path 46 | data_info['height'] = img_info['height'] 47 | data_info['width'] = img_info['width'] 48 | 49 | data_info['captions'] = [img_info.get('captions', []), ] 50 | pos_cat_ids = img_info.get('pos_category_ids', []) 51 | tags = [self.cat2label[cat_id] for cat_id in pos_cat_ids] 52 | tags = list(set(tags)) 53 | data_info['tags'] = [tags, ] 54 | data_info['image_ids'] = [img_info['img_id'], ] 55 | 56 | has_caption = len(img_info.get('captions', [])) > 0 57 | data_info['has_caption'] = has_caption 58 | 59 | instance = {} 60 | bbox = [0.0, 0.0, img_info['width'], img_info['height']] 61 | instance['ignore_flag'] = 0 62 | instance['bbox'] = bbox 63 | instance['bbox_label'] = 0 64 | 65 | data_info['instances'] = [instance] 66 | return data_info 67 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .mosaic import CachedMosaicWithCaption, MultiChoicesMosaic 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/datasets/samplers/multi_source_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
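# CustomGroupMultiSourceSampler overrides _get_source_group_info so that
# aspect-ratio groups (0: width < height, 1: otherwise) are tallied per source
# dataset via get_data_info, then merged into global group sizes and a
# group_ratio that the parent GroupMultiSourceSampler uses when drawing
# multi-source batches.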
2 | import numpy as np 3 | from mmdet.registry import DATA_SAMPLERS 4 | from mmdet.datasets import GroupMultiSourceSampler 5 | 6 | 7 | @DATA_SAMPLERS.register_module() 8 | class CustomGroupMultiSourceSampler(GroupMultiSourceSampler): 9 | def _get_source_group_info(self) -> None: 10 | num_sources = len(self.num_per_source) 11 | self.group2size_per_source = [{0: 0, 1: 0} for _ in range(num_sources)] 12 | self.group2inds_per_source = [{0: [], 1: []} for _ in range(num_sources)] 13 | for source, dataset in enumerate(self.dataset.datasets): 14 | for idx in range(len(dataset)): 15 | data_info = dataset.get_data_info(idx) 16 | width, height = data_info['width'], data_info['height'] 17 | group = 0 if width < height else 1 18 | self.group2size_per_source[source][group] += 1 19 | self.group2inds_per_source[source][group].append(idx) 20 | 21 | self.group_sizes = np.zeros(2, dtype=np.int64) 22 | for group2size in self.group2size_per_source: 23 | for group, size in group2size.items(): 24 | self.group_sizes[group] += size 25 | self.group_ratio = self.group_sizes / sum(self.group_sizes) 26 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .detic import DeticTags, DeticCaption, DeticCaptionWithComposition, DeticTagsWithComposition 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/builder.py: -------------------------------------------------------------------------------- 1 | from mmengine.registry import Registry 2 | OVD = Registry('ovdet', ) 3 | QUEUE = Registry('queue', ) 4 | 5 | 6 | def build_ovd(cfg): 7 | """Build backbone.""" 8 | return OVD.build(cfg) 9 | 10 | 11 | def build_queue(cfg): 12 | """Build backbone.""" 13 | return QUEUE.build(cfg) 14 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/detic/__init__.py: -------------------------------------------------------------------------------- 1 | from .detic_caption import DeticCaption, DeticCaptionWithComposition 2 | from .detic_tags import DeticTags, DeticTagsWithComposition 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/detic/utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from six.moves import map, zip 3 | import torch 4 | 5 | 6 | def multi_apply(func, *args, **kwargs): 7 | """Apply function to a list of arguments. 8 | 9 | Note: 10 | This function applies the ``func`` to multiple inputs and 11 | map the multiple outputs of the ``func`` into different 12 | list. Each list contains the same type of outputs corresponding 13 | to different inputs. 
14 | 15 | Args: 16 | func (Function): A function that will be applied to a list of 17 | arguments 18 | 19 | Returns: 20 | tuple(list): A tuple containing multiple list, each list contains \ 21 | a kind of returned results by the function 22 | """ 23 | pfunc = partial(func, **kwargs) if kwargs else func 24 | map_results = map(pfunc, *args) 25 | return tuple(map(list, zip(*map_results))) 26 | 27 | 28 | def bboxes_area(bboxes): 29 | whs = torch.clamp(bboxes[:, 2:4] - bboxes[:, :2], min=0.0) 30 | return whs.prod(-1) 31 | 32 | 33 | def bboxes_clamp(boxes, bound): # xyxy 34 | boxes[..., 0::2] = boxes[..., 0::2].clamp(min=bound[0], max=bound[2]) # x1 x2 35 | boxes[..., 1::2] = boxes[..., 1::2].clamp(min=bound[1], max=bound[3]) # y1 y2 36 | 37 | return boxes 38 | -------------------------------------------------------------------------------- /ovdet/ovdet/methods/queues.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .builder import QUEUE 4 | 5 | 6 | @QUEUE.register_module() 7 | class Queues(nn.Module): 8 | def __init__(self, names, lengths, emb_dim=512, id_length=4): 9 | super(Queues, self).__init__() 10 | self.names = names 11 | self.lengths = lengths 12 | self.emb_dim = emb_dim 13 | self.id_length = id_length 14 | self._init_queues() 15 | 16 | def _init_queues(self): 17 | attr_names = self.names 18 | queue_lengths = self.lengths 19 | for n in attr_names: 20 | self.register_buffer(n, -torch.ones(1, self.emb_dim + self.id_length), 21 | persistent=False) 22 | self.queue_lengths = {n: queue_lengths[i] for i, n in enumerate(attr_names)} 23 | 24 | @torch.no_grad() 25 | def dequeue_and_enqueue(self, queue_update): 26 | for k, feat in queue_update.items(): 27 | queue_length = self.queue_lengths[k] 28 | valid = (feat[:, self.emb_dim:] >= 0).sum(-1) > 0 # valid label 29 | if valid.sum() == 0: 30 | continue 31 | feat = feat[valid] 32 | feat = feat[:queue_length] 33 | in_length = feat.shape[0] 34 | queue_value = getattr(self, k) 35 | current_length = queue_value.shape[0] 36 | kept_length = min(queue_length - in_length, current_length) 37 | 38 | queue_value.data = torch.cat([feat, queue_value[:kept_length]]) 39 | 40 | @torch.no_grad() 41 | def get_queue(self, key): 42 | value = getattr(self, key) 43 | valid = (value[:, self.emb_dim:] >= 0).sum(-1) > 0 # valid label 44 | return value[valid] 45 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense_heads import * 2 | from .detectors import * 3 | from .losses import * 4 | from .roi_heads import * 5 | from .vlms import * 6 | from .backbones import * 7 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_vit import CLIPViT 2 | from .clip_resnet import CLIPResNet 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/backbones/clip_resnet.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | import torch 3 | from mmdet.registry import MODELS 4 | from mmengine.model import BaseModule 5 | from torch.nn import functional as F 6 | 7 | 8 | @MODELS.register_module() 9 | class CLIPResNet(BaseModule): 10 | def __init__(self, model_name, cache_dir, 
pretrained='openai', roi_extractor=None): 11 | super().__init__() 12 | self.model_name = model_name 13 | clip_model = open_clip.create_model(model_name, 14 | pretrained=pretrained, 15 | cache_dir=cache_dir) 16 | self.visual = clip_model.visual 17 | self.roi_extractor = MODELS.build(roi_extractor) 18 | 19 | def init_weights(self): 20 | for param in self.visual.parameters(): # only freeze the CLIP model 21 | param.requires_grad = False 22 | 23 | def train(self, mode=True): 24 | self.training = mode 25 | self.visual.train(False) 26 | return self 27 | 28 | def forward(self, x): 29 | outputs = [] 30 | with torch.no_grad(): 31 | visual = self.visual 32 | x = visual.stem(x) 33 | for i in range(4): 34 | layer = getattr(visual, f'layer{i+1}') 35 | x = layer(x) 36 | outputs.append(x) 37 | 38 | return tuple(outputs) 39 | 40 | def clip_pool(self, clip_x, rois): 41 | roi_feats = self.roi_extractor([clip_x], rois) 42 | roi_feats = self.visual.attnpool(roi_feats) 43 | roi_feats = F.normalize(roi_feats, dim=-1) 44 | 45 | return roi_feats 46 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn_head import CustomRPNHead 2 | from .centernet_rpn_head import CenterNetRPNHead 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/dense_heads/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | # support calculate IOULoss with box_pred 6 | class IOULoss(nn.Module): 7 | 8 | def __init__(self, loc_loss_type='iou'): 9 | super(IOULoss, self).__init__() 10 | self.loc_loss_type = loc_loss_type 11 | 12 | def forward(self, pred, target, weight=None, reduction='sum'): 13 | pred_left = pred[:, 0] 14 | pred_top = pred[:, 1] 15 | pred_right = pred[:, 2] 16 | pred_bottom = pred[:, 3] 17 | 18 | target_left = target[:, 0] 19 | target_top = target[:, 1] 20 | target_right = target[:, 2] 21 | target_bottom = target[:, 3] 22 | 23 | target_aera = (target_left + target_right) * ( 24 | target_top + target_bottom) 25 | pred_aera = (pred_left + pred_right) * (pred_top + pred_bottom) 26 | 27 | w_intersect = torch.min(pred_left, target_left) + torch.min( 28 | pred_right, target_right) 29 | h_intersect = torch.min(pred_bottom, target_bottom) + torch.min( 30 | pred_top, target_top) 31 | 32 | g_w_intersect = torch.max(pred_left, target_left) + torch.max( 33 | pred_right, target_right) 34 | g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max( 35 | pred_top, target_top) 36 | ac_uion = g_w_intersect * g_h_intersect 37 | 38 | area_intersect = w_intersect * h_intersect 39 | area_union = target_aera + pred_aera - area_intersect 40 | 41 | ious = (area_intersect + 1.0) / (area_union + 1.0) 42 | gious = ious - (ac_uion - area_union) / ac_uion 43 | if self.loc_loss_type == 'iou': 44 | losses = -torch.log(ious) 45 | elif self.loc_loss_type == 'linear_iou': 46 | losses = 1 - ious 47 | elif self.loc_loss_type == 'giou': 48 | losses = 1 - gious 49 | else: 50 | raise NotImplementedError 51 | 52 | if weight is not None: 53 | losses = losses * weight 54 | else: 55 | losses = losses 56 | 57 | if reduction == 'sum': 58 | return losses.sum() 59 | elif reduction == 'batch': 60 | return losses.sum(dim=[1]) 61 | elif reduction == 'none': 62 | return losses 63 | else: 64 | raise NotImplementedError 65 | 66 | 67 | def giou_loss( 68 | boxes1: 
torch.Tensor, 69 | boxes2: torch.Tensor, 70 | reduction: str = 'none', 71 | eps: float = 1e-7, 72 | ) -> torch.Tensor: 73 | """Generalized Intersection over Union Loss (Hamid Rezatofighi et. 74 | 75 | al) 76 | https://arxiv.org/abs/1902.09630 77 | Gradient-friendly IoU loss with an additional penalty that is 78 | non-zero when the boxes do not overlap and scales with the size 79 | of their smallest enclosing box. This loss is symmetric, so the 80 | boxes1 and boxes2 arguments are interchangeable. 81 | Args: 82 | boxes1, boxes2 (Tensor): box locations in XYXY format, shape 83 | (N, 4) or (4,). 84 | reduction: 'none' | 'mean' | 'sum' 85 | 'none': No reduction will be applied to the output. 86 | 'mean': The output will be averaged. 87 | 'sum': The output will be summed. 88 | eps (float): small number to prevent division by zero 89 | """ 90 | 91 | x1, y1, x2, y2 = boxes1.unbind(dim=-1) 92 | x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) 93 | 94 | assert (x2 >= x1).all(), 'bad box: x1 larger than x2' 95 | assert (y2 >= y1).all(), 'bad box: y1 larger than y2' 96 | 97 | # Intersection keypoints 98 | xkis1 = torch.max(x1, x1g) 99 | ykis1 = torch.max(y1, y1g) 100 | xkis2 = torch.min(x2, x2g) 101 | ykis2 = torch.min(y2, y2g) 102 | 103 | intsctk = torch.zeros_like(x1) 104 | mask = (ykis2 > ykis1) & (xkis2 > xkis1) 105 | intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) 106 | unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk 107 | iouk = intsctk / (unionk + eps) 108 | 109 | # smallest enclosing box 110 | xc1 = torch.min(x1, x1g) 111 | yc1 = torch.min(y1, y1g) 112 | xc2 = torch.max(x2, x2g) 113 | yc2 = torch.max(y2, y2g) 114 | 115 | area_c = (xc2 - xc1) * (yc2 - yc1) 116 | miouk = iouk - ((area_c - unionk) / (area_c + eps)) 117 | 118 | loss = 1 - miouk 119 | 120 | if reduction == 'mean': 121 | loss = loss.mean() 122 | elif reduction == 'sum': 123 | loss = loss.sum() 124 | 125 | return loss 126 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .two_stage import OVDTwoStageDetector 2 | from .fvlm import FVLM 3 | from .detic import Detic 4 | from .centernet2 import CenterNet2 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/detic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
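# Detic extends CenterNet2 with open-vocabulary training: `batch2ovd` maps
# weak-supervision batch names (e.g. caption_batch, mosaic_batch in the
# configs) to OVD loss modules such as detic_caption / detic_tags. loss()
# first computes the usual detection losses on the det_batch, then calls
# run_ovd() on each remaining batch and prefixes the returned losses with
# the batch name.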
2 | from .centernet2 import CenterNet2 3 | import torch 4 | from torch import Tensor 5 | from mmdet.structures import SampleList 6 | from typing import Dict 7 | from mmdet.registry import MODELS 8 | 9 | 10 | @MODELS.register_module() 11 | class Detic(CenterNet2): 12 | 13 | def __init__(self, 14 | batch2ovd=None, 15 | *args, 16 | **kwargs) -> None: 17 | super().__init__(*args, **kwargs) 18 | self.batch2ovd = dict() if batch2ovd is None else batch2ovd 19 | 20 | def run_ovd(self, x, inputs, data_samples, ovd_name): 21 | losses = dict() 22 | if self.with_rpn: 23 | with torch.no_grad(): 24 | rpn_results_list = self.rpn_head_predict(x, data_samples) 25 | else: 26 | assert data_samples[0].get('proposals', None) is not None 27 | rpn_results_list = [ 28 | data_sample.proposals for data_sample in data_samples 29 | ] 30 | if isinstance(ovd_name, str): 31 | ovd_name = [ovd_name] 32 | for _ovd_name in ovd_name: 33 | losses.update(self.roi_head.run_ovd(x, data_samples, rpn_results_list, 34 | _ovd_name, inputs)) 35 | return losses 36 | 37 | def rpn_head_predict(self, x, batch_data_samples): 38 | batch_img_metas = [ 39 | data_samples.metainfo for data_samples in batch_data_samples 40 | ] 41 | outs = self.rpn_head(x) 42 | proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) 43 | predictions = self.rpn_head.predict_by_feat( 44 | *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg, rescale=False) 45 | return predictions 46 | 47 | def loss(self, multi_batch_inputs: Dict[str, Tensor], 48 | multi_batch_data_samples: Dict[str, SampleList]) -> dict: 49 | if not isinstance(multi_batch_inputs, dict): 50 | multi_batch_inputs = dict(det_batch=multi_batch_inputs) 51 | multi_batch_data_samples = dict(det_batch=multi_batch_data_samples) 52 | 53 | # detection losses 54 | losses = super().loss(multi_batch_inputs.pop('det_batch'), 55 | multi_batch_data_samples.pop('det_batch')) 56 | 57 | multi_batch_features = {k: self.extract_feat(v) 58 | for k, v in multi_batch_inputs.items()} 59 | 60 | for batch_name, ovd_name in self.batch2ovd.items(): 61 | batch_inputs = multi_batch_inputs.get(batch_name) 62 | batch_data_samples = multi_batch_data_samples.get(batch_name) 63 | batch_features = multi_batch_features.get(batch_name) 64 | loss_ovd = self.run_ovd(batch_features, 65 | batch_inputs, 66 | batch_data_samples, 67 | ovd_name) 68 | for k, v in loss_ovd.items(): 69 | losses.update({f'{batch_name}.{k}': v}) 70 | return losses 71 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/fvlm.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from torch import Tensor 3 | from mmdet.registry import MODELS 4 | from mmdet.models.detectors import TwoStageDetector 5 | from mmdet.structures import SampleList 6 | 7 | 8 | @MODELS.register_module() 9 | class FVLM(TwoStageDetector): 10 | def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: 11 | """Extract features. 12 | 13 | Args: 14 | batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). 15 | 16 | Returns: 17 | tuple[Tensor]: Multi-level features that may have 18 | different resolutions. 19 | """ 20 | x = self.backbone(batch_inputs) 21 | if self.with_neck: 22 | x = self.neck(x[:self.neck.num_ins]) 23 | 24 | return x 25 | 26 | def predict(self, 27 | batch_inputs: Tensor, 28 | batch_data_samples: SampleList, 29 | rescale: bool = True) -> SampleList: 30 | assert self.with_bbox, 'Bbox head must be implemented.' 
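        # F-VLM inference: keep the last backbone stage as the frozen CLIP
        # feature map (clip_x) and pass the earlier stages through the neck
        # for the detector branch; the RoI head later applies
        # backbone.clip_pool to clip_x to get CLIP region features for
        # scoring the proposals.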
31 | x = self.backbone(batch_inputs) 32 | clip_x = x[-1] 33 | if self.with_neck: 34 | x = self.neck(x[:self.neck.num_ins]) 35 | 36 | # If there are no pre-defined proposals, use RPN to get proposals 37 | if batch_data_samples[0].get('proposals', None) is None: 38 | rpn_results_list = self.rpn_head.predict( 39 | x, batch_data_samples, rescale=False) 40 | else: 41 | rpn_results_list = [ 42 | data_sample.proposals for data_sample in batch_data_samples 43 | ] 44 | 45 | results_list = self.roi_head.predict( 46 | x, rpn_results_list, batch_data_samples, rescale=rescale, 47 | clip_x=clip_x, clip_pool=self.backbone.clip_pool 48 | ) 49 | 50 | batch_data_samples = self.add_pred_to_datasample( 51 | batch_data_samples, results_list) 52 | return batch_data_samples 53 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/detectors/two_stage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | import torch 4 | from torch import Tensor 5 | from mmdet.structures import SampleList 6 | from mmdet.models.detectors.two_stage import TwoStageDetector 7 | from mmdet.registry import MODELS 8 | from typing import Dict 9 | 10 | 11 | @MODELS.register_module() 12 | class OVDTwoStageDetector(TwoStageDetector): 13 | def __init__(self, batch2ovd=None, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.batch2ovd = dict() if batch2ovd is None else batch2ovd 16 | # mapping from batch name to ovd name 17 | 18 | def run_ovd(self, x, inputs, data_samples, ovd_name): 19 | losses = dict() 20 | if self.with_rpn: 21 | with torch.no_grad(): 22 | rpn_results_list = self.rpn_head_predict(x, data_samples) 23 | else: 24 | assert data_samples[0].get('proposals', None) is not None 25 | rpn_results_list = [ 26 | data_sample.proposals for data_sample in data_samples 27 | ] 28 | if isinstance(ovd_name, str): 29 | ovd_name = [ovd_name] 30 | for _ovd_name in ovd_name: 31 | losses.update(self.roi_head.run_ovd(x, data_samples, rpn_results_list, 32 | _ovd_name, inputs)) 33 | return losses 34 | 35 | def rpn_head_predict(self, x, batch_data_samples): 36 | batch_img_metas = [ 37 | data_samples.metainfo for data_samples in batch_data_samples 38 | ] 39 | outs = self.rpn_head(x) 40 | proposal_cfg = self.train_cfg.get('rpn_proposal', 41 | self.test_cfg.rpn) 42 | predictions = self.rpn_head.predict_by_feat( 43 | *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg, rescale=False) 44 | return predictions 45 | 46 | def loss(self, multi_batch_inputs: Dict[str, Tensor], 47 | multi_batch_data_samples: Dict[str, SampleList]) -> dict: 48 | if not isinstance(multi_batch_inputs, dict): 49 | multi_batch_inputs = dict(det_batch=multi_batch_inputs) 50 | multi_batch_data_samples = dict(det_batch=multi_batch_data_samples) 51 | 52 | multi_batch_features = {k: self.extract_feat(v) 53 | for k, v in multi_batch_inputs.items()} 54 | losses = self.det_loss(multi_batch_features.get('det_batch'), 55 | multi_batch_data_samples.get('det_batch')) 56 | 57 | for batch_name, ovd_name in self.batch2ovd.items(): 58 | batch_inputs = multi_batch_inputs.get(batch_name) 59 | batch_data_samples = multi_batch_data_samples.get(batch_name) 60 | batch_features = multi_batch_features.get(batch_name) 61 | loss_ovd = self.run_ovd(batch_features, 62 | batch_inputs, 63 | batch_data_samples, 64 | ovd_name) 65 | for k, v in loss_ovd.items(): 66 | losses.update({k + f'_{batch_name}': v}) 67 | return losses 68 | 69 | def 
det_loss(self, x, batch_data_samples): 70 | losses = dict() 71 | 72 | # RPN forward and loss 73 | if self.with_rpn: 74 | proposal_cfg = self.train_cfg.get('rpn_proposal', 75 | self.test_cfg.rpn) 76 | rpn_data_samples = copy.deepcopy(batch_data_samples) 77 | # set cat_id of gt_labels to 0 in RPN 78 | for data_sample in rpn_data_samples: 79 | data_sample.gt_instances.labels = \ 80 | torch.zeros_like(data_sample.gt_instances.labels) 81 | 82 | rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( 83 | x, rpn_data_samples, proposal_cfg=proposal_cfg) 84 | # avoid get same name with roi_head loss 85 | keys = rpn_losses.keys() 86 | for key in list(keys): 87 | if 'loss' in key and 'rpn' not in key: 88 | rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) 89 | losses.update(rpn_losses) 90 | else: 91 | assert batch_data_samples[0].get('proposals', None) is not None 92 | # use pre-defined proposals in InstanceData for the second stage 93 | # to extract ROI features. 94 | rpn_results_list = [ 95 | data_sample.proposals for data_sample in batch_data_samples 96 | ] 97 | 98 | roi_losses = self.roi_head.loss(x, rpn_results_list, 99 | batch_data_samples) 100 | losses.update(roi_losses) 101 | 102 | return losses 103 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import CustomCrossEntropyLoss 2 | from .heatmap_focal_loss import HeatmapFocalLoss 3 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/cross_entropy_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn.functional as F 3 | from mmdet.registry import MODELS 4 | from mmdet.models.losses.utils import weight_reduce_loss 5 | from mmdet.models import CrossEntropyLoss 6 | from mmdet.models.losses.cross_entropy_loss import _expand_onehot_labels 7 | from ovdet.utils import load_class_freq 8 | 9 | 10 | def binary_cross_entropy(pred, 11 | label, 12 | weight=None, 13 | reduction='mean', 14 | avg_factor=None, 15 | class_weight=None, 16 | ignore_index=-100, 17 | avg_non_ignore=False, **kwargs): 18 | ignore_index = -100 if ignore_index is None else ignore_index 19 | 20 | if pred.dim() != label.dim(): 21 | label, weight, valid_mask = _expand_onehot_labels( 22 | label, weight, pred.size(-1), ignore_index) 23 | else: 24 | # should mask out the ignored elements 25 | valid_mask = ((label >= 0) & (label != ignore_index)).float() 26 | if weight is not None: 27 | # The inplace writing method will have a mismatched broadcast 28 | # shape error if the weight and valid_mask dimensions 29 | # are inconsistent such as (B,N,1) and (B,N,C). 
30 | weight = weight * valid_mask 31 | else: 32 | weight = valid_mask 33 | 34 | # average loss over non-ignored elements 35 | if (avg_factor is None) and avg_non_ignore and reduction == 'mean': 36 | avg_factor = valid_mask.sum().item() 37 | 38 | # weighted element-wise losses 39 | weight = weight.float() 40 | loss = F.binary_cross_entropy_with_logits(pred, label.float(), reduction='none') 41 | if class_weight is not None: 42 | loss = loss * class_weight[None] 43 | # do the reduction for the weighted loss 44 | loss = weight_reduce_loss( 45 | loss, weight, reduction=reduction, avg_factor=avg_factor) 46 | 47 | return loss 48 | 49 | 50 | def cross_entropy(pred, 51 | label, 52 | weight=None, 53 | reduction='mean', 54 | avg_factor=None, 55 | class_weight=None, 56 | ignore_index=-100, 57 | avg_non_ignore=False): 58 | # The default value of ignore_index is the same as F.cross_entropy 59 | ignore_index = -100 if ignore_index is None else ignore_index 60 | # element-wise losses 61 | if class_weight is not None: 62 | mask_out = class_weight < 0.00001 63 | pred[:, mask_out] = -float('inf') 64 | loss = F.cross_entropy( 65 | pred, 66 | label, 67 | weight=class_weight, # still use 68 | reduction='none', 69 | ignore_index=ignore_index) 70 | 71 | # average loss over non-ignored elements 72 | # pytorch's official cross_entropy average loss over non-ignored elements 73 | # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa 74 | if (avg_factor is None) and avg_non_ignore and reduction == 'mean': 75 | avg_factor = label.numel() - (label == ignore_index).sum().item() 76 | 77 | # apply weights and do the reduction 78 | if weight is not None: 79 | weight = weight.float() 80 | loss = weight_reduce_loss( 81 | loss, weight=weight, reduction=reduction, avg_factor=avg_factor) 82 | 83 | return loss 84 | 85 | 86 | @MODELS.register_module() 87 | class CustomCrossEntropyLoss(CrossEntropyLoss): 88 | def __init__(self, bg_weight=1.0, *args, **kwargs): 89 | super().__init__(*args, **kwargs) 90 | if self.use_sigmoid: 91 | del self.cls_criterion 92 | self.cls_criterion = binary_cross_entropy 93 | elif not self.use_mask: 94 | del self.cls_criterion 95 | self.cls_criterion = cross_entropy 96 | 97 | if isinstance(self.class_weight, str): 98 | cat_freq = load_class_freq(self.class_weight, min_count=0) 99 | self.class_weight = (cat_freq > 0.0).float().tolist() + [bg_weight] 100 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/losses/heatmap_focal_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from typing import Optional, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch import Tensor 7 | 8 | from mmdet.registry import MODELS 9 | 10 | 11 | # support class-agnostic heatmap_focal_loss 12 | def heatmap_focal_loss_with_pos_inds( 13 | pred: Tensor, 14 | targets: Tensor, 15 | pos_inds: Tensor, 16 | alpha: float = 2.0, 17 | beta: float = 4.0, 18 | gamma: float = 4.0, 19 | sigmoid_clamp: float = 1e-4, 20 | ignore_high_fp: float = -1.0, 21 | pos_weight: float = 1.0, 22 | neg_weight: float = 1.0, 23 | avg_factor: Optional[Union[int, float]] = None) -> Tensor: 24 | 25 | pred = torch.clamp( 26 | pred.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp) 27 | 28 | neg_weights = torch.pow(1 - targets, beta) 29 | 30 | pos_pred = pred[pos_inds] 31 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 32 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 33 | if ignore_high_fp > 0: 34 | not_high_fp = (pred < ignore_high_fp).float() 35 | neg_loss = not_high_fp * neg_loss 36 | 37 | pos_loss = -pos_loss.sum() 38 | neg_loss = -neg_loss.sum() 39 | if alpha >= 0: 40 | pos_loss = alpha * pos_loss 41 | neg_loss = (1 - alpha) * neg_loss 42 | 43 | pos_loss = pos_weight * pos_loss / avg_factor 44 | neg_loss = neg_weight * neg_loss / avg_factor 45 | 46 | return pos_loss, neg_loss 47 | 48 | 49 | @MODELS.register_module() 50 | class HeatmapFocalLoss(nn.Module): 51 | """GaussianFocalLoss is a variant of focal loss. 52 | 53 | More details can be found in the `paper 54 | `_ 55 | Code is modified from `kp_utils.py 56 | `_ # noqa: E501 57 | Please notice that the target in GaussianFocalLoss is a gaussian heatmap, 58 | not 0/1 binary target. 59 | 60 | Args: 61 | alpha (float): Power of prediction. 62 | gamma (float): Power of target for negative samples. 63 | reduction (str): Options are "none", "mean" and "sum". 64 | loss_weight (float): Loss weight of current loss. 65 | pos_weight(float): Positive sample loss weight. Defaults to 1.0. 66 | neg_weight(float): Negative sample loss weight. Defaults to 1.0. 67 | """ 68 | 69 | def __init__( 70 | self, 71 | alpha: float = 2.0, 72 | beta: float = 4.0, 73 | gamma: float = 4.0, 74 | sigmoid_clamp: float = 1e-4, 75 | ignore_high_fp: float = -1.0, 76 | loss_weight: float = 1.0, 77 | pos_weight: float = 1.0, 78 | neg_weight: float = 1.0, 79 | ) -> None: 80 | super().__init__() 81 | self.alpha = alpha 82 | self.beta = beta 83 | self.gamma = gamma 84 | self.sigmoid_clamp = sigmoid_clamp 85 | self.ignore_high_fp = ignore_high_fp 86 | self.loss_weight = loss_weight 87 | self.pos_weight = pos_weight 88 | self.neg_weight = neg_weight 89 | 90 | def forward(self, 91 | pred: Tensor, 92 | target: Tensor, 93 | pos_inds: Optional[Tensor] = None, 94 | avg_factor: Optional[Union[int, float]] = None) -> Tensor: 95 | """Forward function. 96 | 97 | If you want to manually determine which positions are 98 | positive samples, you can set the pos_index and pos_label 99 | parameter. Currently, only the CenterNet update version uses 100 | the parameter. 101 | 102 | Args: 103 | pred (torch.Tensor): The prediction. The shape is (N, num_classes). 104 | target (torch.Tensor): The learning target of the prediction 105 | in gaussian distribution. The shape is (N, num_classes). 106 | pos_inds (torch.Tensor): The positive sample index. 107 | Defaults to None. 108 | pos_labels (torch.Tensor): The label corresponding to the positive 109 | sample index. Defaults to None. 110 | weight (torch.Tensor, optional): The weight of loss for each 111 | prediction. 
Defaults to None. 112 | avg_factor (int, float, optional): Average factor that is used to 113 | average the loss. Defaults to None. 114 | reduction_override (str, optional): The reduction method used to 115 | override the original reduction method of the loss. 116 | Defaults to None. 117 | """ 118 | 119 | pos_loss, neg_loss = heatmap_focal_loss_with_pos_inds( 120 | pred, 121 | target, 122 | pos_inds, 123 | alpha=self.alpha, 124 | beta=self.beta, 125 | gamma=self.gamma, 126 | sigmoid_clamp=self.sigmoid_clamp, 127 | ignore_high_fp=self.ignore_high_fp, 128 | pos_weight=self.pos_weight, 129 | neg_weight=self.neg_weight, 130 | avg_factor=avg_factor) 131 | return pos_loss, neg_loss 132 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .standard_roi_head import * 2 | from .detic_bbox_heads import * 3 | from .fvlm_bbox_heads import * 4 | from .detic_roi_head import * 5 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/detic_bbox_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bbox_head import DeticBBoxHead 2 | from .zero_shot_classifier import ZeroShotClassifier 3 | from .detic_bbox_head import OriginalDeticBBoxHead 4 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/detic_bbox_heads/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from mmdet.registry import MODELS 7 | 8 | 9 | @MODELS.register_module() 10 | class ZeroShotClassifier(nn.Module): 11 | 12 | def __init__( 13 | self, 14 | in_features: int, 15 | out_features: int, # num_classes 16 | zs_weight_path: str, 17 | zs_weight_dim: int = 512, 18 | use_bias: float = 0.0, 19 | norm_weight: bool = True, 20 | norm_temperature: float = 50.0, 21 | ): 22 | super().__init__() 23 | num_classes = out_features 24 | self.norm_weight = norm_weight 25 | self.norm_temperature = norm_temperature 26 | 27 | self.use_bias = use_bias < 0 28 | if self.use_bias: 29 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 30 | 31 | self.linear = nn.Linear(in_features, zs_weight_dim) 32 | 33 | if zs_weight_path == 'rand': 34 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 35 | nn.init.normal_(zs_weight, std=0.01) 36 | else: 37 | if zs_weight_path.endswith('npy'): 38 | zs_weight = torch.tensor( 39 | np.load(zs_weight_path), 40 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 41 | else: 42 | zs_weight = torch.load( 43 | zs_weight_path).float().permute(1, 0).contiguous() # D x C 44 | zs_weight = torch.cat( 45 | [zs_weight, zs_weight.new_zeros( 46 | (zs_weight_dim, 1))], dim=1) # D x (C + 1) 47 | 48 | if self.norm_weight: 49 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 50 | 51 | if zs_weight_path == 'rand': 52 | self.zs_weight = nn.Parameter(zs_weight) 53 | else: 54 | self.register_buffer('zs_weight', zs_weight) 55 | 56 | assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape 57 | 58 | def forward(self, x, classifier=None): 59 | ''' 60 | Inputs: 61 | x: B x D' 62 | classifier_info: (C', C' x D) 63 | ''' 64 | x = self.linear(x) 65 | if classifier is not None: 66 | 
zs_weight = classifier.permute(1, 0).contiguous() # D x C' 67 | zs_weight = F.normalize(zs_weight, p=2, dim=0) \ 68 | if self.norm_weight else zs_weight 69 | else: 70 | zs_weight = self.zs_weight 71 | if self.norm_weight: 72 | x = self.norm_temperature * F.normalize(x, p=2, dim=1) 73 | x = torch.mm(x, zs_weight) 74 | if self.use_bias: 75 | x = x + self.cls_bias 76 | return x 77 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/roi_heads/fvlm_bbox_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .convfc_bbox_head import * 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/README.md: -------------------------------------------------------------------------------- 1 | # CLIP models 2 | 3 | ## RN50 4 | ```python 5 | clip_cfg=dict( 6 | type='CLIP', 7 | image_encoder=dict( 8 | type='CLIPResNet', 9 | layers=[3, 4, 6, 3], 10 | output_dim=1024, 11 | heads=32, 12 | input_resolution=224, 13 | width=64, 14 | init_cfg=dict( 15 | type='Pretrained', 16 | prefix='visual', 17 | checkpoint='checkpoints/clip_r50.pth') 18 | ), 19 | text_encoder=dict( 20 | type='CLIPTextEncoder', 21 | embed_dim=1024, 22 | context_length=77, 23 | vocab_size=49408, 24 | transformer_width=512, # also the word embedding dim 25 | transformer_heads=8, 26 | transformer_layers=12, 27 | init_cfg=dict( 28 | type='Pretrained', 29 | checkpoint='checkpoints/clip_r50.pth') 30 | ) 31 | ) 32 | ``` 33 | 34 | ## ViT-B/32 35 | ```python 36 | clip_cfg=dict( 37 | type='CLIP', 38 | image_encoder=dict( 39 | type='CLIPViT', 40 | input_resolution=224, 41 | patch_size=32, 42 | width=768, 43 | layers=12, 44 | heads=12, 45 | output_dim=512, 46 | init_cfg=dict( 47 | type='Pretrained', 48 | prefix='visual', 49 | checkpoint='checkpoints/clip_vitb32.pth') 50 | ), 51 | text_encoder=dict( 52 | type='CLIPTextEncoder', 53 | embed_dim=512, 54 | context_length=77, 55 | vocab_size=49408, 56 | transformer_width=512, # also the word embedding dim 57 | transformer_heads=8, 58 | transformer_layers=12, 59 | init_cfg=dict( 60 | type='Pretrained', 61 | checkpoint='checkpoints/clip_vitb32.pth') 62 | ) 63 | ) 64 | ``` -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .text_encoder import CLIPTextEncoder 2 | # from .image_encoder import CLIPResNet, CLIPResLayer4, CLIPViT 3 | from .model import CLIP 4 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/ovdet/ovdet/models/vlms/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch import Tensor 4 | from torch.nn import MultiheadAttention 5 | from torch.nn import functional as F 6 
| from typing import Optional, Tuple 7 | from collections import OrderedDict 8 | 9 | 10 | class QuickGELU(nn.Module): 11 | def forward(self, x: torch.Tensor): 12 | return x * torch.sigmoid(1.702 * x) 13 | 14 | 15 | class MultiheadSelfAttention(MultiheadAttention): 16 | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, 17 | need_weights: bool = True, attn_mask: Optional[Tensor] = None, return_tokens: bool = False) \ 18 | -> Tuple[Tensor, Tensor, Optional[Tensor]]: 19 | assert query is value and value is key # self-attention 20 | if return_tokens: 21 | # in_projection 22 | tokens = F.linear(value, self.in_proj_weight, bias=self.in_proj_bias)[..., -self.embed_dim:] 23 | # out_projection 24 | tokens = F.linear(tokens, self.out_proj.weight, bias=self.out_proj.bias) 25 | else: 26 | tokens = None 27 | 28 | attn_output, attn_output_weights = F.multi_head_attention_forward( 29 | query=query, key=key, value=value, 30 | embed_dim_to_check=self.embed_dim, 31 | num_heads=self.num_heads, 32 | in_proj_weight=self.in_proj_weight, 33 | in_proj_bias=self.in_proj_bias, 34 | bias_k=None, bias_v=None, 35 | add_zero_attn=False, 36 | dropout_p=0., 37 | out_proj_weight=self.out_proj.weight, 38 | out_proj_bias=self.out_proj.bias, 39 | training=self.training, 40 | key_padding_mask=key_padding_mask, need_weights=need_weights, 41 | attn_mask=attn_mask) 42 | 43 | return attn_output, tokens, attn_output_weights 44 | 45 | 46 | class ResidualAttentionBlock(nn.Module): 47 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): 48 | super().__init__() 49 | 50 | self.attn = MultiheadSelfAttention(d_model, n_head) 51 | self.ln_1 = LayerNorm(d_model) 52 | self.mlp = nn.Sequential(OrderedDict([ 53 | ("c_fc", nn.Linear(d_model, d_model * 4)), 54 | ("gelu", QuickGELU()), 55 | ("c_proj", nn.Linear(d_model * 4, d_model)) 56 | ])) 57 | self.ln_2 = LayerNorm(d_model) 58 | self.attn_mask = attn_mask 59 | 60 | def attention(self, x: torch.Tensor, return_tokens: bool, attn_masks=None): 61 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 62 | length = x.shape[0] 63 | if attn_masks is None: 64 | attn_mask = None if self.attn_mask is None else self.attn_mask[:length, :length] 65 | else: 66 | attn_mask = attn_masks 67 | return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, 68 | return_tokens=return_tokens)[:2] 69 | 70 | def forward(self, x, return_tokens=False, cls_indices=None, attn_masks=None): 71 | att, tokens = self.attention(self.ln_1(x), return_tokens, attn_masks=attn_masks) 72 | if return_tokens: 73 | assert cls_indices is not None 74 | if not isinstance(cls_indices, int): 75 | assert len(cls_indices) == x.shape[1] # x: LNC 76 | cls_tokens = x[cls_indices, torch.arange(x.shape[1])] 77 | tokens = cls_tokens[None] + tokens 78 | tokens = tokens + self.mlp(self.ln_2(tokens)) 79 | 80 | x = x + att 81 | x = x + self.mlp(self.ln_2(x)) 82 | 83 | return x, tokens 84 | else: 85 | assert tokens is None 86 | x = x + att 87 | # x = x + self.attention(self.ln_1(x)) 88 | x = x + self.mlp(self.ln_2(x)) 89 | 90 | return x, None 91 | 92 | 93 | class LayerNorm(nn.LayerNorm): 94 | """Subclass torch's LayerNorm to handle fp16.""" 95 | 96 | def forward(self, x: torch.Tensor): 97 | orig_type = x.dtype 98 | ret = super().forward(x.type(torch.float32)) 99 | return ret.type(orig_type) 100 | 101 | 102 | class Transformer(nn.Module): 103 | def __init__(self, width: int, layers: int, heads: int, attn_mask: 
torch.Tensor = None): 104 | super().__init__() 105 | self.width = width 106 | self.layers = layers 107 | self.heads = heads 108 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) 109 | 110 | def forward(self, x: torch.Tensor, return_tokens=False, cls_indices=None, attn_masks=None): 111 | for i in range(self.layers - 1): 112 | x, _ = self.resblocks[i](x, attn_masks=attn_masks) 113 | return self.resblocks[-1](x, return_tokens=return_tokens, cls_indices=cls_indices, 114 | attn_masks=attn_masks) 115 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/model.py: -------------------------------------------------------------------------------- 1 | from mmdet.registry import MODELS 2 | from mmengine.model import BaseModule 3 | 4 | 5 | @MODELS.register_module() 6 | class CLIP(BaseModule): 7 | def __init__(self, text_encoder, image_encoder): 8 | super().__init__() 9 | if text_encoder is not None: 10 | self.text_encoder = MODELS.build(text_encoder) 11 | if image_encoder is not None: 12 | self.image_encoder = MODELS.build(image_encoder) 13 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /ovdet/ovdet/models/vlms/clip/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | import torch 3 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 4 | 5 | 6 | _tokenizer = _Tokenizer() 7 | 8 | 9 | def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor: 10 | """ 11 | Returns the tokenized representation of given input string(s) 
12 | 13 | Parameters 14 | ---------- 15 | texts : Union[str, List[str]] 16 | An input string or a list of input strings to tokenize 17 | 18 | context_length : int 19 | The ovd length to use; all CLIP models use 77 as the ovd length 20 | 21 | truncate: bool 22 | Whether to truncate the text in case its encoding is longer than the ovd length 23 | 24 | Returns 25 | ------- 26 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] 27 | """ 28 | if isinstance(texts, str): 29 | texts = [texts] 30 | 31 | sot_token = _tokenizer.encoder["<|startoftext|>"] 32 | eot_token = _tokenizer.encoder["<|endoftext|>"] 33 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 34 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 35 | 36 | for i, tokens in enumerate(all_tokens): 37 | if len(tokens) > context_length: 38 | if truncate: 39 | tokens = tokens[:context_length] 40 | tokens[-1] = eot_token 41 | else: 42 | raise RuntimeError(f"Input {texts[i]} is too long for ovd length {context_length}") 43 | result[i, :len(tokens)] = torch.tensor(tokens) 44 | 45 | return result 46 | 47 | 48 | def tokenize_dynamic(texts, context_length: int = 77, truncate: bool = False): 49 | if isinstance(texts, str): 50 | texts = [texts] 51 | 52 | sot_token = _tokenizer.encoder["<|startoftext|>"] 53 | eot_token = _tokenizer.encoder["<|endoftext|>"] 54 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 55 | lengths = [len(tokens) for tokens in all_tokens] 56 | context_length = min(context_length, max(lengths)) 57 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 58 | 59 | for i, tokens in enumerate(all_tokens): 60 | if len(tokens) > context_length: 61 | if truncate: 62 | tokens = tokens[:context_length] 63 | tokens[-1] = eot_token 64 | else: 65 | raise RuntimeError(f"Input {texts[i]} is too long for ovd length {context_length}") 66 | result[i, :len(tokens)] = torch.tensor(tokens) 67 | 68 | return result 69 | -------------------------------------------------------------------------------- /ovdet/ovdet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import multi_apply, load_class_freq 2 | -------------------------------------------------------------------------------- /ovdet/ovdet/utils/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from functools import partial 4 | from six.moves import map, zip 5 | 6 | 7 | def multi_apply(func, *args, **kwargs): 8 | """Apply function to a list of arguments. 9 | Note: 10 | This function applies the ``func`` to multiple inputs and 11 | map the multiple outputs of the ``func`` into different 12 | list. Each list contains the same type of outputs corresponding 13 | to different inputs. 
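For example (illustrative), multi_apply(lambda a, b: (a + b, a * b), [1, 2], [3, 4]) would return ([4, 6], [3, 8]).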
14 | Args: 15 | func (Function): A function that will be applied to a list of 16 | arguments 17 | Returns: 18 | tuple(list): A tuple containing multiple list, each list contains \ 19 | a kind of returned results by the function 20 | """ 21 | pfunc = partial(func, **kwargs) if kwargs else func 22 | map_results = map(pfunc, *args) 23 | return tuple(map(list, zip(*map_results))) 24 | 25 | 26 | def load_class_freq( 27 | path='data/metadata/lvis_v1_train_cat_norare_info.json', 28 | freq_weight=1.0, 29 | min_count=0): 30 | cat_info = json.load(open(path, 'r')) 31 | cat_info = torch.tensor( 32 | [max(c['image_count'], min_count) for c in sorted(cat_info, key=lambda x: x['id'])]) 33 | freq_weight = cat_info.float() ** freq_weight 34 | return freq_weight 35 | -------------------------------------------------------------------------------- /ovdet/tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /ovdet/tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_coco_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | categories_seen = [ 6 | {'id': 1, 'name': 'person'}, 7 | {'id': 2, 'name': 'bicycle'}, 8 | {'id': 3, 'name': 'car'}, 9 | {'id': 4, 'name': 'motorcycle'}, 10 | {'id': 7, 'name': 'train'}, 11 | {'id': 8, 'name': 'truck'}, 12 | {'id': 9, 'name': 'boat'}, 13 | {'id': 15, 'name': 'bench'}, 14 | {'id': 16, 'name': 'bird'}, 15 | {'id': 19, 'name': 'horse'}, 16 | {'id': 20, 'name': 'sheep'}, 17 | {'id': 23, 'name': 'bear'}, 18 | {'id': 24, 'name': 'zebra'}, 19 | {'id': 25, 'name': 'giraffe'}, 20 | {'id': 27, 'name': 'backpack'}, 21 | {'id': 31, 'name': 'handbag'}, 22 | {'id': 33, 'name': 'suitcase'}, 23 | {'id': 34, 'name': 'frisbee'}, 24 | {'id': 35, 'name': 'skis'}, 25 | {'id': 38, 'name': 'kite'}, 26 | {'id': 42, 'name': 'surfboard'}, 27 | {'id': 44, 'name': 'bottle'}, 28 | {'id': 48, 'name': 'fork'}, 29 | {'id': 50, 'name': 'spoon'}, 30 | {'id': 51, 'name': 'bowl'}, 31 | {'id': 52, 'name': 'banana'}, 32 | {'id': 53, 'name': 'apple'}, 33 | {'id': 54, 'name': 'sandwich'}, 34 | {'id': 55, 'name': 'orange'}, 35 | {'id': 56, 'name': 'broccoli'}, 36 | {'id': 57, 'name': 'carrot'}, 37 | 
{'id': 59, 'name': 'pizza'}, 38 | {'id': 60, 'name': 'donut'}, 39 | {'id': 62, 'name': 'chair'}, 40 | {'id': 65, 'name': 'bed'}, 41 | {'id': 70, 'name': 'toilet'}, 42 | {'id': 72, 'name': 'tv'}, 43 | {'id': 73, 'name': 'laptop'}, 44 | {'id': 74, 'name': 'mouse'}, 45 | {'id': 75, 'name': 'remote'}, 46 | {'id': 78, 'name': 'microwave'}, 47 | {'id': 79, 'name': 'oven'}, 48 | {'id': 80, 'name': 'toaster'}, 49 | {'id': 82, 'name': 'refrigerator'}, 50 | {'id': 84, 'name': 'book'}, 51 | {'id': 85, 'name': 'clock'}, 52 | {'id': 86, 'name': 'vase'}, 53 | {'id': 90, 'name': 'toothbrush'}, 54 | ] 55 | 56 | base_cat_ids = [cat['id'] for cat in categories_seen] 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--json_path", default="data/coco/annotations/instances_train2017.json", type=str) 60 | parser.add_argument("--out_path", default="data/coco/wusize/instances_train2017_base.json") 61 | args = parser.parse_args() 62 | 63 | with open(args.json_path, 'r') as f: 64 | json_coco = json.load(f) 65 | 66 | annotations = [] 67 | 68 | for ann in tqdm(json_coco['annotations']): 69 | if ann['category_id'] in base_cat_ids: 70 | annotations.append(ann) 71 | 72 | json_coco['annotations'] = annotations 73 | 74 | with open(args.out_path, 'w') as f: 75 | json.dump(json_coco, f) 76 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_coco_novel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | categories_unseen = [ 6 | {'id': 5, 'name': 'airplane'}, 7 | {'id': 6, 'name': 'bus'}, 8 | {'id': 17, 'name': 'cat'}, 9 | {'id': 18, 'name': 'dog'}, 10 | {'id': 21, 'name': 'cow'}, 11 | {'id': 22, 'name': 'elephant'}, 12 | {'id': 28, 'name': 'umbrella'}, 13 | {'id': 32, 'name': 'tie'}, 14 | {'id': 36, 'name': 'snowboard'}, 15 | {'id': 41, 'name': 'skateboard'}, 16 | {'id': 47, 'name': 'cup'}, 17 | {'id': 49, 'name': 'knife'}, 18 | {'id': 61, 'name': 'cake'}, 19 | {'id': 63, 'name': 'couch'}, 20 | {'id': 76, 'name': 'keyboard'}, 21 | {'id': 81, 'name': 'sink'}, 22 | {'id': 87, 'name': 'scissors'}, 23 | ] 24 | 25 | novel_cat_ids = [cat['id'] for cat in categories_unseen] 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--json_path", default="data/coco/annotations/instances_val2017.json") 29 | parser.add_argument("--out_path", default="data/coco/wusize/instances_val2017_novel.json") 30 | args = parser.parse_args() 31 | 32 | with open(args.json_path, 'r') as f: 33 | json_coco = json.load(f) 34 | 35 | annotations = [] 36 | 37 | for ann in tqdm(json_coco['annotations']): 38 | if ann['category_id'] in novel_cat_ids: 39 | annotations.append(ann) 40 | 41 | json_coco['annotations'] = annotations 42 | 43 | with open(args.out_path, 'w') as f: 44 | json.dump(json_coco, f) 45 | -------------------------------------------------------------------------------- /ovdet/tools/pre_processors/keep_lvis_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from tqdm import tqdm 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--json_path", default="data/lvis_v1/annotations/lvis_v1_train.json") 7 | parser.add_argument("--out_path", default="data/lvis_v1/wusize/lvis_v1_train_base.json") 8 | args = parser.parse_args() 9 | 10 | with open(args.json_path, 'r') as f: 11 | json_coco = json.load(f) 12 | 13 | annotations = [] 14 | 15 | 16 | cat_id2cat_info = 
{cat_info['id']: cat_info for cat_info in json_coco['categories']} 17 | for ann in tqdm(json_coco['annotations']): 18 | cat_id = ann['category_id'] 19 | cat_info = cat_id2cat_info[cat_id] 20 | frequency = cat_info['frequency'] 21 | if frequency in ['f', 'c']: 22 | annotations.append(ann) 23 | 24 | json_coco['annotations'] = annotations 25 | 26 | with open(args.out_path, 'w') as f: 27 | json.dump(json_coco, f) 28 | -------------------------------------------------------------------------------- /ovdet/tools/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /ovdet/tools/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /ovdet/tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import os.path as osp 5 | 6 | from mmengine.config import Config, DictAction 7 | from mmengine.evaluator import DumpResults 8 | from mmengine.runner import Runner 9 | 10 | from mmdet.engine.hooks.utils import trigger_visualization_hook 11 | from mmdet.registry import RUNNERS 12 | from mmdet.utils import register_all_modules 13 | import ovdet # noqa 14 | 15 | 16 | # TODO: support fuse_conv_bn and format_only 17 | def parse_args(): 18 | parser = argparse.ArgumentParser( 19 | description='MMDet test (and eval) a model') 20 | parser.add_argument('config', help='test config file path') 21 | parser.add_argument('checkpoint', help='checkpoint file') 22 | parser.add_argument( 23 | '--work-dir', 24 | help='the directory to save the file containing evaluation metrics') 25 | parser.add_argument( 26 | '--out', 27 | type=str, 28 | help='dump predictions to a pickle file for offline evaluation') 29 | parser.add_argument( 30 | '--show', action='store_true', help='show prediction results') 31 | parser.add_argument( 32 | '--show-dir', 33 | help='directory where painted images will be saved. 
' 34 | 'If specified, it will be automatically saved ' 35 | 'to the work_dir/timestamp/show_dir') 36 | parser.add_argument( 37 | '--wait-time', type=float, default=2, help='the interval of show (s)') 38 | parser.add_argument( 39 | '--cfg-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='override some settings in the used config, the key-value pair ' 43 | 'in xxx=yyy format will be merged into config file. If the value to ' 44 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 45 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 46 | 'Note that the quotation marks are necessary and that no white space ' 47 | 'is allowed.') 48 | parser.add_argument( 49 | '--launcher', 50 | choices=['none', 'pytorch', 'slurm', 'mpi'], 51 | default='none', 52 | help='job launcher') 53 | parser.add_argument('--local_rank', type=int, default=0) 54 | args = parser.parse_args() 55 | if 'LOCAL_RANK' not in os.environ: 56 | os.environ['LOCAL_RANK'] = str(args.local_rank) 57 | return args 58 | 59 | 60 | def main(): 61 | args = parse_args() 62 | 63 | # register all modules in mmdet into the registries 64 | # do not init the default scope here because it will be init in the runner 65 | register_all_modules(init_default_scope=False) 66 | 67 | # load config 68 | cfg = Config.fromfile(args.config) 69 | cfg.launcher = args.launcher 70 | if args.cfg_options is not None: 71 | cfg.merge_from_dict(args.cfg_options) 72 | 73 | # work_dir is determined in this priority: CLI > segment in file > filename 74 | if args.work_dir is not None: 75 | # update configs according to CLI args if args.work_dir is not None 76 | cfg.work_dir = args.work_dir 77 | elif cfg.get('work_dir', None) is None: 78 | # use config filename as default work_dir if cfg.work_dir is None 79 | cfg.work_dir = osp.join('./work_dirs', 80 | osp.splitext(osp.basename(args.config))[0]) 81 | 82 | cfg.load_from = args.checkpoint 83 | 84 | if args.show or args.show_dir: 85 | cfg = trigger_visualization_hook(cfg, args) 86 | 87 | # build the runner from config 88 | if 'runner_type' not in cfg: 89 | # build the default runner 90 | runner = Runner.from_cfg(cfg) 91 | else: 92 | # build customized runner from the registry 93 | # if 'runner_type' is set in the cfg 94 | runner = RUNNERS.build(cfg) 95 | 96 | # add `DumpResults` dummy metric 97 | if args.out is not None: 98 | assert args.out.endswith(('.pkl', '.pickle')), \ 99 | 'The dump file must be a pkl file.' 100 | runner.test_evaluator.metrics.append( 101 | DumpResults(out_file_path=args.out)) 102 | 103 | # start testing 104 | runner.test() 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /ovdet/tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
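# Illustrative single-GPU launch (the config path below is only a placeholder, not a file guaranteed to exist in this repo):
#     python tools/train.py path/to/your_config.py --work-dir work_dirs/demo --amp
# The companion scripts tools/dist_train.sh and tools/slurm_train.sh wrap this entry point for multi-GPU and Slurm launches.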
2 | import argparse 3 | import logging 4 | import os 5 | import os.path as osp 6 | 7 | from mmengine.config import Config, DictAction 8 | from mmengine.logging import print_log 9 | from mmengine.registry import RUNNERS 10 | from mmengine.runner import Runner 11 | 12 | from mmdet.utils import register_all_modules 13 | 14 | import ovdet # noqa 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser(description='Train a detector') 19 | parser.add_argument('config', help='train config file path') 20 | parser.add_argument('--work-dir', help='the dir to save logs and models') 21 | parser.add_argument( 22 | '--amp', 23 | action='store_true', 24 | default=False, 25 | help='enable automatic-mixed-precision training') 26 | parser.add_argument( 27 | '--auto-scale-lr', 28 | action='store_true', 29 | help='enable automatically scaling LR.') 30 | parser.add_argument( 31 | '--resume', 32 | nargs='?', 33 | type=str, 34 | const='auto', 35 | help='If specify checkpoint path, resume from it, while if not ' 36 | 'specify, try to auto resume from the latest checkpoint ' 37 | 'in the work directory.') 38 | parser.add_argument( 39 | '--cfg-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='override some settings in the used config, the key-value pair ' 43 | 'in xxx=yyy format will be merged into config file. If the value to ' 44 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 45 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 46 | 'Note that the quotation marks are necessary and that no white space ' 47 | 'is allowed.') 48 | parser.add_argument( 49 | '--launcher', 50 | choices=['none', 'pytorch', 'slurm', 'mpi'], 51 | default='none', 52 | help='job launcher') 53 | parser.add_argument('--local_rank', type=int, default=0) 54 | args = parser.parse_args() 55 | if 'LOCAL_RANK' not in os.environ: 56 | os.environ['LOCAL_RANK'] = str(args.local_rank) 57 | 58 | return args 59 | 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # register all modules in mmdet into the registries 65 | # do not init the default scope here because it will be init in the runner 66 | register_all_modules(init_default_scope=False) 67 | 68 | # load config 69 | cfg = Config.fromfile(args.config) 70 | cfg.launcher = args.launcher 71 | if args.cfg_options is not None: 72 | cfg.merge_from_dict(args.cfg_options) 73 | 74 | # work_dir is determined in this priority: CLI > segment in file > filename 75 | if args.work_dir is not None: 76 | # update configs according to CLI args if args.work_dir is not None 77 | cfg.work_dir = args.work_dir 78 | elif cfg.get('work_dir', None) is None: 79 | # use config filename as default work_dir if cfg.work_dir is None 80 | cfg.work_dir = osp.join('./work_dirs', 81 | osp.splitext(osp.basename(args.config))[0]) 82 | 83 | # enable automatic-mixed-precision training 84 | if args.amp is True: 85 | optim_wrapper = cfg.optim_wrapper.type 86 | if optim_wrapper == 'AmpOptimWrapper': 87 | print_log( 88 | 'AMP training is already enabled in your config.', 89 | logger='current', 90 | level=logging.WARNING) 91 | else: 92 | assert optim_wrapper == 'OptimWrapper', ( 93 | '`--amp` is only supported when the optimizer wrapper type is ' 94 | f'`OptimWrapper` but got {optim_wrapper}.') 95 | cfg.optim_wrapper.type = 'AmpOptimWrapper' 96 | cfg.optim_wrapper.loss_scale = 'dynamic' 97 | 98 | # enable automatically scaling LR 99 | if args.auto_scale_lr: 100 | if 'auto_scale_lr' in cfg and \ 101 | 'enable' in cfg.auto_scale_lr and \ 102 | 'base_batch_size' in 
cfg.auto_scale_lr: 103 | cfg.auto_scale_lr.enable = True 104 | else: 105 | raise RuntimeError('Can not find "auto_scale_lr" or ' 106 | '"auto_scale_lr.enable" or ' 107 | '"auto_scale_lr.base_batch_size" in your' 108 | ' configuration file.') 109 | 110 | # resume is determined in this priority: resume from > auto_resume 111 | if args.resume == 'auto': 112 | cfg.resume = True 113 | cfg.load_from = None 114 | elif args.resume is not None: 115 | cfg.resume = True 116 | cfg.load_from = args.resume 117 | 118 | # build the runner from config 119 | if 'runner_type' not in cfg: 120 | # build the default runner 121 | runner = Runner.from_cfg(cfg) 122 | else: 123 | # build customized runner from the registry 124 | # if 'runner_type' is set in the cfg 125 | runner = RUNNERS.build(cfg) 126 | 127 | # start training 128 | runner.train() 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /requirements-training.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | webdataset>=0.2.5 4 | regex 5 | ftfy 6 | tqdm 7 | pandas 8 | braceexpand 9 | huggingface_hub 10 | transformers 11 | timm 12 | fsspec 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | regex 4 | ftfy 5 | tqdm 6 | huggingface_hub 7 | sentencepiece 8 | protobuf<4 9 | timm 10 | panopticapi@git+https://github.com/cocodataset/panopticapi.git -------------------------------------------------------------------------------- /scripts/test_openai_vitb16_macc_boxes_masks.sh: -------------------------------------------------------------------------------- 1 | NAME=$1 2 | CHECKPOINT=$2 3 | torchrun --nproc_per_node 4 -m training.main --batch-size=1 \ 4 | --model ViT-B-16 --pretrained openai --test-type coco_panoptic --train-data="" \ 5 | --val-data data/coco/annotations/panoptic_val2017.json \ 6 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy \ 7 | --val-image-root data/coco/val2017 --cache-dir $CHECKPOINT --extract-type="v2" \ 8 | --name $NAME --downsample-factor 16 --det-image-size 1024 9 | -------------------------------------------------------------------------------- /scripts/train_clim_cc3m_3e_openai_vitb16.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 4 -m training.main --batch-size=32 --lr=1e-5 --wd=0.1 --epochs=3 --workers=4 \ 2 | --model ViT-B-16 --pretrained openai --warmup 1000 --zeroshot-frequency 1 --dataset-type coco_caption \ 3 | --test-type coco_panoptic --train-data data/cc3m/cc3m_train_original_size_filtered.json \ 4 | --val-data data/coco/annotations/panoptic_val2017.json \ 5 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy --train-image-root="data/cc3m" \ 6 | --val-image-root data/coco/val2017 --cache-dir checkpoints --log-every-n-steps 50 \ 7 | --lock-image --save-frequency 3 --lock-image-unlocked-groups 6 --extract-type="v2" \ 8 | --name clim_cc3m_3_save3_test1_openai_vitb16_6layers --downsample-factor 16 --det-image-size 1024 \ 9 | --alpha 0.7 --train-image-size 1024 10 | -------------------------------------------------------------------------------- /scripts/train_clim_coco_100e_openai_vitb16.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 
4 -m training.main --batch-size=32 --lr=1e-5 --wd=0.1 --epochs=100 --workers=4 \ 2 | --model ViT-B-16 --pretrained openai --warmup 1000 --zeroshot-frequency 10 --dataset-type coco_caption \ 3 | --test-type coco_panoptic --train-data data/coco/wusize/captions_train2017_tags_allcaps.json \ 4 | --val-data data/coco/annotations/panoptic_val2017.json \ 5 | --embed-path metadata/coco_panoptic_clip_hand_craft_ViTB16.npy --train-image-root="data/coco/train2017" \ 6 | --val-image-root data/coco/val2017 --cache-dir checkpoints --log-every-n-steps 50 \ 7 | --lock-image --save-frequency 100 --lock-image-unlocked-groups 6 --extract-type="v2" \ 8 | --name clim_coco_100_save100_test10_openai_vitb16_6layers --downsample-factor 16 --det-image-size 1024 \ 9 | --alpha 0.7 --train-image-size 1024 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ Setup 2 | """ 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | def _read_reqs(relpath): 14 | fullpath = path.join(path.dirname(__file__), relpath) 15 | with open(fullpath) as f: 16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))] 17 | 18 | REQUIREMENTS = _read_reqs("requirements.txt") 19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt") 20 | 21 | exec(open('src/open_clip/version.py').read()) 22 | setup( 23 | name='open_clip_torch', 24 | version=__version__, 25 | description='OpenCLIP', 26 | long_description=long_description, 27 | long_description_content_type='text/markdown', 28 | url='https://github.com/mlfoundations/open_clip', 29 | author='', 30 | author_email='', 31 | classifiers=[ 32 | # How mature is this project? Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Education', 38 | 'Intended Audience :: Science/Research', 39 | 'License :: OSI Approved :: Apache Software License', 40 | 'Programming Language :: Python :: 3.7', 41 | 'Programming Language :: Python :: 3.8', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Topic :: Scientific/Engineering', 45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 46 | 'Topic :: Software Development', 47 | 'Topic :: Software Development :: Libraries', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | 51 | # Note that this is a string of words separated by whitespace, not a list. 
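# Rough install note (an assumption based on the fields below, not an official instruction): `pip install -e .`
# would install the runtime deps from requirements.txt, while `pip install -e .[training]` would additionally
# pull requirements-training.txt through the "training" extra declared in extras_require.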
52 | keywords='CLIP pretrained', 53 | package_dir={'': 'src'}, 54 | packages=find_packages(where='src'), 55 | include_package_data=True, 56 | install_requires=REQUIREMENTS, 57 | extras_require={ 58 | "training": TRAINING_REQUIREMENTS, 59 | }, 60 | python_requires='>=3.7', 61 | ) 62 | -------------------------------------------------------------------------------- /src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | -------------------------------------------------------------------------------- /src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /src/open_clip/customs.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.nn import MultiheadAttention 3 | from torch.nn import functional as F 4 | from typing import Optional, Tuple 5 | 6 | 7 | class MultiheadSelfAttention(MultiheadAttention): 8 | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, 9 | need_weights: bool = True, attn_mask: Optional[Tensor] = None, return_tokens: bool = False) \ 10 | -> Tuple[Tensor, Tensor]: 11 | assert query is value and value is key # self-attention 12 | if return_tokens: 13 | # in_projection 14 | tokens = F.linear(value, self.in_proj_weight, bias=self.in_proj_bias)[..., -self.embed_dim:] 15 | # out_projection 16 | tokens = F.linear(tokens, self.out_proj.weight, bias=self.out_proj.bias) 17 | else: 18 | tokens = None 19 | 20 | attn_output, attn_output_weights = F.multi_head_attention_forward( 21 | query=query, key=key, value=value, 22 | embed_dim_to_check=self.embed_dim, 23 | num_heads=self.num_heads, 24 | in_proj_weight=self.in_proj_weight, 25 | in_proj_bias=self.in_proj_bias, 26 | bias_k=None, bias_v=None, 27 | add_zero_attn=False, 28 | dropout_p=0., 29 | out_proj_weight=self.out_proj.weight, 30 | out_proj_bias=self.out_proj.bias, 31 | 
training=self.training, 32 | key_padding_mask=key_padding_mask, need_weights=need_weights, 33 | attn_mask=attn_mask) 34 | 35 | return attn_output, tokens # , attn_output_weights 36 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform -------------------------------------------------------------------------------- /src/open_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/open_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | 
"pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings" 54 | }, 55 | "pooler": "mean_pooler", 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 
0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/open_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /src/open_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 8 | CenterCrop 9 | 10 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 11 | 12 | 13 | class ResizeMaxSize(nn.Module): 14 | 15 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 16 | super().__init__() 17 | if not isinstance(max_size, int): 18 | raise 
TypeError(f"Size should be int. Got {type(max_size)}") 19 | self.max_size = max_size 20 | self.interpolation = interpolation 21 | self.fn = min if fn == 'min' else min 22 | self.fill = fill 23 | 24 | def forward(self, img): 25 | if isinstance(img, torch.Tensor): 26 | height, width = img.shape[:2] 27 | else: 28 | width, height = img.size 29 | scale = self.max_size / float(max(height, width)) 30 | if scale != 1.0: 31 | new_size = tuple(round(dim * scale) for dim in (height, width)) 32 | img = F.resize(img, new_size, self.interpolation) 33 | pad_h = self.max_size - new_size[0] 34 | pad_w = self.max_size - new_size[1] 35 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 36 | return img 37 | 38 | 39 | def _convert_to_rgb(image): 40 | return image.convert('RGB') 41 | 42 | 43 | # class CatGen(nn.Module): 44 | # def __init__(self, num=4): 45 | # self.num = num 46 | # def mixgen_batch(image, text): 47 | # batch_size = image.shape[0] 48 | # index = np.random.permutation(batch_size) 49 | 50 | # cat_images = [] 51 | # for i in range(batch_size): 52 | # # image mixup 53 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 54 | # # text concat 55 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 56 | # text = torch.stack(text) 57 | # return image, text 58 | 59 | 60 | def image_transform( 61 | image_size: int, 62 | is_train: bool, 63 | mean: Optional[Tuple[float, ...]] = None, 64 | std: Optional[Tuple[float, ...]] = None, 65 | resize_longest_max: bool = False, 66 | fill_color: int = 0, 67 | ): 68 | mean = mean or OPENAI_DATASET_MEAN 69 | if not isinstance(mean, (list, tuple)): 70 | mean = (mean,) * 3 71 | 72 | std = std or OPENAI_DATASET_STD 73 | if not isinstance(std, (list, tuple)): 74 | std = (std,) * 3 75 | 76 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 77 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 78 | image_size = image_size[0] 79 | 80 | normalize = Normalize(mean=mean, std=std) 81 | if is_train: 82 | return Compose([ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ]) 88 | else: 89 | if resize_longest_max: 90 | transforms = [ 91 | ResizeMaxSize(image_size, fill=fill_color) 92 | ] 93 | else: 94 | transforms = [ 95 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 96 | CenterCrop(image_size), 97 | ] 98 | transforms.extend([ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ]) 103 | return Compose(transforms) 104 | -------------------------------------------------------------------------------- /src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": 
"embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 
9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | 
"patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | 
"heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | 
"vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xlarge.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | from torch import nn as nn 5 | from torchvision.ops.misc import FrozenBatchNorm2d 6 | 7 | 8 | def freeze_batch_norm_2d(module, module_match={}, name=''): 9 | """ 10 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 11 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 12 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 13 | 14 | Args: 15 | module (torch.nn.Module): Any PyTorch module. 
16 | module_match (dict): Dictionary of full module names to freeze (all if empty) 17 | name (str): Full module name (prefix) 18 | 19 | Returns: 20 | torch.nn.Module: Resulting module 21 | 22 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 23 | """ 24 | res = module 25 | is_match = True 26 | if module_match: 27 | is_match = name in module_match 28 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 29 | res = FrozenBatchNorm2d(module.num_features) 30 | res.num_features = module.num_features 31 | res.affine = module.affine 32 | if module.affine: 33 | res.weight.data = module.weight.data.clone().detach() 34 | res.bias.data = module.bias.data.clone().detach() 35 | res.running_mean.data = module.running_mean.data 36 | res.running_var.data = module.running_var.data 37 | res.eps = module.eps 38 | else: 39 | for child_name, child in module.named_children(): 40 | full_child_name = '.'.join([name, child_name]) if name else child_name 41 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 42 | if new_child is not child: 43 | res.add_module(child_name, new_child) 44 | return res 45 | 46 | 47 | # From PyTorch internals 48 | def _ntuple(n): 49 | def parse(x): 50 | if isinstance(x, collections.abc.Iterable): 51 | return x 52 | return tuple(repeat(x, n)) 53 | return parse 54 | 55 | 56 | to_1tuple = _ntuple(1) 57 | to_2tuple = _ntuple(2) 58 | to_3tuple = _ntuple(3) 59 | to_4tuple = _ntuple(4) 60 | to_ntuple = lambda n, x: _ntuple(n)(x) 61 | -------------------------------------------------------------------------------- /src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.16.0' 2 | -------------------------------------------------------------------------------- /src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /src/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wusize/CLIM/7baade3d4b1cae93de3682ac37257db54932b944/src/training/__init__.py -------------------------------------------------------------------------------- /src/training/clim.py: -------------------------------------------------------------------------------- 1 | # TODO: process mosaicked image 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | class CLIM: 7 | mosaic_choices = [2, 3, 4] 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def __call__(self, batch, model, dist_model, loss, device, cast_dtype, 13 | distributed, args): 14 | if distributed: 15 | model = model.module 16 | images, texts = batch 17 | images = images.to(device=device, dtype=cast_dtype, non_blocking=True) 18 | texts = texts.to(device=device, non_blocking=True) 19 | 20 | mosaicked_images, pseudo_boxes_list, single_images \ 21 | = self.split_a_batch(images, args.train_image_size) 22 | single_image_features = model.encode_image(single_images, normalize=True) 23 | with torch.no_grad(): 24 | text_features = model.encode_text(texts, normalize=True) 25 | logit_scale = model.logit_scale.exp() 26 | 27 | pseudo_region_features = model.encode_pseudo_boxes( 28 | mosaicked_images, pseudo_boxes_list, normalize=True, extract_type=args.extract_type) 29 | image_features = 
torch.cat([pseudo_region_features, single_image_features], dim=0) 30 | 31 | contrast_loss = loss(image_features, 32 | text_features, 33 | logit_scale, 34 | output_dict=False, ) 35 | 36 | losses = dict(loss_contrast=contrast_loss * args.contrast_weight) 37 | 38 | return losses, len(images), logit_scale 39 | 40 | 41 | @staticmethod 42 | def _generate_normed_boxes(M, N): 43 | grid_x, grid_y = torch.meshgrid(torch.linspace(0, 1, N + 1), torch.linspace(0, 1, M + 1), 44 | indexing='xy') 45 | x0y0s = torch.stack([grid_x[:M, :N], grid_y[:M, :N]], dim=-1) 46 | x1y1s = torch.stack([grid_x[1:, 1:], grid_y[1:, 1:]], dim=-1) 47 | pseudo_boxes = torch.cat([x0y0s, x1y1s], 48 | dim=-1).view(-1, 4) 49 | return pseudo_boxes 50 | 51 | def split_a_batch(self, images, train_image_size): 52 | batch_size = images.shape[0] 53 | choices = self.mosaic_choices 54 | min_images = sum([c**2 for c in choices]) 55 | 56 | assert batch_size >= min_images 57 | num_single = batch_size % min_images 58 | num_groups = batch_size // min_images 59 | # assert num_single == 0 60 | split = [c for c in choices for _ in range(num_groups)] 61 | # split = [2] * num_groups + [3] * num_groups + [4] * num_groups 62 | pseudo_boxes_list = [self._generate_normed_boxes(s, s).to(images) for s in split] 63 | 64 | images_list = torch.split(images, [s**2 for s in split] + [num_single], dim=0) 65 | 66 | mosaicked_images_list = [ 67 | F.interpolate(self._mosaic_a_minibatch(imgs, s, s), size=train_image_size, mode='bicubic') 68 | for imgs, s in zip(images_list[:-1], split)] 69 | 70 | mosaicked_images = torch.cat(mosaicked_images_list) 71 | 72 | return mosaicked_images, pseudo_boxes_list, images_list[-1] 73 | 74 | @staticmethod 75 | def _mosaic_a_minibatch(images, M, N): 76 | bs, _, h, w = images.shape 77 | assert bs % (M * N) == 0 78 | num_mosaic = bs // (M*N) 79 | images_for_mosaic = images.permute(0, 2, 3, 1) 80 | images_for_mosaic = images_for_mosaic.view(num_mosaic, M, N, h, w, 3) 81 | images_for_mosaic = images_for_mosaic.permute(0, 1, 3, 2, 4, 5).contiguous() 82 | mosaicked_images = images_for_mosaic.view(num_mosaic, M * h, N * w, 3) 83 | mosaicked_images = mosaicked_images.permute(0, 3, 1, 2) 84 | 85 | return mosaicked_images 86 | -------------------------------------------------------------------------------- /src/training/custom_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.transforms.functional as F 5 | from torchvision.transforms import RandomCrop, InterpolationMode 6 | 7 | 8 | class CustomRandomResize(nn.Module): 9 | 10 | def __init__(self, scale=(0.5, 2.0), interpolation=InterpolationMode.BILINEAR): 11 | super().__init__() 12 | self.min_scale, self.max_scale = min(scale), max(scale) 13 | self.interpolation = interpolation 14 | 15 | def forward(self, img): 16 | if isinstance(img, torch.Tensor): 17 | height, width = img.shape[:2] 18 | else: 19 | width, height = img.size 20 | scale = random.uniform(self.min_scale, self.max_scale) 21 | new_size = [int(height * scale), int(width * scale)] 22 | img = F.resize(img, new_size, self.interpolation) 23 | 24 | return img 25 | 26 | 27 | class CustomRandomCrop(RandomCrop): 28 | def forward(self, img): 29 | """ 30 | Args: 31 | img (PIL Image or Tensor): Image to be cropped. 32 | 33 | Returns: 34 | PIL Image or Tensor: Cropped image. 
35 | """ 36 | 37 | width, height = F.get_image_size(img) 38 | tar_h, tar_w = self.size 39 | 40 | tar_h = min(tar_h, height) 41 | tar_w = min(tar_w, width) 42 | i, j, h, w = self.get_params(img, (tar_h, tar_w)) 43 | 44 | return F.crop(img, i, j, h, w) 45 | -------------------------------------------------------------------------------- /src/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | try: 7 | import horovod.torch as hvd 8 | except ImportError: 9 | hvd = None 10 | 11 | 12 | def is_global_master(args): 13 | return args.rank == 0 14 | 15 | 16 | def is_local_master(args): 17 | return args.local_rank == 0 18 | 19 | 20 | def is_master(args, local=False): 21 | return is_local_master(args) if local else is_global_master(args) 22 | 23 | 24 | def is_using_horovod(): 25 | # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set 26 | # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required... 27 | ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 28 | pmi_vars = ["PMI_RANK", "PMI_SIZE"] 29 | if all([var in os.environ for var in ompi_vars]) or all([var in os.environ for var in pmi_vars]): 30 | return True 31 | else: 32 | return False 33 | 34 | 35 | def is_using_distributed(): 36 | if 'WORLD_SIZE' in os.environ: 37 | return int(os.environ['WORLD_SIZE']) > 1 38 | if 'SLURM_NTASKS' in os.environ: 39 | return int(os.environ['SLURM_NTASKS']) > 1 40 | return False 41 | 42 | 43 | def world_info_from_env(): 44 | local_rank = 0 45 | for v in ('LOCAL_RANK', 'MPI_LOCALRANKID', 'SLURM_LOCALID', 'OMPI_COMM_WORLD_LOCAL_RANK'): 46 | if v in os.environ: 47 | local_rank = int(os.environ[v]) 48 | break 49 | global_rank = 0 50 | for v in ('RANK', 'PMI_RANK', 'SLURM_PROCID', 'OMPI_COMM_WORLD_RANK'): 51 | if v in os.environ: 52 | global_rank = int(os.environ[v]) 53 | break 54 | world_size = 1 55 | for v in ('WORLD_SIZE', 'PMI_SIZE', 'SLURM_NTASKS', 'OMPI_COMM_WORLD_SIZE'): 56 | if v in os.environ: 57 | world_size = int(os.environ[v]) 58 | break 59 | 60 | return local_rank, global_rank, world_size 61 | 62 | 63 | def init_distributed_device(args): 64 | # Distributed training = training on more than one GPU. 65 | # Works in both single and multi-node scenarios. 
66 | args.distributed = False 67 | args.world_size = 1 68 | args.rank = 0 # global rank 69 | args.local_rank = 0 70 | if args.horovod: 71 | assert hvd is not None, "Horovod is not installed" 72 | hvd.init() 73 | args.local_rank = int(hvd.local_rank()) 74 | args.rank = hvd.rank() 75 | args.world_size = hvd.size() 76 | args.distributed = True 77 | os.environ['LOCAL_RANK'] = str(args.local_rank) 78 | os.environ['RANK'] = str(args.rank) 79 | os.environ['WORLD_SIZE'] = str(args.world_size) 80 | elif is_using_distributed(): 81 | if 'SLURM_PROCID' in os.environ: 82 | # DDP via SLURM 83 | args.local_rank, args.rank, args.world_size = world_info_from_env() 84 | # SLURM var -> torch.distributed vars in case needed 85 | os.environ['LOCAL_RANK'] = str(args.local_rank) 86 | os.environ['RANK'] = str(args.rank) 87 | os.environ['WORLD_SIZE'] = str(args.world_size) 88 | torch.distributed.init_process_group( 89 | backend=args.dist_backend, 90 | init_method=args.dist_url, 91 | world_size=args.world_size, 92 | rank=args.rank, 93 | ) 94 | else: 95 | # DDP via torchrun, torch.distributed.launch 96 | args.local_rank, _, _ = world_info_from_env() 97 | torch.distributed.init_process_group( 98 | backend=args.dist_backend, 99 | init_method=args.dist_url) 100 | args.world_size = torch.distributed.get_world_size() 101 | args.rank = torch.distributed.get_rank() 102 | args.distributed = True 103 | 104 | if torch.cuda.is_available(): 105 | if args.distributed and not args.no_set_device_rank: 106 | device = 'cuda:%d' % args.local_rank 107 | else: 108 | device = 'cuda:0' 109 | torch.cuda.set_device(device) 110 | else: 111 | device = 'cpu' 112 | args.device = device 113 | device = torch.device(device) 114 | return device 115 | 116 | 117 | def broadcast_object(args, obj, src=0): 118 | # broadcast a pickle-able python object from rank-0 to all ranks 119 | if args.horovod: 120 | return hvd.broadcast_object(obj, root_rank=src) 121 | else: 122 | if args.rank == src: 123 | objects = [obj] 124 | else: 125 | objects = [None] 126 | dist.broadcast_object_list(objects, src=src) 127 | return objects[0] 128 | 129 | 130 | def all_gather_object(args, obj, dst=0): 131 | # gather a pickle-able python object across all ranks 132 | if args.horovod: 133 | return hvd.allgather_object(obj) 134 | else: 135 | objects = [None for _ in range(args.world_size)] 136 | dist.all_gather_object(objects, obj) 137 | return objects 138 | -------------------------------------------------------------------------------- /src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 
27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | logger.setLevel(level) 17 | 18 | stream_handler = logging.StreamHandler() 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if log_file: 23 | file_handler = logging.FileHandler(filename=log_file) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | -------------------------------------------------------------------------------- /src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- 
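`get_autocast` in `precision.py` returns a context-manager factory, so the same training step can run under fp16 AMP, bf16 AMP, or plain fp32 without branching at the call site. A minimal sketch of how it can be used (assuming `src/` is on `PYTHONPATH`; the model and batch here are placeholders):

```python
import torch
from training.precision import get_autocast  # assumes src/ is on PYTHONPATH

autocast = get_autocast("amp_bf16")  # "amp", "amp_bf16"/"amp_bfloat16", or anything else for fp32

model = torch.nn.Linear(512, 512)
x = torch.randn(4, 512)
if torch.cuda.is_available():  # CUDA autocast only has an effect on GPU tensors
    model, x = model.cuda(), x.cuda()

with autocast():  # contextlib.suppress (a no-op) when the precision string is not an AMP mode
    y = model(x)

print(y.dtype)  # torch.bfloat16 under bf16 autocast on CUDA, torch.float32 otherwise
```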
/src/training/region_clip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | import torch.nn as nn 5 | 6 | 7 | def get_fed_loss_inds(gt_classes, num_sample_cats, C): 8 | appeared = torch.unique(gt_classes) # C' 9 | prob = appeared.new_ones(C).float() 10 | if len(appeared) < num_sample_cats: 11 | prob[appeared] = 0 12 | more_appeared = torch.multinomial( 13 | prob, num_sample_cats - len(appeared), 14 | replacement=False) 15 | appeared = torch.cat([appeared, more_appeared]) 16 | return appeared 17 | 18 | 19 | class RegionCLIP(nn.Module): 20 | def __init__(self, args): 21 | super().__init__() 22 | embed_path = args.train_embed_path 23 | noun_embeddings = torch.from_numpy(np.load(embed_path)) 24 | noun_embeddings = F.normalize(noun_embeddings, dim=-1) 25 | self.register_buffer("noun_embeddings", noun_embeddings) 26 | self.place_holder = nn.Parameter(torch.ones(1)) 27 | 28 | def __call__(self, batch, model, dist_model, loss, device, cast_dtype, 29 | distributed, args): 30 | if distributed: 31 | model = model.module 32 | images, boxes = batch 33 | images = images.to(device=device, dtype=cast_dtype, non_blocking=True) 34 | boxes = boxes.to(device=device, non_blocking=True) 35 | 36 | boxes_list = [] 37 | boxes_label_list = [] 38 | 39 | for boxes_per_image in boxes: 40 | boxes_per_image = boxes_per_image[boxes_per_image[:, -1] > 0.5] 41 | boxes_label_list.append(boxes_per_image[:, 4].long()) 42 | boxes_list.append(boxes_per_image[:, :4]) 43 | boxes_labels = torch.cat(boxes_label_list) 44 | box_features = model.encode_pseudo_boxes(images, boxes_list, normalize=True, 45 | extract_type=args.extract_type) 46 | temp = model.logit_scale.exp().detach() 47 | boxes2nouns = box_features @ self.noun_embeddings.T * temp 48 | target = torch.zeros_like(boxes2nouns) 49 | target[range(len(boxes_labels)), boxes_labels] = 1.0 50 | 51 | appeared = get_fed_loss_inds(boxes_labels, 100, self.noun_embeddings.shape[0]) 52 | target = target[:, appeared] 53 | boxes2nouns = boxes2nouns[:, appeared] 54 | 55 | loss_cls = F.binary_cross_entropy_with_logits(boxes2nouns, target, reduction='none') # B x C 56 | loss_cls = loss_cls.sum(-1).mean() 57 | 58 | image_size = model.visual.image_size 59 | if isinstance(image_size, int): 60 | tar_h = tar_w = image_size 61 | else: 62 | tar_h, tar_w = image_size 63 | images = F.interpolate(images, size=(tar_h, tar_w), mode='bilinear') 64 | 65 | losses = dict(loss_contrast=loss_cls * args.contrast_weight) 66 | 67 | return losses, len(images), temp 68 | -------------------------------------------------------------------------------- /src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - 
cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /src/training/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from functools import partial 3 | from six.moves import map, zip 4 | 5 | 6 | def multi_apply(func, *args, **kwargs): 7 | """Apply function to a list of arguments. 8 | Note: 9 | This function applies the ``func`` to multiple inputs and 10 | map the multiple outputs of the ``func`` into different 11 | list. Each list contains the same type of outputs corresponding 12 | to different inputs. 13 | Args: 14 | func (Function): A function that will be applied to a list of 15 | arguments 16 | Returns: 17 | tuple(list): A tuple containing multiple list, each list contains \ 18 | a kind of returned results by the function 19 | """ 20 | pfunc = partial(func, **kwargs) if kwargs else func 21 | map_results = map(pfunc, *args) 22 | return tuple(map(list, zip(*map_results))) 23 | 24 | 25 | def mask2box(mask): 26 | ys, xs = np.where(mask) 27 | y0, y1 = ys.min(), ys.max() 28 | x0, x1 = xs.min(), xs.max() 29 | 30 | return x0, y0, x1, y1 31 | --------------------------------------------------------------------------------
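`multi_apply` and `mask2box` in `src/training/utils.py` are small helpers: the first maps a function over parallel lists and regroups the per-call tuples into per-field lists, the second shrinks a binary mask to a tight xyxy box. A quick self-contained check (the mask and the `scale_box` helper are made up for illustration; `src/` is assumed to be on `PYTHONPATH`):

```python
import numpy as np
from training.utils import mask2box, multi_apply  # assumes src/ is on PYTHONPATH

# mask2box: tight box around the nonzero region, returned as (x0, y0, x1, y1)
# with inclusive max indices.
mask = np.zeros((8, 8), dtype=bool)
mask[2:5, 3:7] = True
print(mask2box(mask))  # (3, 2, 6, 4)

# multi_apply: call a function element-wise over parallel lists and regroup
# its tuple outputs into one list per output field.
def scale_box(box, factor):
    x0, y0, x1, y1 = box
    return [c * factor for c in box], (x1 - x0) * (y1 - y0)

scaled, areas = multi_apply(scale_box, [(0, 0, 2, 2), (1, 1, 4, 3)], [2, 2])
print(scaled)  # [[0, 0, 4, 4], [2, 2, 8, 6]]
print(areas)   # [4, 6]
```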