├── .gitignore ├── INSTALL.md ├── LICENSE ├── README.md ├── configs ├── scan_vitB.yaml ├── scan_vitL.yaml └── scan_vitL_demo.yaml ├── datasets ├── DATASETS.md ├── prepare_ade20k_full_sem_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_coco_stuff_sem_seg.py ├── prepare_pascal_context.py └── prepare_voc_sem_seg.py ├── demo.py ├── imgs ├── cs.png ├── pipeline.png ├── results.png └── visual.png ├── open_clip_training ├── .github │ └── workflows │ │ ├── ci.yml │ │ ├── clear-cache.yml │ │ └── python-publish.yml ├── .gitignore ├── CITATION.cff ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs │ ├── Interacting_with_open_clip.ipynb │ ├── Interacting_with_open_coca.ipynb │ ├── LOW_ACC.md │ ├── PRETRAINED.md │ ├── clip_conceptual_captions.md │ ├── clipa.md │ ├── datacomp_models.md │ ├── openclip_results.csv │ └── script_examples │ │ ├── clipa │ │ ├── vit_b16 │ │ │ ├── i50_t16_finetune.sh │ │ │ └── i50_t16_pretrain.sh │ │ └── vit_l16 │ │ │ ├── i17_t16_finetune.sh │ │ │ ├── i17_t16_pretrain.sh │ │ │ ├── i37_t8_finetune.sh │ │ │ └── i37_t8_pretrain.sh │ │ ├── clipav2 │ │ └── vit_h14 │ │ │ ├── i257_t32_finetunex4.sh │ │ │ ├── i50_t8_pretrain.sh │ │ │ └── i577_t32_finetunex1.sh │ │ └── stability_example.sh ├── pytest.ini ├── scripts │ ├── clipav1_vit_l16_i37_t8.sh │ ├── clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh │ ├── h14_224_32_finetune.sh │ └── h14_84_8_pretrain.sh ├── setup.py ├── src │ ├── clip_adapter │ │ └── clip_adapter.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── big_vision.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── coca_model.py │ │ ├── constants.py │ │ ├── factory.py │ │ ├── generation_utils.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-g-14-plus.json │ │ │ ├── EVA01-g-14.json │ │ │ ├── EVA02-B-16.json │ │ │ ├── EVA02-E-14-plus.json │ │ │ ├── EVA02-E-14.json │ │ │ ├── EVA02-L-14-336.json │ │ │ ├── EVA02-L-14.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── RN50x64.json │ │ │ ├── ViT-B-16-SigLIP-256.json │ │ │ ├── ViT-B-16-SigLIP-384.json │ │ │ ├── ViT-B-16-SigLIP-512.json │ │ │ ├── ViT-B-16-SigLIP-i18n-256.json │ │ │ ├── ViT-B-16-SigLIP.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16-quickgelu.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-256.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14-378-quickgelu.json │ │ │ ├── ViT-H-14-CLIPA-336.json │ │ │ ├── ViT-H-14-CLIPA.json │ │ │ ├── ViT-H-14-quickgelu.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14-CLIPA-336.json │ │ │ ├── ViT-L-14-CLIPA.json │ │ │ ├── ViT-L-14-quickgelu.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-L-16-SigLIP-256.json │ │ │ ├── ViT-L-16-SigLIP-384.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-M-16-alt.json │ │ │ ├── ViT-M-16.json │ │ │ ├── ViT-M-32-alt.json │ │ │ ├── ViT-M-32.json │ │ │ ├── ViT-S-16-alt.json │ │ │ ├── ViT-S-16.json │ │ │ ├── ViT-S-32-alt.json │ │ │ ├── ViT-S-32.json │ │ │ ├── ViT-SO400M-14-SigLIP-384.json │ │ │ ├── ViT-SO400M-14-SigLIP.json │ │ │ ├── ViT-bigG-14-CLIPA-336.json │ │ │ ├── ViT-bigG-14-CLIPA.json │ │ │ ├── ViT-bigG-14.json │ │ │ ├── ViT-e-14.json │ │ │ ├── ViT-g-14.json │ │ │ ├── coca_ViT-B-32.json │ │ │ ├── coca_ViT-L-14.json │ │ │ ├── coca_base.json │ │ │ ├── 
coca_roberta-ViT-B-32.json │ │ │ ├── convnext_base.json │ │ │ ├── convnext_base_w.json │ │ │ ├── convnext_base_w_320.json │ │ │ ├── convnext_large.json │ │ │ ├── convnext_large_d.json │ │ │ ├── convnext_large_d_320.json │ │ │ ├── convnext_small.json │ │ │ ├── convnext_tiny.json │ │ │ ├── convnext_xlarge.json │ │ │ ├── convnext_xxlarge.json │ │ │ ├── convnext_xxlarge_320.json │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ ├── nllb-clip-base-siglip.json │ │ │ ├── nllb-clip-base.json │ │ │ ├── nllb-clip-large-siglip.json │ │ │ ├── nllb-clip-large.json │ │ │ ├── roberta-ViT-B-32.json │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pos_embed.py │ │ ├── pretrained.py │ │ ├── push_to_hf_hub.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ ├── utils.py │ │ ├── version.py │ │ ├── zero_shot_classifier.py │ │ └── zero_shot_metadata.py │ ├── scripts │ │ ├── 1cap_finetune_VitL.sh │ │ └── finetune_VitL_with_mask.sh │ └── training │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── ade150_zeroshot_data.py │ │ ├── data.py │ │ ├── distributed.py │ │ ├── file_utils.py │ │ ├── main.py │ │ ├── params.py │ │ ├── precision.py │ │ ├── profiler.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── tests │ ├── test_download_pretrained.py │ ├── test_hf_model.py │ ├── test_inference.py │ ├── test_inference_simple.py │ ├── test_num_shards.py │ ├── test_training_simple.py │ ├── test_wds.py │ └── util_test.py └── tutorials │ └── int8_tutorial.ipynb ├── requirements.txt ├── scan ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── augmentations.py │ ├── build.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── csv_data.py │ │ ├── register_ade20k_full.py │ │ ├── register_cc3m.py │ │ ├── register_coco_stuff.py │ │ ├── register_pascal_context.py │ │ └── register_voc_seg.py ├── evaluation │ ├── __init__.py │ └── generalized_sem_seg_evaluation.py ├── frequency.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── clip_resnet.py │ │ └── swin.py │ ├── clip_adapter │ │ ├── __init__.py │ │ ├── adapter.py │ │ ├── text_template.py │ │ └── utils.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── mask_former_head.py │ │ └── per_pixel_baseline.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── maskformer_transformer_decoder.py │ │ ├── open_vocab_mask2former_predictor.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── ovseg_model.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── events.py │ ├── misc.py │ ├── post_process_utils.py │ └── predictor.py ├── 
tools ├── convert-pretrained-clip-model-to-d2.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py └── replace_clip.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | outputs 4 | instant_test_output 5 | inference_test_output 6 | 7 | 8 | 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | **/.ipynb_checkpoints/ 34 | 35 | # Editor temporaries 36 | *.swn 37 | *.swo 38 | *.swp 39 | *~ 40 | 41 | # editor settings 42 | .idea 43 | .vscode 44 | _darcs 45 | 46 | # project dirs 47 | /detectron2/model_zoo/configs 48 | /datasets/* 49 | !/datasets/*.* 50 | /projects/*/datasets 51 | /models 52 | /snippet 53 | 54 | # vs code 55 | .history 56 | 57 | amlt 58 | thirdparty 59 | wandb 60 | weights 61 | 62 | 63 | *.zip 64 | *.tar 65 | /output 66 | *.pth 67 | *.pt 68 | 69 | *.png 70 | !imgs/*.png 71 | *.txt 72 | !requirements.txt 73 | 74 | results/ 75 | 76 | openclip_data/ 77 | logs/ 78 | 79 | data 80 | !scan/data 81 | 82 | *log* 83 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux with Python ≥ 3.8 5 | - PyTorch ≥ 1.10 and torchvision that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | 10 | ### Usage 11 | 12 | Install required packages. 13 | 14 | ```bash 15 | conda create -n scan python=3.8 16 | conda activate scan 17 | conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge -y 18 | pip install -r requirements.txt 19 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html 20 | ``` 21 | 22 | 23 | 24 | Install other packages. 25 | 26 | ```bash 27 | cd scan/modeling/pixel_decoder/ops 28 | sh make.sh 29 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open-Vocabulary Segmentation with Semantic-Assisted Calibration [CVPR 2024] 2 | Yong Liu*, Sule Bai*, Guanbin Li, Yitong Wang, Yansong Tang 3 | (*equal contribution) 4 | 5 | The repository contains the official implementation of "Open-Vocabulary Segmentation with Semantic-Assisted Calibration" 6 | 7 | [Paper](https://arxiv.org/abs/2312.04089) 8 | 9 | 10 | 11 | 12 | 13 | --- 14 | ## 📖 Pipeline & Results 15 |

16 | 17 | 18 | 19 |

20 | 21 | 22 | 23 | 24 | 25 | 26 | ### Tab of Content 27 | - [Installation](#1) 28 | - [Data Preparation](#2) 29 | - [Usage](#3) 30 | - [Training](#5) 31 | - [Evaluation](#4) 32 | - [Cite](#6) 33 | 34 | 35 | 36 | 37 | If you find any bugs caused by carelessness on our part in organizing the code, feel free to contact us and point them out! 38 | 39 | ### Installation 40 | Please see the [installation guide](./INSTALL.md). 41 | 42 | 43 | 44 | 45 | ### Data Preparation 46 | Please follow the instructions of [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the training and test data. The data should be organized as follows: 47 | ``` 48 | $DETECTRON2_DATASETS/ 49 | coco/ # COCOStuff-171 50 | ADEChallengeData2016/ # ADE20K-150 51 | ADE20K_2021_17_01/ # ADE20K-847 52 | VOCdevkit/ 53 | VOC2012/ # PASCALVOC-20 54 | VOC2010/ # PASCALContext-59, PASCALContext-459 55 | ``` 56 | 57 | 58 | 59 | 60 | ### Usage 61 | 62 | - #### Pretrained Weights 63 | We provide the pretrained SCAN-VitL weights and the finetuned Contextual-shifted CLIP weights. Please download them from [here](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link). 64 | 65 | 66 | 67 | #### Evaluation 68 | 69 | 70 | ``` 71 | python train_net.py --eval-only --config-file <CONFIG_FILE> --num-gpus <NUM_GPUS> OUTPUT_DIR <OUTPUT_DIR> MODEL.WEIGHTS <TRAINED_MODEL_PATH> 72 | ``` 73 | - Here is an example: 74 | ``` 75 | python train_net.py --num-gpus 8 --eval-only --config-file configs/scan_vitL.yaml MODEL.WEIGHTS ./SCAN.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\) MODEL.CLIP_ADAPTER.REPLACE_RATIO 0.05 MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.75 MODEL.CLIP_ADAPTER.MASK_THR 0.55 76 | ``` 77 | 78 | 79 | #### Training 80 | 1. Train the segmentation model: 81 | ``` 82 | python train_net.py --config-file <CONFIG_FILE> --num-gpus <NUM_GPUS> 83 | ``` 84 | 85 | - Here is an example: 86 | 87 | ``` 88 | python train_net.py --num-gpus 8 --config-file configs/scan_vitL.yaml 89 | ``` 90 | 91 | 2. Fuse the segmentation model with the finetuned CLIP. 92 | 93 | We provide the [finetuned CLIP weights](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link). You can directly fuse these pretrained weights with the segmentation model to get the final model. The fuse command is: 94 | ``` 95 | cd tools 96 | python replace_clip.py 97 | ``` 98 | You need to set "clip_ckpt" and "ovseg_model" in the script to your CLIP checkpoint path and segmentation model path. 99 | 100 | 101 | (Optional) If you want to finetune the CLIP model from scratch, please follow [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the corresponding data. The finetuning command is: 102 | 103 | ``` 104 | cd open_clip_training 105 | cd src 106 | bash scripts/finetune_VitL_with_mask.sh 107 | ``` 108 | 109 | 110 | 111 | 112 | ### Cite 113 | 114 | If you find our work helpful, we'd appreciate it if you could cite our paper in your work. 
115 | ``` 116 | @article{liu2023open, 117 | title={Open-Vocabulary Segmentation with Semantic-Assisted Calibration}, 118 | author={Liu, Yong and Bai, Sule and Li, Guanbin and Wang, Yitong and Tang, Yansong}, 119 | journal={arXiv preprint arXiv:2312.04089}, 120 | year={2023} 121 | } 122 | ``` 123 | -------------------------------------------------------------------------------- /configs/scan_vitB.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCAN" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [6, 9, 12] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | CLIP_VISION_DIM: 768 21 | SCAN_DIM: 512 22 | PATCH_SIZE: 14 23 | SEM_SEG_HEAD: 24 | NAME: "OpenVocaMask2FormerHead" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | IGNORE_VALUE: 255 27 | NUM_CLASSES: 171 # number of categories in training set 28 | EMBEDDING_DIM: 512 29 | EMBED_LAYERS: 2 30 | COMMON_STRIDE: 4 # not used, hard-coded 31 | LOSS_WEIGHT: 1.0 32 | CONVS_DIM: 256 33 | MASK_DIM: 256 34 | NORM: "GN" 35 | # pixel decoder 36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 37 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 39 | COMMON_STRIDE: 4 40 | TRANSFORMER_ENC_LAYERS: 6 41 | MASK_FORMER: 42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 44 | DEEP_SUPERVISION: True 45 | NO_OBJECT_WEIGHT: 0.1 46 | CLASS_WEIGHT: 2.0 47 | MASK_WEIGHT: 5.0 48 | DICE_WEIGHT: 5.0 49 | HIDDEN_DIM: 256 50 | NUM_OBJECT_QUERIES: 100 51 | NHEADS: 8 52 | DROPOUT: 0.0 53 | DIM_FEEDFORWARD: 2048 54 | ENC_LAYERS: 0 55 | PRE_NORM: False 56 | ENFORCE_INPUT_PROJ: False 57 | SIZE_DIVISIBILITY: 32 58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 59 | TRAIN_NUM_POINTS: 12544 60 | OVERSAMPLE_RATIO: 3.0 61 | IMPORTANCE_SAMPLE_RATIO: 0.75 62 | CLIP_ADAPTER: 63 | TEXT_TEMPLATES: "vild" 64 | CLIP_MODEL_NAME: "ViT-B-16" 65 | MASK_FILL: "mean" 66 | MASK_EXPAND_RATIO: 1.0 67 | MASK_MATTING: False # use soft background, default not used 68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings: 71 | # a847: [0.25, 0.75] a150: [0.4, 0.7] pc459: [0.25, 0.7] pc59: [0.25, 0.35] voc20: [0.2, 0.45] 72 | MASK_THR: 0.4 73 | CLIP_ENSEMBLE_WEIGHT: 0.7 74 | # For the REPLACE_RATIO, we have the following settings: 75 | # a847: 0.05 a150: 0.05 pc459: 0.05 pc59: 0.05 voc20: 0.1 76 | REPLACE_RATIO: 0.15 77 | REPLACE_LAYER: [1, 3, 5] 78 | DATASETS: 79 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 80 | TEST: ("ade20k_sem_seg_val",) 81 | SOLVER: 82 | IMS_PER_BATCH: 32 83 | BASE_LR: 0.00006 84 | MAX_ITER: 120000 85 | WARMUP_FACTOR: 1e-6 86 | WARMUP_ITERS: 1500 87 | LR_SCHEDULER_NAME: "WarmupPolyLR" 88 | WEIGHT_DECAY: 0.01 89 | WEIGHT_DECAY_NORM: 0.0 90 | WEIGHT_DECAY_EMBED: 0.0 91 | BACKBONE_MULTIPLIER: 1.0 92 | TEST_IMS_PER_BATCH: 1 93 | CLIP_GRADIENTS: 94 | ENABLED: True 95 | CLIP_TYPE: "full_model" 96 | CLIP_VALUE: 0.01 97 | 
NORM_TYPE: 2.0 98 | INPUT: 99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 100 | MIN_SIZE_TRAIN_SAMPLING: "choice" 101 | MIN_SIZE_TEST: 640 102 | MAX_SIZE_TRAIN: 2560 103 | MAX_SIZE_TEST: 2560 104 | CROP: 105 | ENABLED: True 106 | TYPE: "absolute" 107 | SIZE: (640, 640) 108 | SINGLE_CATEGORY_MAX_AREA: 1.0 109 | COLOR_AUG_SSD: True 110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 111 | FORMAT: "RGB" 112 | TEST: 113 | EVAL_PERIOD: 5000 114 | # SEMANTIC_ON: True 115 | # INSTANCE_ON: False 116 | # PANOPTIC_ON: False 117 | AUG: 118 | ENABLED: False 119 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 120 | MAX_SIZE: 3584 121 | FLIP: True 122 | DATALOADER: 123 | FILTER_EMPTY_ANNOTATIONS: True 124 | NUM_WORKERS: 16 125 | VERSION: 2 126 | METRIC: 'Vanilla' # Vanilla or SG-IoU 127 | OUTPUT_DIR: output/SCAN-VitB -------------------------------------------------------------------------------- /configs/scan_vitL.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCAN" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [12, 18, 24] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | CLIP_VISION_DIM: 1024 21 | SCAN_DIM: 768 22 | PATCH_SIZE: 16 23 | SEM_SEG_HEAD: 24 | NAME: "OpenVocaMask2FormerHead" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | IGNORE_VALUE: 255 27 | NUM_CLASSES: 171 # number of categories in training set 28 | EMBEDDING_DIM: 768 29 | EMBED_LAYERS: 2 30 | COMMON_STRIDE: 4 # not used, hard-coded 31 | LOSS_WEIGHT: 1.0 32 | CONVS_DIM: 256 33 | MASK_DIM: 256 34 | NORM: "GN" 35 | # pixel decoder 36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 37 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 39 | COMMON_STRIDE: 4 40 | TRANSFORMER_ENC_LAYERS: 6 41 | MASK_FORMER: 42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 44 | DEEP_SUPERVISION: True 45 | NO_OBJECT_WEIGHT: 0.1 46 | CLASS_WEIGHT: 2.0 47 | MASK_WEIGHT: 5.0 48 | DICE_WEIGHT: 5.0 49 | HIDDEN_DIM: 256 50 | NUM_OBJECT_QUERIES: 100 51 | NHEADS: 8 52 | DROPOUT: 0.0 53 | DIM_FEEDFORWARD: 2048 54 | ENC_LAYERS: 0 55 | PRE_NORM: False 56 | ENFORCE_INPUT_PROJ: False 57 | SIZE_DIVISIBILITY: 32 58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 59 | TRAIN_NUM_POINTS: 12544 60 | OVERSAMPLE_RATIO: 3.0 61 | IMPORTANCE_SAMPLE_RATIO: 0.75 62 | CLIP_ADAPTER: 63 | TEXT_TEMPLATES: "vild" 64 | CLIP_MODEL_NAME: "ViT-L-14" 65 | MASK_FILL: "mean" 66 | MASK_EXPAND_RATIO: 1.0 67 | MASK_MATTING: False # use soft background, default not used 68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings: 71 | # a847: [0.3, 0.75] a150: [0.55, 0.75] pc459: [0.25, 0.65] pc59: [0.5, 0.5] voc20: [0.2, 0.65] 72 | MASK_THR: 0.4 73 | CLIP_ENSEMBLE_WEIGHT: 0.7 74 | # For the REPLACE_RATIO, we have the following settings: 75 | # a847: 0.15 a150: 0.05 pc459: 0.05 pc59: 0.05 
voc20: 0.1 76 | REPLACE_RATIO: 0.15 77 | REPLACE_LAYER: [1, 3, 5, 7, 9] 78 | DATASETS: 79 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 80 | TEST: ("ade20k_sem_seg_val",) 81 | SOLVER: 82 | IMS_PER_BATCH: 32 83 | BASE_LR: 0.00006 84 | MAX_ITER: 120000 85 | WARMUP_FACTOR: 1e-6 86 | WARMUP_ITERS: 1500 87 | LR_SCHEDULER_NAME: "WarmupPolyLR" 88 | WEIGHT_DECAY: 0.01 89 | WEIGHT_DECAY_NORM: 0.0 90 | WEIGHT_DECAY_EMBED: 0.0 91 | BACKBONE_MULTIPLIER: 1.0 92 | TEST_IMS_PER_BATCH: 1 93 | CLIP_GRADIENTS: 94 | ENABLED: True 95 | CLIP_TYPE: "full_model" 96 | CLIP_VALUE: 0.01 97 | NORM_TYPE: 2.0 98 | INPUT: 99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 100 | MIN_SIZE_TRAIN_SAMPLING: "choice" 101 | MIN_SIZE_TEST: 640 102 | MAX_SIZE_TRAIN: 2560 103 | MAX_SIZE_TEST: 2560 104 | CROP: 105 | ENABLED: True 106 | TYPE: "absolute" 107 | SIZE: (640, 640) 108 | SINGLE_CATEGORY_MAX_AREA: 1.0 109 | COLOR_AUG_SSD: True 110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 111 | FORMAT: "RGB" 112 | TEST: 113 | EVAL_PERIOD: 5000 114 | # SEMANTIC_ON: True 115 | # INSTANCE_ON: False 116 | # PANOPTIC_ON: False 117 | AUG: 118 | ENABLED: False 119 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 120 | MAX_SIZE: 3584 121 | FLIP: True 122 | DATALOADER: 123 | FILTER_EMPTY_ANNOTATIONS: True 124 | NUM_WORKERS: 16 125 | VERSION: 2 126 | METRIC: 'Vanilla' # Vanilla or SG-IoU 127 | OUTPUT_DIR: output/SCAN-VitL -------------------------------------------------------------------------------- /configs/scan_vitL_demo.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCANDEMO" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [12, 18, 24] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | SEM_SEG_HEAD: 21 | NAME: "OpenVocaMask2FormerHead" 22 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 23 | IGNORE_VALUE: 255 24 | NUM_CLASSES: 171 # number of categories in training set 25 | EMBEDDING_DIM: 768 26 | EMBED_LAYERS: 2 27 | COMMON_STRIDE: 4 # not used, hard-coded 28 | LOSS_WEIGHT: 1.0 29 | CONVS_DIM: 256 30 | MASK_DIM: 256 31 | NORM: "GN" 32 | # pixel decoder 33 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 34 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 35 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 36 | COMMON_STRIDE: 4 37 | TRANSFORMER_ENC_LAYERS: 6 38 | MASK_FORMER: 39 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 40 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 41 | DEEP_SUPERVISION: True 42 | NO_OBJECT_WEIGHT: 0.1 43 | CLASS_WEIGHT: 2.0 44 | MASK_WEIGHT: 5.0 45 | DICE_WEIGHT: 5.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 100 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | CLIP_ADAPTER: 60 | TEXT_TEMPLATES: "vild" 61 | CLIP_MODEL_NAME: "ViT-L/14" 62 | MASK_FILL: "mean" 63 | MASK_EXPAND_RATIO: 1.0 64 | MASK_THR: 0.4 # choose the 
foreground objects 65 | MASK_MATTING: False # use soft background, default not used 66 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 67 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 68 | CLIP_ENSEMBLE_WEIGHT: 0.7 69 | REPLACE_RATIO: 0.15 70 | REPLACE_LAYER: [1, 3, 5, 7, 9] 71 | DATASETS: 72 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 73 | TEST: ("ade20k_sem_seg_val",) 74 | SOLVER: 75 | IMS_PER_BATCH: 32 76 | BASE_LR: 0.00006 77 | MAX_ITER: 120000 78 | WARMUP_FACTOR: 1e-6 79 | WARMUP_ITERS: 1500 80 | WEIGHT_DECAY: 0.01 81 | WEIGHT_DECAY_NORM: 0.0 82 | WEIGHT_DECAY_EMBED: 0.0 83 | BACKBONE_MULTIPLIER: 1.0 84 | TEST_IMS_PER_BATCH: 1 85 | CLIP_GRADIENTS: 86 | ENABLED: True 87 | CLIP_TYPE: "full_model" 88 | CLIP_VALUE: 0.01 89 | NORM_TYPE: 2.0 90 | INPUT: 91 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 92 | MIN_SIZE_TRAIN_SAMPLING: "choice" 93 | MIN_SIZE_TEST: 640 94 | MAX_SIZE_TRAIN: 2560 95 | MAX_SIZE_TEST: 2560 96 | CROP: 97 | ENABLED: True 98 | TYPE: "absolute" 99 | SIZE: (640, 640) 100 | SINGLE_CATEGORY_MAX_AREA: 1.0 101 | COLOR_AUG_SSD: True 102 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 103 | FORMAT: "RGB" 104 | TEST: 105 | EVAL_PERIOD: 5000 106 | AUG: 107 | ENABLED: False 108 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 109 | MAX_SIZE: 3584 110 | FLIP: True 111 | DATALOADER: 112 | FILTER_EMPTY_ANNOTATIONS: True 113 | NUM_WORKERS: 16 114 | VERSION: 2 -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output, index=None): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | if index is not None: 17 | mapping = {i: k for k, i in enumerate(index)} 18 | img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)( 19 | img.astype(np.float) 20 | ).astype(np.uint8) 21 | Image.fromarray(img).save(output) 22 | 23 | 24 | if __name__ == "__main__": 25 | dataset_dir = ( 26 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 27 | ) 28 | print('Caution: we only generate the validation set!') 29 | for name in ["validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import tqdm 5 | import os 6 | import os.path as osp 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from PIL import Image 11 | import scipy.io 12 | 13 | def convert_pc59(mask_path, new_mask_path, pc59_dict): 14 | mat = scipy.io.loadmat(mask_path) 15 | mask = mat['LabelMap'] 16 | 17 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255 18 | for trID, clsID in pc59_dict.items(): 19 | mask_copy[mask == clsID] = trID 20 | 21 | min_value = np.amin(mask_copy) 22 | assert min_value >= 0, print(min_value) 23 | Image.fromarray(mask_copy).save(new_mask_path, "PNG") 24 | 25 | def convert_pc459(mask_path, new_mask_path): 26 | mat = scipy.io.loadmat(mask_path) 27 | mask = mat['LabelMap'] 28 | mask = mask - 1 29 | min_value = np.amin(mask) 30 | assert min_value >= 0, print(min_value) 31 | Image.fromarray(mask).save(new_mask_path, "TIFF") 32 | 33 | 34 | if __name__ == "__main__": 35 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) 36 | print('Caution: we only generate the validation set!') 37 | pc_path = dataset_dir / "VOCdevkit/VOC2010" 38 | 39 | val_list = open(pc_path / "pascalcontext_val.txt", "r") 40 | pc459_labels = open(pc_path / "labels.txt", "r") 41 | pc59_labels = open(pc_path / "59_labels.txt", "r") 42 | 43 | pc459_dict = {} 44 | for line in pc459_labels.readlines(): 45 | if ':' in line: 46 | idx, name = line.split(':') 47 | idx = int(idx.strip()) 48 | name = name.strip() 49 | pc459_dict[name] = idx 50 | 51 | pc59_dict = {} 52 | for i, line in enumerate(pc59_labels.readlines()): 53 | name = line.split(':')[-1].strip() 54 | if name is not '': 55 | pc59_dict[i] = pc459_dict[name] 56 | 57 | pc459_dir = pc_path / "annotations_detectron2" / "pc459_val" 58 | pc459_dir.mkdir(parents=True, exist_ok=True) 59 | pc59_dir = pc_path / "annotations_detectron2" / "pc59_val" 60 | pc59_dir.mkdir(parents=True, exist_ok=True) 61 | 62 | for line in tqdm.tqdm(val_list.readlines()): 63 | fileid = line.strip() 64 | ori_mask = f'{pc_path}/trainval/{fileid}.mat' 65 | pc459_dst = f'{pc459_dir}/{fileid}.tif' 66 | pc59_dst = f'{pc59_dir}/{fileid}.png' 67 | if osp.exists(ori_mask): 68 | convert_pc459(ori_mask, pc459_dst) 69 | convert_pc59(ori_mask, pc59_dst, pc59_dict) 70 | -------------------------------------------------------------------------------- /datasets/prepare_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | # Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_voc_sem_seg.py 4 | 5 | import os 6 | import os.path as osp 7 | from pathlib import Path 8 | import tqdm 9 | 10 | import numpy as np 11 | from PIL import Image 12 | 13 | 14 | clsID_to_trID = { 15 | 0: 255, 16 | 1: 0, 17 | 2: 1, 18 | 3: 2, 19 | 4: 3, 20 | 5: 4, 21 | 6: 5, 22 | 7: 6, 23 | 8: 7, 24 | 9: 8, 25 | 10: 9, 26 | 11: 10, 27 | 12: 11, 28 | 13: 12, 29 | 14: 13, 30 | 15: 14, 31 | 16: 15, 32 | 17: 16, 33 | 18: 17, 34 | 19: 18, 35 | 20: 19, 36 | 255: 255, 37 | } 38 | 39 | def convert_to_trainID( 40 | maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix="" 41 | ): 42 | mask = np.array(Image.open(maskpath)) 43 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255 44 | for clsID, trID in clsID_to_trID.items(): 45 | mask_copy[mask == clsID] = trID 46 | seg_filename = ( 47 | osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath)) 48 | if is_train 49 | else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath)) 50 | ) 51 | if len(np.unique(mask_copy)) == 1 and np.unique(mask_copy)[0] == 255: 52 | return 53 | Image.fromarray(mask_copy).save(seg_filename, "PNG") 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) 59 | print('Caution: we only generate the validation set!') 60 | voc_path = dataset_dir / "VOCdevkit" / "VOC2012" 61 | out_mask_dir = voc_path / "annotations_detectron2" 62 | out_image_dir = voc_path / "images_detectron2" 63 | for name in ["val"]: 64 | os.makedirs((out_mask_dir / name), exist_ok=True) 65 | os.makedirs((out_image_dir / name), exist_ok=True) 66 | val_list = [ 67 | osp.join(voc_path, "SegmentationClassAug", f + ".png") 68 | for f in np.loadtxt(osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=np.str).tolist() 69 | ] 70 | for file in tqdm.tqdm(val_list): 71 | convert_to_trainID(file, out_mask_dir, is_train=False) 72 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import argparse 5 | import glob 6 | import multiprocessing as mp 7 | import os 8 | import time 9 | import cv2 10 | import tqdm 11 | 12 | from detectron2.config import get_cfg 13 | 14 | from detectron2.projects.deeplab import add_deeplab_config 15 | from detectron2.data.detection_utils import read_image 16 | from detectron2.utils.logger import setup_logger 17 | from scan import add_ovseg_config 18 | 19 | from scan.utils import VisualizationDemo 20 | 21 | # constants 22 | WINDOW_NAME = "Open vocabulary segmentation" 23 | 24 | 25 | def setup_cfg(args): 26 | # load config from file and command-line arguments 27 | cfg = get_cfg() 28 | # for poly lr schedule 29 | add_deeplab_config(cfg) 30 | add_ovseg_config(cfg) 31 | cfg.merge_from_file(args.config_file) 32 | cfg.merge_from_list(args.opts) 33 | cfg.freeze() 34 | return cfg 35 | 36 | 37 | def get_parser(): 38 | parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation") 39 | parser.add_argument( 40 | "--config-file", 41 | default="configs/ovseg_swinB_vitL_mask2former_demo.yaml", 42 | metavar="FILE", 43 | help="path to config file", 44 | ) 45 | parser.add_argument( 46 | "--input", 47 | nargs="+", 48 | help="A list of space separated input images; " 49 | "or a single glob pattern such as 'directory/*.jpg'", 50 | default='./data/ADEChallengeData2016/images/validation/*.jpg' 51 | ) 52 | parser.add_argument( 53 | "--class-names", 54 | nargs="+", 55 | default="building", 56 | help="A list of user-defined class_names" 57 | ) 58 | parser.add_argument( 59 | "--output", 60 | default='./pred', 61 | help="A file or directory to save output visualizations. " 62 | "If not given, will show output in an OpenCV window.", 63 | ) 64 | parser.add_argument( 65 | "--opts", 66 | help="Modify config options using the command-line 'KEY VALUE' pairs", 67 | default=['MODEL.WEIGHTS', '/opt/tiger/ljyaronld/OVSeg/ckpt/SwinB-Mask2Former-openclip_datacomp-frequency_121824-aux-split_query_only_crossattn-final.pth'], 68 | nargs=argparse.REMAINDER, 69 | ) 70 | return parser 71 | 72 | 73 | if __name__ == "__main__": 74 | mp.set_start_method("spawn", force=True) 75 | args = get_parser().parse_args() 76 | setup_logger(name="fvcore") 77 | logger = setup_logger() 78 | logger.info("Arguments: " + str(args)) 79 | 80 | cfg = setup_cfg(args) 81 | 82 | demo = VisualizationDemo(cfg) 83 | classes = [] 84 | with open('/opt/tiger/ljyaronld/OVSeg/a_150.txt', 'r') as file: 85 | for line in file: 86 | classes.append(line.strip()) 87 | class_names = classes 88 | class_names = args.class_names 89 | if args.input: 90 | if len(args.input) == 1: 91 | args.input = glob.glob(os.path.expanduser(args.input[0])) 92 | assert args.input, "The input path(s) was not found" 93 | for path in tqdm.tqdm(args.input, disable=not args.output): 94 | # use PIL, to be consistent with evaluation 95 | img = read_image(path, format="BGR") 96 | start_time = time.time() 97 | predictions, visualized_output = demo.run_on_image(img, class_names) 98 | logger.info( 99 | "{}: {} in {:.2f}s".format( 100 | path, 101 | "detected {} instances".format(len(predictions["instances"])) 102 | if "instances" in predictions 103 | else "finished", 104 | time.time() - start_time, 105 | ) 106 | ) 107 | 108 | if args.output: 109 | if os.path.isdir(args.output): 110 | assert os.path.isdir(args.output), args.output 111 | out_filename = os.path.join(args.output, os.path.basename(path)) 112 | else: 113 | assert len(args.input) == 1, "Please specify a directory with args.output" 114 | 
out_filename = args.output 115 | visualized_output.save(out_filename) 116 | else: 117 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 118 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 119 | if cv2.waitKey(0) == 27: 120 | break # esc to quit 121 | else: 122 | raise NotImplementedError -------------------------------------------------------------------------------- /imgs/cs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/cs.png -------------------------------------------------------------------------------- /imgs/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/pipeline.png -------------------------------------------------------------------------------- /imgs/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/results.png -------------------------------------------------------------------------------- /imgs/visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/visual.png -------------------------------------------------------------------------------- /open_clip_training/.github/workflows/clear-cache.yml: -------------------------------------------------------------------------------- 1 | name: Clear cache 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | actions: write 8 | 9 | jobs: 10 | clear-cache: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clear cache 14 | uses: actions/github-script@v6 15 | with: 16 | script: | 17 | const caches = await github.rest.actions.getActionsCacheList({ 18 | owner: context.repo.owner, 19 | repo: context.repo.repo, 20 | }) 21 | for (const cache of caches.data.actions_caches) { 22 | console.log(cache) 23 | await github.rest.actions.deleteActionsCacheById({ 24 | owner: context.repo.owner, 25 | repo: context.repo.repo, 26 | cache_id: cache.id, 27 | }) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /open_clip_training/.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions-ecosystem/action-regex-match@v2 13 | id: regex-match 14 | with: 15 | text: ${{ github.event.head_commit.message }} 16 | regex: '^Release ([^ ]+)' 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Release 26 | if: ${{ steps.regex-match.outputs.match != '' }} 27 | uses: softprops/action-gh-release@v1 28 | with: 29 | tag_name: v${{ steps.regex-match.outputs.group1 }} 30 | - name: Build and publish 31 | if: ${{ steps.regex-match.outputs.match != '' }} 32 | env: 33 | TWINE_USERNAME: __token__ 34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | twine upload dist/* 38 | 
-------------------------------------------------------------------------------- /open_clip_training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | wandb/ 3 | models/ 4 | features/ 5 | results/ 6 | 7 | tests/data/ 8 | *.pt 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | sync.sh 140 | gpu1sync.sh 141 | .idea 142 | *.pdf 143 | **/._* 144 | **/*DS_* 145 | **.jsonl 146 | src/sbatch 147 | src/misc 148 | .vscode 149 | src/debug 150 | core.* 151 | 152 | # Allow 153 | !src/evaluation/misc/results_dbs/* -------------------------------------------------------------------------------- /open_clip_training/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: If you use this software, please cite it as below. 
3 | authors: 4 | - family-names: Ilharco 5 | given-names: Gabriel 6 | - family-names: Wortsman 7 | given-names: Mitchell 8 | - family-names: Wightman 9 | given-names: Ross 10 | - family-names: Gordon 11 | given-names: Cade 12 | - family-names: Carlini 13 | given-names: Nicholas 14 | - family-names: Taori 15 | given-names: Rohan 16 | - family-names: Dave 17 | given-names: Achal 18 | - family-names: Shankar 19 | given-names: Vaishaal 20 | - family-names: Namkoong 21 | given-names: Hongseok 22 | - family-names: Miller 23 | given-names: John 24 | - family-names: Hajishirzi 25 | given-names: Hannaneh 26 | - family-names: Farhadi 27 | given-names: Ali 28 | - family-names: Schmidt 29 | given-names: Ludwig 30 | title: OpenCLIP 31 | version: v0.1 32 | doi: 10.5281/zenodo.5143773 33 | date-released: 2021-07-28 34 | -------------------------------------------------------------------------------- /open_clip_training/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman, 2 | Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, 3 | John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, 4 | Ludwig Schmidt 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /open_clip_training/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/open_clip/bpe_simple_vocab_16e6.txt.gz 2 | include src/open_clip/model_configs/*.json 3 | 4 | -------------------------------------------------------------------------------- /open_clip_training/Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 
4 | 5 | install-training: 6 | python -m pip install -r requirements-training.txt 7 | 8 | install-test: ## [Local development] Install test requirements 9 | python -m pip install -r requirements-test.txt 10 | 11 | test: ## [Local development] Run unit tests 12 | python -m pytest -x -s -v tests 13 | -------------------------------------------------------------------------------- /open_clip_training/docs/LOW_ACC.md: -------------------------------------------------------------------------------- 1 | As we describe in more detail below, CLIP models in a medium accuracy regime already allow us to draw conclusions about the robustness of larger CLIP models since the models follow reliable scaling laws. 2 | 3 | [Cherti et al., 2022](https://arxiv.org/abs/2212.07143) and [Gadre et al., 2023](https://arxiv.org/abs/2304.14108) show additional discussions about the scaling behavior of CLIP models. 4 | 5 | ## Scaling trends 6 | 7 | The plot below shows how zero-shot performance of CLIP models varies as we scale the number of samples used for training. Zero-shot performance increases steadily for both ImageNet and [ImageNetV2](https://arxiv.org/abs/1902.10811), and is far from saturated at ~15M samples. 8 | 9 | 10 | 11 | ## Why are low-accuracy CLIP models interesting? 12 | 13 | **TL;DR:** CLIP models have high effective robustness, even at small scales. 14 | 15 | CLIP models are particularly intriguing because they are more robust to natural distribution shifts (see Section 3.3 in the [CLIP paper](https://arxiv.org/abs/2103.00020)). 16 | This phenomena is illustrated by the figure below, with ImageNet accuracy on the x-axis 17 | and [ImageNetV2](https://arxiv.org/abs/1902.10811) (a reproduction of the ImageNet validation set with distribution shift) accuracy on the y-axis. 18 | Standard training denotes training on the ImageNet train set and the CLIP zero-shot models 19 | are shown as stars. 20 | 21 | ![CLIP scatter plot](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/effective_robustness.png) 22 | 23 | As observed by [Taori et al., 2020](https://arxiv.org/abs/2007.00644) and [Miller et al., 2021](https://arxiv.org/abs/2107.04649), the in-distribution 24 | and out-of-distribution accuracies of models trained on ImageNet follow a predictable linear trend (the red line in the above plot). *Effective robustness* 25 | quantifies robustness as accuracy beyond this baseline, i.e., how far a model lies above the red line. Ideally a model would not suffer from distribution shift and fall on the y = x line ([trained human labelers are within a percentage point of the y = x line](http://proceedings.mlr.press/v119/shankar20c.html)). 26 | 27 | Even though the CLIP models trained with 28 | this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same 29 | trend of improved effective robustness (the purple line). Therefore, we can study what makes 30 | CLIP robust without requiring industrial-scale compute. 31 | 32 | For more information on effective robustness, please see: 33 | 34 | - [Recht et al., 2019](https://arxiv.org/abs/1902.10811). 35 | - [Taori et al., 2020](https://arxiv.org/abs/2007.00644). 36 | - [Miller et al., 2021](https://arxiv.org/abs/2107.04649). 37 | 38 | To know more about the factors that contribute to CLIP's robustness refer to [Fang et al., 2022](https://arxiv.org/abs/2205.01397). 
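To make the notion concrete: effective robustness is the gap between a model's out-of-distribution accuracy and the accuracy predicted by the baseline trend. The sketch below is a minimal illustration, assuming the trend has already been fit as a line on logit-transformed accuracies (one common convention in this line of work); the slope, intercept, and accuracy values shown are hypothetical placeholders, not numbers from this repository.

```python
import numpy as np

def logit(p):
    # accuracies are mapped to logit scale before fitting the linear trend
    return np.log(p / (1.0 - p))

def effective_robustness(in_dist_acc, out_dist_acc, slope, intercept):
    """Accuracy above the baseline ImageNet -> ImageNetV2 trend (trend fit in logit space)."""
    predicted_logit = slope * logit(in_dist_acc) + intercept
    predicted_acc = 1.0 / (1.0 + np.exp(-predicted_logit))  # map back to accuracy scale
    return out_dist_acc - predicted_acc

# hypothetical example: a CLIP model at 62% ImageNet / 55% ImageNetV2 accuracy,
# measured against a hypothetical baseline fit (slope=0.9, intercept=-0.4)
print(effective_robustness(0.62, 0.55, slope=0.9, intercept=-0.4))
```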
-------------------------------------------------------------------------------- /open_clip_training/docs/clip_conceptual_captions.md: -------------------------------------------------------------------------------- 1 | ## Additional training curves for CLIP on Conceptual Captions 2 | 3 | # Zero shot accuracy 4 | ![](/docs/clip_zeroshot.png) 5 | 6 | # Training loss curve 7 | ![](/docs/clip_loss.png) 8 | 9 | # Validation loss curve 10 | ![](/docs/clip_val_loss.png) 11 | 12 | # Validation recall 13 | ![](/docs/clip_recall.png) -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.56e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3072 \ 11 | --wd 0.2 \ 12 | --batch-size 1024 \ 13 | --aug-cfg scale='(0.4, 1.0)' \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-B-16-CL16 \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 256 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' 28 | -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.048e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 782 \ 11 | --wd 0.2 \ 12 | --batch-size 8192 \ 13 | --aug-cfg scale='(0.4, 1.0)' \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-B-16-CL16 \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 112 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.24e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3571 \ 11 | --wd 0.2 \ 12 | --batch-size 896 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-L-16-CL16-GAP \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 293 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' 
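As a reference, a checkpoint written by a finetuning run like the one above (which already tracks ImageNet zero-shot accuracy via `--imagenet-val`) can also be sanity-checked outside the training loop. The sketch below assumes the vendored open_clip package exposes the standard `create_model_and_transforms` / `get_tokenizer` factory API; the model name, checkpoint path, and image file are placeholders — substitute the `--model` value and checkpoint path from your own run.

```python
import torch
from PIL import Image
import open_clip

# placeholders: match these to the --model flag and checkpoint of your run
model_name = "ViT-B-16"
checkpoint = "/path/to/logs/checkpoints/epoch_1.pt"

model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=checkpoint)
tokenizer = open_clip.get_tokenizer(model_name)
model.eval()

image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder image
text = tokenizer(["a photo of a dog", "a photo of a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # normalize embeddings before computing cosine-similarity logits
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(probs)  # per-image probabilities over the candidate captions
```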
-------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "1.024e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 1563 \ 11 | --wd 0.2 \ 12 | --batch-size 4096 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-L-16-CL16-GAP \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 64 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 64 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.24e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3571 \ 11 | --wd 0.2 \ 12 | --batch-size 896 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-L-16-CL32-GAP \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 293 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "1.024e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 1563 \ 11 | --wd 0.2 \ 12 | --batch-size 4096 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-L-16-CL8-Syntax-GAP \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 96 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 64 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i257_t32_finetunex4.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. 
Tune batchsize, warmup, and lr accordingly if you have different machine setups. 4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 131072000 \ 10 | --dataset-type webdataset \ 11 | --lr "5.12e-5" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 800 \ 15 | --wd 0.2 \ 16 | --batch-size 4096 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 4 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL32-GAP \ 21 | --pretrained '/path/to/pretrain84_ckpt' \ 22 | --precision 'amp_bf16' \ 23 | --ddp-static-graph \ 24 | --local-loss \ 25 | --gather-with-grad \ 26 | --force-image-size 224 \ 27 | --force-patch-dropout 0.3 \ 28 | --grad-checkpointing \ 29 | --log-every-n-steps 64 \ 30 | --seed 0 \ 31 | --logs ./logs/ \ 32 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i50_t8_pretrain.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups. 4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 4e8 \ 10 | --dataset-type webdataset \ 11 | --lr "2.048e-3" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 3200 \ 15 | --wd 0.2 \ 16 | --batch-size 8192 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 32 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL8-Syntax-GAP \ 21 | --precision 'amp_bf16' \ 22 | --ddp-static-graph \ 23 | --local-loss \ 24 | --gather-with-grad \ 25 | --force-image-size 84 \ 26 | --grad-checkpointing \ 27 | --log-every-n-steps 32 \ 28 | --seed 0 \ 29 | --logs ./logs/ \ 30 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i577_t32_finetunex1.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups. 
4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 131072000 \ 10 | --dataset-type webdataset \ 11 | --lr "6.4e-6" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 1600 \ 15 | --wd 0.2 \ 16 | --batch-size 2048 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 1 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL32-GAP \ 21 | --pretrained '/path/to/finetune224_ckpt' \ 22 | --precision 'amp_bf16' \ 23 | --ddp-static-graph \ 24 | --local-loss \ 25 | --gather-with-grad \ 26 | --force-image-size 336 \ 27 | --force-patch-dropout 0.4 \ 28 | --grad-checkpointing \ 29 | --log-every-n-steps 64 \ 30 | --seed 0 \ 31 | --logs ./logs/ \ 32 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/stability_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=g40423 3 | #SBATCH --job-name=testopenclip 4 | #SBATCH --nodes 30 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --output=%x_%j.out 8 | #SBATCH --comment=laion 9 | #SBATCH --open-mode=append 10 | #SBATCH --exclusive 11 | 12 | module load openmpi 13 | module load cuda/11.7 14 | 15 | export MASTER_ADDR=`hostname` 16 | export MASTER_PORT=12802 17 | export NCCL_PROTO=simple 18 | export FI_EFA_FORK_SAFE=1 19 | export FI_LOG_LEVEL=1 20 | export FI_EFA_USE_DEVICE_RDMA=1 21 | export NCCL_DEBUG=info 22 | 23 | export PYTHONFAULTHANDLER=1 24 | 25 | export CUDA_LAUNCH_BLOCKING=0 26 | export OMPI_MCA_mtl_base_verbose=1 27 | export FI_EFA_ENABLE_SHM_TRANSFER=0 28 | export FI_PROVIDER=efa 29 | export FI_EFA_TX_MIN_CREDITS=64 30 | export NCCL_TREE_THRESHOLD=0 31 | 32 | cd /admin/home-mitchellw/open_clip/src 33 | export PYTHONPATH="$PYTHONPATH:/admin/home-mitchellw/open_clip/src" 34 | 35 | EXP_NAME="test-B-32-laion5b-lr1e-3-bs90k" 36 | 37 | srun --comment laion --cpu_bind=v --accel-bind=gn python -m training.main \ 38 | --save-frequency 1 \ 39 | --train-data="pipe:aws s3 cp s3://s-datasets/laion5b/{laion2B-data/{000000..231349}.tar,laion2B-multi-data/{000000..226687}.tar,laion1B-nolang-data/{000000..127231}.tar} -" \ 40 | --train-num-samples 135646078 \ 41 | --dataset-type webdataset \ 42 | --dataset-resampled \ 43 | --warmup 2000 \ 44 | --batch-size=375 \ 45 | --epochs=97 \ 46 | --lr 1e-3 \ 47 | --workers=8 \ 48 | --report-to wandb \ 49 | --name ${EXP_NAME} \ 50 | --logs /scratch/logs/ \ 51 | --model ViT-B-32 \ 52 | --seed 0 \ 53 | --ddp-static-graph \ 54 | --local-loss \ 55 | --gather-with-grad \ 56 | --grad-checkpointing \ 57 | --precision amp_bfloat16 \ 58 | --wandb-project-name open_clip6 \ 59 | --resume "latest" \ 60 | --remote-sync s3://s-laion/mitchellw/logs 61 | -------------------------------------------------------------------------------- /open_clip_training/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | regression_test 4 | -------------------------------------------------------------------------------- /open_clip_training/scripts/clipav1_vit_l16_i37_t8.sh: -------------------------------------------------------------------------------- 1 | # eval on a single gpu 2 | CUDA_VISIBLE_DEVICES=2 TORCH_CUDNN_V8_API_ENABLED=1 
TFDS_PREFETCH_SIZE=8192 python3 -m training.main \ 3 | --model ViT-L-16-CL32-GAP \ 4 | --pretrained "/path/to/clipa_vit_l16_i37_t8.pt" \ 5 | --seed 0 \ 6 | --imagenet-val '/path/to/ImageNet/val' -------------------------------------------------------------------------------- /open_clip_training/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=1 python3 -m training.main \ 2 | --model ViT-H-14-CL32-GAP-BigVision \ 3 | --pretrained "/path/to/vit_h14_i84_224_336_cl32_gap_datacomp1b.pt" \ 4 | --force-image-size 336 \ 5 | --square-resize-only \ 6 | --interpolation 'bilinear' \ 7 | --image-mean 0.485 0.456 0.406 \ 8 | --image-std 0.229 0.224 0.225 \ 9 | --seed 0 \ 10 | --imagenet-val '/path/to/ImageNet/val' 11 | -------------------------------------------------------------------------------- /open_clip_training/scripts/h14_224_32_finetune.sh: -------------------------------------------------------------------------------- 1 | # 64k batchsize for 2.048e-3 lr 2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \ 3 | --save-frequency 1 \ 4 | --save-most-recent \ 5 | --zeroshot-frequency 1 \ 6 | --train-data '/path/to/laion' \ 7 | --dataset-type webdataset \ 8 | --lr "2.048e-3" \ 9 | --beta1 0.9 \ 10 | --beta2 0.95 \ 11 | --warmup 782 \ 12 | --wd 0.2 \ 13 | --batch-size 4096 \ 14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 15 | --epochs=7 \ 16 | --workers=6 \ 17 | --model ViT-H-14-CL32-GAP \ 18 | --precision 'amp_bf16' \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 224 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/ImageNet/val' \ 27 | --name 'name' \ 28 | --report-to "wandb" \ 29 | --wandb-project-name "project_name" 30 | 31 | 32 | -------------------------------------------------------------------------------- /open_clip_training/scripts/h14_84_8_pretrain.sh: -------------------------------------------------------------------------------- 1 | # 64k batchsize for 2.048e-3 lr 2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \ 3 | --save-frequency 1 \ 4 | --save-most-recent \ 5 | --zeroshot-frequency 1 \ 6 | --train-data '/path/to/laion' \ 7 | --dataset-type webdataset \ 8 | --lr "2.048e-3" \ 9 | --beta1 0.9 \ 10 | --beta2 0.95 \ 11 | --warmup 782 \ 12 | --wd 0.2 \ 13 | --batch-size 4096 \ 14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 15 | --epochs=7 \ 16 | --workers=6 \ 17 | --model ViT-H-14-CL8-SyntaxMask-GAP \ 18 | --precision 'amp_bf16' \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 84 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/ImageNet/val' \ 27 | --name 'name' \ 28 | --report-to "wandb" \ 29 | --wandb-project-name "project_name" 30 | 31 | 32 | -------------------------------------------------------------------------------- /open_clip_training/setup.py: -------------------------------------------------------------------------------- 1 | """ Setup 2 | """ 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with 
open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | def _read_reqs(relpath): 14 | fullpath = path.join(path.dirname(__file__), relpath) 15 | with open(fullpath) as f: 16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))] 17 | 18 | REQUIREMENTS = _read_reqs("requirements.txt") 19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt") 20 | 21 | exec(open('src/open_clip/version.py').read()) 22 | setup( 23 | name='open_clip_torch', 24 | version=__version__, 25 | description='OpenCLIP', 26 | long_description=long_description, 27 | long_description_content_type='text/markdown', 28 | url='https://github.com/mlfoundations/open_clip', 29 | author='', 30 | author_email='', 31 | classifiers=[ 32 | # How mature is this project? Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Education', 38 | 'Intended Audience :: Science/Research', 39 | 'License :: OSI Approved :: Apache Software License', 40 | 'Programming Language :: Python :: 3.7', 41 | 'Programming Language :: Python :: 3.8', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Topic :: Scientific/Engineering', 45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 46 | 'Topic :: Software Development', 47 | 'Topic :: Software Development :: Libraries', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | 51 | # Note that this is a string of words separated by whitespace, not a list. 52 | keywords='CLIP pretrained', 53 | package_dir={'': 'src'}, 54 | packages=find_packages(where='src'), 55 | include_package_data=True, 56 | install_requires=REQUIREMENTS, 57 | extras_require={ 58 | "training": TRAINING_REQUIREMENTS, 59 | }, 60 | python_requires='>=3.7', 61 | ) 62 | -------------------------------------------------------------------------------- /open_clip_training/src/clip_adapter/clip_adapter.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union, Callable, Optional, List 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | from open_clip.factory import create_model_and_transforms 9 | import copy 10 | 11 | class ClipAdapter(nn.Module): 12 | def __init__(self, args, device): 13 | super().__init__() 14 | model, preprocess_train, preprocess_val, preprocess_val_entire = create_model_and_transforms( 15 | args.model, 16 | args.pretrained, 17 | precision=args.precision, 18 | device=device, 19 | jit=args.torchscript, 20 | force_quick_gelu=args.force_quick_gelu, 21 | force_custom_text=args.force_custom_text, 22 | force_patch_dropout=args.force_patch_dropout, 23 | force_image_size=args.force_image_size, 24 | image_mean=args.image_mean, 25 | image_std=args.image_std, 26 | image_interpolation=args.image_interpolation, 27 | image_resize_mode=args.image_resize_mode, # only effective for inference 28 | aug_cfg=args.aug_cfg, 29 | pretrained_image=args.pretrained_image, 30 | output_dict=True, 31 | with_mask=args.with_mask, 32 | mask_emb_depth=args.mask_emb_depth 33 | ) 34 | 35 | self.clip_model = model 36 | self.preprocess_train = preprocess_train 37 | self.preprocess_val = preprocess_val 38 | self.preprocess_val_entire = preprocess_val_entire 39 | 40 | self.original_clip_visual = copy.deepcopy(model.visual) 41 | for _, param in 
self.original_clip_visual.named_parameters(): 42 | param.requires_grad = False 43 | 44 | def forward(self, original_image, image, text, mask=None): 45 | if image is None: 46 | return self.clip_model.encode_text(text) # text-only path: delegate to the wrapped CLIP text encoder 47 | elif text is None: 48 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 49 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768] 50 | 51 | image_features = F.normalize(image_features, dim=-1) # [32, 768] 52 | return {'image_features': image_features} 53 | 54 | if mask is None: 55 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 56 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features) # [32, 768] 57 | else: 58 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 59 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768] 60 | 61 | image_features = F.normalize(image_features, dim=-1) # [32, 768] 62 | 63 | text_features = self.clip_model.encode_text(text) 64 | text_features = F.normalize(text_features, dim=-1) # [32, 768] 65 | 66 | # return image_features, text_features, self.clip_model.logit_scale.exp() 67 | out_dict = { 68 | "image_features": image_features, 69 | "text_features": text_features, 70 | "logit_scale": self.clip_model.logit_scale.exp() 71 | } 72 | if self.clip_model.logit_bias is not None: 73 | out_dict['logit_bias'] = self.clip_model.logit_bias 74 | return out_dict -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \ 8 | get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg 9 | from .openai import load_openai_model, list_openai_models 10 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 11 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 12 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 13 | from .tokenizer import SimpleTokenizer, tokenize, decode 14 | from .transform import image_transform, AugmentationCfg 15 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy 16 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES 17 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz
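The ClipAdapter above is a thin wrapper around this fork's create_model_and_transforms, which (unlike upstream open_clip) also returns a preprocess_val_entire transform and accepts with_mask/mask_emb_depth; the wrapper keeps a frozen copy of the original visual tower and feeds its embeddings, plus an optional mask, into encode_image. For orientation, here is a minimal sketch of calling the factory API exported by the __init__.py above directly; the model tag, checkpoint path, and prompts are placeholders, not values taken from this repo:

import torch
import torch.nn.functional as F
from open_clip import create_model_and_transforms, get_tokenizer

# This fork's factory returns four values (see ClipAdapter.__init__ above);
# upstream open_clip returns only three.
model, preprocess_train, preprocess_val, preprocess_val_entire = create_model_and_transforms(
    'ViT-L-14',                   # placeholder model tag from model_configs/
    pretrained='/path/to/ckpt',   # placeholder checkpoint path, as in the scripts above
    output_dict=True,
)
tokenizer = get_tokenizer('ViT-L-14')

with torch.no_grad():
    text = tokenizer(['a photo of a cat', 'a photo of a dog'])
    # Unit-normalized text embeddings, matching what ClipAdapter.forward returns.
    text_features = F.normalize(model.encode_text(text), dim=-1)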
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 4 | IMAGENET_STD = (0.229, 0.224, 0.225) 5 | INCEPTION_MEAN = (0.5, 0.5, 0.5) 6 | INCEPTION_STD = (0.5, 0.5, 0.5) 7 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | # https://huggingface.co/docs/transformers/model_doc/bert 46 | "bert": { 47 | "config_names": { 48 | "context_length": "max_position_embeddings", 49 | "vocab_size": "vocab_size", 50 | "width": "hidden_size", 51 | "heads": "num_attention_heads", 52 | "layers": "num_hidden_layers", 53 | }, 54 | "pooler": "cls_pooler", 55 | }, 56 | # https://huggingface.co/docs/transformers/model_doc/m2m_100 57 | "m2m_100": { 58 | "config_names": { 59 | "context_length": "max_position_embeddings", 60 | "vocab_size": "vocab_size", 61 | "width": "d_model", 62 | "heads": "encoder_attention_heads", 63 | "layers": "encoder_layers", 64 | }, 65 | "pooler": "cls_pooler", 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA01-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | 
"timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA01-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_base_patch16_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-E-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1280, 14 | "heads": 20, 15 | "layers": 32 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-E-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "timm_model_name": "eva02_large_patch14_clip_336", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-L-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_large_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 
10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_base_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 512, 7 | "timm_model_name": "vit_base_patch16_siglip_512", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- 
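Each JSON under model_configs/ is registered under its file name, so a tag such as 'ViT-B-16-SigLIP-384' resolves to the file above, with its vision_cfg and text_cfg blocks populating the model's vision and text configuration. A small sketch for inspecting these configs programmatically, using only helpers exported by the package __init__.py shown earlier; the printed values assume the JSON files above are unchanged:

from open_clip import list_models, get_model_config

# Registered model names are the config file names without the .json suffix.
siglip_names = [name for name in list_models() if 'SigLIP' in name]
print(siglip_names)

cfg = get_model_config('ViT-B-16-SigLIP-384')
print(cfg['embed_dim'])                   # 768, per the JSON above
print(cfg['vision_cfg']['image_size'])    # 384
print(cfg['text_cfg']['context_length'])  # 64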
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 250000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_base_patch16_siglip_224", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | 
"layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-378-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 378, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": 
"last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": "last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 24, 7 | "width": 1024, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_large_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 
32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_large_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 
6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_so400m_patch14_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_so400m_patch14_siglip_224", 8 | 
"timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 16, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": "bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": "bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | 
"width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "hf_proj_type": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | 
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 
9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/mt5-base-ViT-B-32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "hf_pooler_type": "mean_pooler" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-base-siglip.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "custom_text": true, 4 | "init_logit_bias": -10, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_base_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "hf_model_name": "facebook/nllb-200-distilled-600M", 14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", 15 | "hf_proj_type": "linear", 16 | "hf_pooler_type": "cls_pooler" 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "facebook/nllb-200-distilled-600M", 11 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", 12 | "hf_proj_type": "linear", 13 | "hf_pooler_type": "cls_pooler" 14 | } 15 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-large-siglip.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "custom_text": true, 4 | "init_logit_bias": -10, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_so400m_patch14_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "hf_model_name": "facebook/nllb-200-distilled-1.3B", 14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", 15 | "hf_proj_type": "linear", 16 | "hf_pooler_type": "cls_pooler" 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "facebook/nllb-200-distilled-1.3B", 12 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", 13 | "hf_proj_type": "linear", 14 | "hf_pooler_type": 
"cls_pooler" 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "hf_pooler_type": "mean_pooler" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | 
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 13 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 14 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 15 | 16 | __all__ = ["list_openai_models", "load_openai_model"] 17 | 18 | 19 | def list_openai_models() -> List[str]: 20 | """Returns the names of available CLIP models""" 21 | return list_pretrained_models_by_tag('openai') 22 | 23 | 24 | def load_openai_model( 25 | name: str, 26 | precision: Optional[str] = None, 27 | device: Optional[Union[str, torch.device]] = None, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | cache_dir : Optional[str] 41 | The directory to cache the downloaded model weights 42 | 43 | Returns 44 | ------- 45 | model : torch.nn.Module 46 | The CLIP model 47 | preprocess : Callable[[PIL.Image], torch.Tensor] 48 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 49 | """ 50 | if device is None: 51 | device = "cuda" if torch.cuda.is_available() else "cpu" 52 | if precision is None: 53 | precision = 'fp32' if device == 'cpu' else 'fp16' 54 | 55 | if get_pretrained_url(name, 'openai'): 56 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir) 57 | elif os.path.isfile(name): 58 | model_path = name 59 | else: 60 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 61 | 62 | try: 63 | # loading JIT archive 64 | model = torch.jit.load(model_path, map_location="cpu").eval() 65 | state_dict = None 66 | except RuntimeError: 67 | # loading saved state dict 68 | state_dict = torch.load(model_path, map_location="cpu") 69 | 70 | # Build a non-jit model from the OpenAI jitted model state dict 71 | cast_dtype = get_cast_dtype(precision) 72 | try: 73 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 74 | except KeyError: 75 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 76 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 77 | 78 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 79 | model = model.to(device) 80 | # FIXME support pure fp16/bf16 precision modes 81 | if precision != 'fp16': 82 | model.float() 83 | if precision == 'bf16': 84 | # for bf16, convert back to low-precision 85 | convert_weights_to_lp(model, dtype=torch.bfloat16) 86 | 87 | # add mean / std attributes for consistency with OpenCLIP models 88 | model.visual.image_mean = OPENAI_DATASET_MEAN 89 | model.visual.image_std = 
OPENAI_DATASET_STD 90 | return model 91 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/pos_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # Position embedding utils 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | 12 | import torch 13 | 14 | # -------------------------------------------------------- 15 | # 2D sine-cosine position embedding 16 | # References: 17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 18 | # MoCo v3: https://github.com/facebookresearch/moco-v3 19 | # -------------------------------------------------------- 20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 21 | """ 22 | grid_size: int of the grid height and width 23 | return: 24 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 25 | """ 26 | grid_h = np.arange(grid_size, dtype=np.float32) 27 | grid_w = np.arange(grid_size, dtype=np.float32) 28 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 29 | grid = np.stack(grid, axis=0) 30 | 31 | grid = grid.reshape([2, 1, grid_size, grid_size]) 32 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 33 | if cls_token: 34 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 35 | return pos_embed 36 | 37 | 38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 39 | assert embed_dim % 2 == 0 40 | 41 | # use half of dimensions to encode grid_h 42 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 43 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 44 | 45 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 46 | return emb 47 | 48 | 49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 50 | """ 51 | embed_dim: output dimension for each position 52 | pos: a list of positions to be encoded: size (M,) 53 | out: (M, D) 54 | """ 55 | assert embed_dim % 2 == 0 56 | omega = np.arange(embed_dim // 2, dtype=float) 57 | omega /= embed_dim / 2. 58 | omega = 1. 
/ 10000**omega # (D/2,) 59 | 60 | pos = pos.reshape(-1) # (M,) 61 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 62 | 63 | emb_sin = np.sin(out) # (M, D/2) 64 | emb_cos = np.cos(out) # (M, D/2) 65 | 66 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 67 | return emb 68 | 69 | 70 | # -------------------------------------------------------- 71 | # Interpolate position embeddings for high-resolution 72 | # References: 73 | # DeiT: https://github.com/facebookresearch/deit 74 | # -------------------------------------------------------- 75 | def interpolate_pos_embed(model, checkpoint_model): 76 | if 'pos_embed' in checkpoint_model: 77 | pos_embed_checkpoint = checkpoint_model['pos_embed'] 78 | embedding_size = pos_embed_checkpoint.shape[-1] 79 | num_patches = model.patch_embed.num_patches 80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 81 | # height (== width) for the checkpoint position embedding 82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 83 | # height (== width) for the new position embedding 84 | new_size = int(num_patches ** 0.5) 85 | # class_token and dist_token are kept unchanged 86 | if orig_size != new_size: 87 | print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) 88 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 89 | # only the position tokens are interpolated 90 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 91 | pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) 92 | pos_tokens = torch.nn.functional.interpolate( 93 | pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) 94 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 95 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 96 | checkpoint_model['pos_embed'] = new_pos_embed 97 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | import torch 5 | from torch import nn as nn 6 | from torchvision.ops.misc import FrozenBatchNorm2d 7 | 8 | 9 | def freeze_batch_norm_2d(module, module_match={}, name=''): 10 | """ 11 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 12 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 13 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 14 | 15 | Args: 16 | module (torch.nn.Module): Any PyTorch module. 
17 | module_match (dict): Dictionary of full module names to freeze (all if empty) 18 | name (str): Full module name (prefix) 19 | 20 | Returns: 21 | torch.nn.Module: Resulting module 22 | 23 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 24 | """ 25 | res = module 26 | is_match = True 27 | if module_match: 28 | is_match = name in module_match 29 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 30 | res = FrozenBatchNorm2d(module.num_features) 31 | res.num_features = module.num_features 32 | res.affine = module.affine 33 | if module.affine: 34 | res.weight.data = module.weight.data.clone().detach() 35 | res.bias.data = module.bias.data.clone().detach() 36 | res.running_mean.data = module.running_mean.data 37 | res.running_var.data = module.running_var.data 38 | res.eps = module.eps 39 | else: 40 | for child_name, child in module.named_children(): 41 | full_child_name = '.'.join([name, child_name]) if name else child_name 42 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 43 | if new_child is not child: 44 | res.add_module(child_name, new_child) 45 | return res 46 | 47 | 48 | # From PyTorch internals 49 | def _ntuple(n): 50 | def parse(x): 51 | if isinstance(x, collections.abc.Iterable): 52 | return x 53 | return tuple(repeat(x, n)) 54 | return parse 55 | 56 | 57 | to_1tuple = _ntuple(1) 58 | to_2tuple = _ntuple(2) 59 | to_3tuple = _ntuple(3) 60 | to_4tuple = _ntuple(4) 61 | to_ntuple = lambda n, x: _ntuple(n)(x) 62 | 63 | # Replaces all linear layers with linear_replacement 64 | # TODO: add int8 support for other linear layers including attn and convnets 65 | def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True): 66 | for name, module in model.named_children(): 67 | if len(list(module.children())) > 0: 68 | replace_linear(module, linear_replacement, include_modules, copy_weights) 69 | 70 | if isinstance(module, torch.nn.Linear) and name in include_modules: 71 | old_module = model._modules[name] 72 | model._modules[name] = linear_replacement( 73 | module.in_features, 74 | module.out_features, 75 | module.bias is not None, 76 | ) 77 | if copy_weights: 78 | model._modules[name].weight.data.copy_(old_module.weight.data) 79 | if model._modules[name].bias is not None: 80 | model._modules[name].bias.data.copy_(old_module.bias) 81 | 82 | return model 83 | 84 | def convert_int8_model_to_inference_mode(model): 85 | for m in model.modules(): 86 | if hasattr(m, 'prepare_for_eval'): 87 | int8_original_dtype = m.weight.dtype 88 | m.prepare_for_eval() 89 | m.int8_original_dtype = int8_original_dtype -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.22.0' 2 | -------------------------------------------------------------------------------- /open_clip_training/src/scripts/1cap_finetune_VitL.sh: -------------------------------------------------------------------------------- 1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \ 2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \ 3 | --train-num-samples 442117 \ 4 | --lr 0.000005 \ 5 | --warmup 100 \ 6 | --force-quick-gelu \ 7 | --dataset-type csv \ 8 | --batch-size 32 \ 9 | --precision amp \ 10 | 
--workers 8 \ 11 | --model ViT-L-14 \ 12 | --lock-text \ 13 | --zeroshot-frequency 1 \ 14 | --save-frequency 1 \ 15 | --epochs 10 \ 16 | --pretrained datacomp_xl_s13b_b90k \ 17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val -------------------------------------------------------------------------------- /open_clip_training/src/scripts/finetune_VitL_with_mask.sh: -------------------------------------------------------------------------------- 1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \ 2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \ 3 | --train-num-samples 442117 \ 4 | --lr 0.000005 \ 5 | --warmup 100 \ 6 | --force-quick-gelu \ 7 | --dataset-type csv \ 8 | --batch-size 32 \ 9 | --precision amp \ 10 | --workers 8 \ 11 | --model ViT-L-14 \ 12 | --lock-text \ 13 | --zeroshot-frequency 1 \ 14 | --save-frequency 1 \ 15 | --epochs 10 \ 16 | --pretrained datacomp_xl_s13b_b90k \ 17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val \ 18 | --with-mask -------------------------------------------------------------------------------- /open_clip_training/src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /open_clip_training/src/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/training/__init__.py -------------------------------------------------------------------------------- /open_clip_training/src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 
27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /open_clip_training/src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /open_clip_training/src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - 
start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /open_clip_training/src/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from open_clip import get_input_dtype, get_tokenizer, build_zero_shot_classifier, \ 7 | IMAGENET_CLASSNAMES, OPENAI_IMAGENET_TEMPLATES 8 | from .precision import get_autocast 9 | from .ade150_zeroshot_data import ade150_classnames 10 | from torchmetrics import Accuracy 11 | 12 | def accuracy(output, target, topk=(1,)): 13 | pred = output.topk(max(topk), 1, True, True)[1].t() 14 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 15 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk], pred[0] 16 | 17 | 18 | def run(model, classifier, dataloader, args): 19 | autocast = get_autocast(args.precision) 20 | input_dtype = get_input_dtype(args.precision) 21 | 22 | with torch.no_grad(): 23 | top1, top5, n = 0., 0., 0. 24 | preds = [] 25 | targets = [] 26 | macc = Accuracy('multiclass', num_classes=150, average='macro').cuda() 27 | for images, target, entire_images in tqdm(dataloader, unit_scale=args.batch_size): 28 | if args.with_mask: 29 | images, masks = images 30 | masks = masks.to(device=args.device, dtype=input_dtype) 31 | else: 32 | images = images 33 | masks=None 34 | images = images.to(device=args.device, dtype=input_dtype) 35 | target = target.to(args.device) 36 | entire_images = entire_images.to(device=args.device, dtype=input_dtype) 37 | 38 | with autocast(): 39 | # predict 40 | output = model(original_image=entire_images, image=images, mask=masks, text=None) 41 | image_features = output['image_features'] if isinstance(output, dict) else output[0] 42 | logits = 100. 
* image_features @ classifier 43 | 44 | # measure accuracy 45 | (acc1, acc5), pred = accuracy(logits, target, topk=(1, 5)) 46 | preds.append(pred) 47 | targets.append(target) 48 | top1 += acc1 49 | top5 += acc5 50 | n += images.size(0) 51 | preds = torch.cat(preds) 52 | targets = torch.cat(targets) 53 | top1 = (top1 / n) 54 | top5 = (top5 / n) 55 | return top1, top5, macc(preds, targets).item() 56 | 57 | 58 | def zero_shot_eval(model, data, epoch, args, tokenizer=None): 59 | if 'imagenet-val' not in data and 'imagenet-v2' not in data and 'ade-val' not in data: 60 | return {} 61 | if args.zeroshot_frequency == 0: 62 | return {} 63 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 64 | return {} 65 | if args.distributed and not args.horovod: 66 | model = model.module 67 | 68 | logging.info('Starting zero-shot evaluation.') 69 | if tokenizer is None: 70 | tokenizer = get_tokenizer(args.model) 71 | 72 | logging.info('Building zero-shot classifier') 73 | autocast = get_autocast(args.precision) 74 | with autocast(): 75 | if 'ade-val' in data: 76 | classifier = build_zero_shot_classifier( 77 | model, 78 | tokenizer=tokenizer, 79 | classnames=ade150_classnames, 80 | templates=OPENAI_IMAGENET_TEMPLATES, 81 | num_classes_per_batch=10, 82 | device=args.device, 83 | use_tqdm=True, 84 | ) 85 | else: 86 | classifier = build_zero_shot_classifier( 87 | model, 88 | tokenizer=tokenizer, 89 | classnames=IMAGENET_CLASSNAMES, 90 | templates=OPENAI_IMAGENET_TEMPLATES, 91 | num_classes_per_batch=10, 92 | device=args.device, 93 | use_tqdm=True, 94 | ) 95 | 96 | logging.info('Using classifier') 97 | results = {} 98 | if 'imagenet-val' in data: 99 | top1, top5, _ = run(model, classifier, data['imagenet-val'].dataloader, args) 100 | results['imagenet-zeroshot-val-top1'] = top1 101 | results['imagenet-zeroshot-val-top5'] = top5 102 | if 'imagenet-v2' in data: 103 | top1, top5, _ = run(model, classifier, data['imagenet-v2'].dataloader, args) 104 | results['imagenetv2-zeroshot-val-top1'] = top1 105 | results['imagenetv2-zeroshot-val-top5'] = top5 106 | if 'ade-val' in data: 107 | top1, top5, macc = run(model, classifier, data['ade-val'].dataloader, args) 108 | results['ade150-zeroshot-val-top1'] = top1 109 | results['ade150-zeroshot-val-top5'] = top5 110 | 111 | logging.info('Finished zero-shot evaluation.') 112 | 113 | return results 114 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_hf_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from open_clip.hf_model import _POOLERS, HFTextEncoder 5 | from transformers import AutoConfig 6 | from transformers.modeling_outputs import BaseModelOutput 7 | # test poolers 8 | def test_poolers(): 9 | bs, sl, d = 2, 10, 5 10 | h = torch.arange(sl).repeat(bs).reshape(bs, sl)[..., None] * torch.linspace(0.2, 1., d) 11 | mask = torch.ones(bs, sl, dtype=torch.bool) 12 | mask[:2, 6:] = False 13 | x = BaseModelOutput(h) 14 | for name, cls in _POOLERS.items(): 15 | pooler = cls() 16 | res = pooler(x, mask) 17 | assert res.shape == (bs, d), f"{name} returned wrong shape" 18 | 19 | # test HFTextEncoder 20 | @pytest.mark.parametrize("model_id", ["arampacha/roberta-tiny", "roberta-base", "xlm-roberta-base", "google/mt5-base"]) 21 | def test_pretrained_text_encoder(model_id): 22 | bs, sl, d = 2, 10, 64 23 | cfg = AutoConfig.from_pretrained(model_id) 24 | model = HFTextEncoder(model_id, d, proj_type='linear') 25 | x = torch.randint(0,
cfg.vocab_size, (bs, sl)) 26 | with torch.no_grad(): 27 | emb = model(x) 28 | 29 | assert emb.shape == (bs, d) 30 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_inference_simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from open_clip.factory import get_tokenizer 4 | import pytest 5 | import open_clip 6 | import os 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 8 | 9 | if hasattr(torch._C, '_jit_set_profiling_executor'): 10 | # legacy executor is too slow to compile large models for unit tests 11 | # no need for the fusion performance here 12 | torch._C._jit_set_profiling_executor(True) 13 | torch._C._jit_set_profiling_mode(False) 14 | 15 | 16 | test_simple_models = [ 17 | # model, pretrained, jit, force_custom_text 18 | ("ViT-B-32", "laion2b_s34b_b79k", False, False), 19 | ("ViT-B-32", "laion2b_s34b_b79k", True, False), 20 | ("ViT-B-32", "laion2b_s34b_b79k", True, True), 21 | ("roberta-ViT-B-32", "laion2b_s12b_b32k", False, False), 22 | ] 23 | 24 | 25 | @pytest.mark.parametrize("model_type,pretrained,jit,force_custom_text", test_simple_models) 26 | def test_inference_simple( 27 | model_type, 28 | pretrained, 29 | jit, 30 | force_custom_text, 31 | ): 32 | model, _, preprocess = open_clip.create_model_and_transforms( 33 | model_type, 34 | pretrained=pretrained, 35 | jit=jit, 36 | force_custom_text=force_custom_text, 37 | ) 38 | tokenizer = get_tokenizer(model_type) 39 | 40 | current_dir = os.path.dirname(os.path.realpath(__file__)) 41 | 42 | image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0) 43 | text = tokenizer(["a diagram", "a dog", "a cat"]) 44 | 45 | with torch.no_grad(): 46 | image_features = model.encode_image(image) 47 | text_features = model.encode_text(text) 48 | 49 | text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1) 50 | 51 | assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0] 52 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_num_shards.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from training.data import get_dataset_size 4 | 5 | @pytest.mark.parametrize( 6 | "shards,expected_size", 7 | [ 8 | ('/path/to/shard.tar', 1), 9 | ('/path/to/shard_{000..000}.tar', 1), 10 | ('/path/to/shard_{000..009}.tar', 10), 11 | ('/path/to/shard_{000..009}_{000..009}.tar', 100), 12 | ('/path/to/shard.tar::/path/to/other_shard_{000..009}.tar', 11), 13 | ('/path/to/shard_{000..009}.tar::/path/to/other_shard_{000..009}.tar', 20), 14 | (['/path/to/shard.tar'], 1), 15 | (['/path/to/shard.tar', '/path/to/other_shard.tar'], 2), 16 | ] 17 | ) 18 | def test_num_shards(shards, expected_size): 19 | _, size = get_dataset_size(shards) 20 | assert size == expected_size, f'Expected {expected_size} for {shards} but found {size} instead.' 
21 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_training_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import pytest 5 | from PIL import Image 6 | import torch 7 | from training.main import main 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 10 | 11 | if hasattr(torch._C, '_jit_set_profiling_executor'): 12 | # legacy executor is too slow to compile large models for unit tests 13 | # no need for the fusion performance here 14 | torch._C._jit_set_profiling_executor(True) 15 | torch._C._jit_set_profiling_mode(False) 16 | 17 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 18 | def test_training(): 19 | main([ 20 | '--save-frequency', '1', 21 | '--zeroshot-frequency', '1', 22 | '--dataset-type', "synthetic", 23 | '--train-num-samples', '16', 24 | '--warmup', '1', 25 | '--batch-size', '4', 26 | '--lr', '1e-3', 27 | '--wd', '0.1', 28 | '--epochs', '1', 29 | '--workers', '2', 30 | '--model', 'RN50' 31 | ]) 32 | 33 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 34 | def test_training_coca(): 35 | main([ 36 | '--save-frequency', '1', 37 | '--zeroshot-frequency', '1', 38 | '--dataset-type', "synthetic", 39 | '--train-num-samples', '16', 40 | '--warmup', '1', 41 | '--batch-size', '4', 42 | '--lr', '1e-3', 43 | '--wd', '0.1', 44 | '--epochs', '1', 45 | '--workers', '2', 46 | '--model', 'coca_ViT-B-32' 47 | ]) 48 | 49 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 50 | def test_training_mt5(): 51 | main([ 52 | '--save-frequency', '1', 53 | '--zeroshot-frequency', '1', 54 | '--dataset-type', "synthetic", 55 | '--train-num-samples', '16', 56 | '--warmup', '1', 57 | '--batch-size', '4', 58 | '--lr', '1e-3', 59 | '--wd', '0.1', 60 | '--epochs', '1', 61 | '--workers', '2', 62 | '--model', 'mt5-base-ViT-B-32', 63 | '--lock-text', 64 | '--lock-text-unlocked-layers', '2' 65 | ]) 66 | 67 | 68 | 69 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 70 | def test_training_unfreezing_vit(): 71 | main([ 72 | '--save-frequency', '1', 73 | '--zeroshot-frequency', '1', 74 | '--dataset-type', "synthetic", 75 | '--train-num-samples', '16', 76 | '--warmup', '1', 77 | '--batch-size', '4', 78 | '--lr', '1e-3', 79 | '--wd', '0.1', 80 | '--epochs', '1', 81 | '--workers', '2', 82 | '--model', 'ViT-B-32', 83 | '--lock-image', 84 | '--lock-image-unlocked-groups', '5', 85 | '--accum-freq', '2' 86 | ]) 87 | 88 | 89 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 90 | def test_training_clip_with_jit(): 91 | main([ 92 | '--save-frequency', '1', 93 | '--zeroshot-frequency', '1', 94 | '--dataset-type', "synthetic", 95 | '--train-num-samples', '16', 96 | '--warmup', '1', 97 | '--batch-size', '4', 98 | '--lr', '1e-3', 99 | '--wd', '0.1', 100 | '--epochs', '1', 101 | '--workers', '2', 102 | '--model', 'ViT-B-32', 103 | '--torchscript' 104 | ]) 105 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | wandb 7 | fire 8 | opencv-python 9 | pandas 10 | braceexpand 11 | torch-ema 12 | torchmetrics==0.11.4 13 | setuptools==59.5.0 14 | webdataset>=0.2.5 15 | numpy==1.23.0 
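The pins in `requirements.txt` above (notably `torchmetrics==0.11.4`, `setuptools==59.5.0`, and `numpy==1.23.0`) are relied on by the training and test code. A small, purely illustrative sanity check — not part of the repository — for confirming that an environment matches these pins before running the tests or fine-tuning scripts:

```python
# Illustrative environment check against the pinned entries in requirements.txt above.
# The dictionary below mirrors those pins; adjust it if the requirements change.
from importlib.metadata import PackageNotFoundError, version

pinned = {
    "torchmetrics": "0.11.4",
    "setuptools": "59.5.0",
    "numpy": "1.23.0",
}

for package, expected in pinned.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"mismatch, expected {expected}"
    print(f"{package}: {installed} ({status})")
```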
-------------------------------------------------------------------------------- /scan/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | # from .config import add_maskformer2_config 7 | from .config import add_ovseg_config 8 | 9 | # dataset loading 10 | # from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 11 | # from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 12 | # from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 13 | # MaskFormerInstanceDatasetMapper, 14 | # ) 15 | # from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 16 | # MaskFormerPanopticDatasetMapper, 17 | # ) 18 | # from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | # MaskFormerSemanticDatasetMapper, 20 | # ) 21 | 22 | # models 23 | # from .maskformer_model import MaskFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | # from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | from .ovseg_model import SCAN, SCANDEMO -------------------------------------------------------------------------------- /scan/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .dataset_mappers import * 5 | from . import datasets 6 | from .build import ( 7 | build_detection_train_loader, 8 | build_detection_test_loader, 9 | ) 10 | -------------------------------------------------------------------------------- /scan/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 5 | -------------------------------------------------------------------------------- /scan/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import register_coco_stuff, register_voc_seg 3 | from . import register_cc3m 4 | from . import register_ade20k_full 5 | from . import register_pascal_context -------------------------------------------------------------------------------- /scan/data/datasets/register_voc_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | 7 | PASCALVOC20_NAMES = ( 8 | "aeroplane", 9 | "bicycle", 10 | "bird", 11 | "boat", 12 | "bottle", 13 | "bus", 14 | "car", 15 | "cat", 16 | "chair", 17 | "cow", 18 | "diningtable", 19 | "dog", 20 | "horse", 21 | "motorbike", 22 | "person", 23 | "pottedplant", 24 | "sheep", 25 | "sofa", 26 | "train", 27 | "tvmonitor", 28 | ) 29 | 30 | def _get_voc_meta(cat_list): 31 | ret = { 32 | "stuff_classes": cat_list, 33 | } 34 | return ret 35 | 36 | 37 | def register_pascalvoc(root): 38 | root = os.path.join(root, "VOCdevkit/VOC2012") 39 | meta = _get_voc_meta(PASCALVOC20_NAMES) 40 | 41 | for name, image_dirname, sem_seg_dirname in [ 42 | ("val", "JPEGImages", "annotations_detectron2/val"), 43 | ]: 44 | image_dir = os.path.join(root, image_dirname) 45 | gt_dir = os.path.join(root, sem_seg_dirname) 46 | all_name = f"pascalvoc20_sem_seg_{name}" 47 | DatasetCatalog.register( 48 | all_name, 49 | lambda x=image_dir, y=gt_dir: load_sem_seg( 50 | y, x, gt_ext="png", image_ext="jpg" 51 | ), 52 | ) 53 | MetadataCatalog.get(all_name).set( 54 | image_root=image_dir, 55 | sem_seg_root=gt_dir, 56 | evaluator_type="sem_seg", 57 | ignore_label=255, 58 | **meta, 59 | ) 60 | 61 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 62 | register_pascalvoc(_root) 63 | -------------------------------------------------------------------------------- /scan/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .generalized_sem_seg_evaluation import GeneralizedSemSegEvaluator, SGIoU_SemSegEvaluator 5 | -------------------------------------------------------------------------------- /scan/frequency.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class LFM(nn.Module): 6 | def __init__(self, num_channels): 7 | super(LFM, self).__init__() 8 | self.conv1 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 9 | self.conv2 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 10 | 11 | def make_gaussian(self, y_idx, x_idx, height, width, sigma=7): 12 | yv, xv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)]) 13 | 14 | yv = yv.unsqueeze(0).float().cuda() 15 | xv = xv.unsqueeze(0).float().cuda() 16 | 17 | 18 | g = torch.exp(- ((yv - y_idx) ** 2 + (xv - x_idx) ** 2) / (2 * sigma ** 2)) 19 | 20 | return g.unsqueeze(0) #1, 1, H, W 21 | 22 | 23 | def forward(self, x, sigma): 24 | b, c, h, w = x.shape 25 | x = x.float() 26 | y = torch.fft.fft2(x) 27 | 28 | 29 | h_idx, w_idx = h // 2, w // 2 30 | high_filter = self.make_gaussian(h_idx, w_idx, h, w, sigma=sigma) 31 | y = y * (1 - high_filter) 32 | 33 | y_imag = y.imag 34 | y_real = y.real 35 | y_f = torch.cat([y_real, y_imag], dim=1) 36 | y = F.relu(self.conv1(y_f)) 37 | 38 | y = self.conv2(y).float() 39 | y_real, y_imag = torch.chunk(y, 2, dim=1) 40 | y = torch.complex(y_real, y_imag) 41 | 42 | y = torch.fft.ifft2(y, s=(h, w)).float() 43 | return x + y 44 | 45 | class MLP(nn.Module): 46 | def __init__(self, input_dim, output_dim): 47 | super(MLP, self).__init__() 48 | self.fc1 = nn.Linear(input_dim, output_dim) 49 | self.fc2 = nn.Linear(output_dim, output_dim) 50 | 51 | def 
forward(self, x): 52 | x = self.fc2(self.fc1(x)) 53 | return x 54 | 55 | 56 | class CA(nn.Module): 57 | def __init__(self, input_dim, num): 58 | super(CA, self).__init__() 59 | self.num = num 60 | self.multiattn = nn.ModuleList() 61 | self.ln = nn.ModuleList() 62 | for i in range(num): 63 | self.multiattn.append(nn.MultiheadAttention(embed_dim=input_dim, num_heads=8, batch_first=True)) 64 | if i != num - 1: 65 | self.ln.append(nn.LayerNorm(input_dim)) 66 | 67 | def forward(self, tgt, memory): 68 | for i in range(self.num): 69 | tgt = tgt + self.multiattn[i](tgt, memory, memory)[0] 70 | if i != self.num - 1: 71 | tgt = self.ln[i](tgt) 72 | return tgt -------------------------------------------------------------------------------- /scan/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import OpenVocaMask2FormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /scan/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | -------------------------------------------------------------------------------- /scan/modeling/clip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .text_template import ( 5 | PredefinedPromptExtractor, 6 | ImageNetPromptExtractor, 7 | VILDPromptExtractor, 8 | ) 9 | from .adapter import ClipAdapter, MaskFormerClipAdapter 10 | 11 | 12 | def build_text_prompt(cfg): 13 | if cfg.TEXT_TEMPLATES == "predefined": 14 | text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES) 15 | elif cfg.TEXT_TEMPLATES == "imagenet": 16 | text_templates = ImageNetPromptExtractor() 17 | elif cfg.TEXT_TEMPLATES == "vild": 18 | text_templates = VILDPromptExtractor() 19 | else: 20 | raise NotImplementedError( 21 | "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES) 22 | ) 23 | return text_templates 24 | -------------------------------------------------------------------------------- /scan/modeling/clip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | from typing import Tuple 5 | import numpy as np 6 | import torch 7 | from detectron2.utils.comm import get_local_rank, synchronize 8 | 9 | 10 | def expand_box( 11 | x1: float, 12 | y1: float, 13 | x2: float, 14 | y2: float, 15 | expand_ratio: float = 1.0, 16 | max_h: int = None, 17 | max_w: int = None, 18 | ): 19 | cx = 0.5 * (x1 + x2) 20 | cy = 0.5 * (y1 + y2) 21 | w = x2 - x1 22 | h = y2 - y1 23 | w = w * expand_ratio 24 | h = h * expand_ratio 25 | box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] 26 | if max_h is not None: 27 | box[1] = max(0, box[1]) 28 | box[3] = min(max_h - 1, box[3]) 29 | if max_w is not None: 30 | box[0] = max(0, box[0]) 31 | box[2] = min(max_w - 1, box[2]) 32 | return [int(b) for b in box] 33 | 34 | 35 | def mask2box(mask: torch.Tensor): 36 | # use naive way 37 | row = torch.nonzero(mask.sum(dim=0))[:, 0] 38 | if len(row) == 0: 39 | return None 40 | x1 = row.min() 41 | x2 = row.max() 42 | col = np.nonzero(mask.sum(dim=1))[:, 0] 43 | y1 = col.min() 44 | y2 = col.max() 45 | return x1, y1, x2 + 1, y2 + 1 46 | 47 | 48 | def crop_with_mask( 49 | image: torch.Tensor, 50 | mask: torch.Tensor, 51 | bbox: torch.Tensor, 52 | fill: Tuple[float, float, float] = (0, 0, 0), 53 | expand_ratio: float = 1.0, 54 | ): 55 | l, t, r, b = expand_box(*bbox, expand_ratio) 56 | _, h, w = image.shape 57 | l = max(l, 0) 58 | t = max(t, 0) 59 | r = min(r, w) 60 | b = min(b, h) 61 | new_image = torch.cat( 62 | [image.new_full((1, b - t, r - l), fill_value=val) for val in fill] 63 | ) 64 | # return image[:, t:b, l:r], mask[None, t:b, l:r] 65 | return image[:, t:b, l:r] * mask[None, t:b, l:r] + (1 - mask[None, t:b, l:r]) * new_image, mask[None, t:b, l:r] -------------------------------------------------------------------------------- /scan/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = 
sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python3 setup.py build install 14 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | 18 | #include <ATen/ATen.h> 19 | #include <ATen/cuda/CUDAContext.h> 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implemented on the CPU"); 32 | } 33 | 34 | std::vector<at::Tensor> 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector<at::Tensor> 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector<at::Tensor> 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, 
S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/open_vocab_mask2former_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 4 | 5 | from torch import nn 6 | from detectron2.config import configurable 7 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder, MLP 8 | 9 | 10 | class OpenVocabMask2FormerPredictor(MultiScaleMaskedTransformerDecoder): 11 | @configurable 12 | def __init__( 13 | self, 14 | in_channels, 15 | mask_classification=True, 16 | *, 17 | embedding_dim: int, 18 | embed_hidden_dim: int, 19 | embed_layers: int, 20 | hidden_dim: int, 21 | num_queries: int, 22 | nheads: int, 23 | # dropout: float, 24 | dim_feedforward: int, 25 | # enc_layers: int, 26 | dec_layers: int, 27 | pre_norm: bool, 28 | # deep_supervision: bool, 29 | mask_dim: int, 30 | enforce_input_project: bool, 31 | ): 32 | super().__init__( 33 | in_channels, 34 | False, 35 | num_classes=embedding_dim, 36 | hidden_dim=hidden_dim, 37 | num_queries=num_queries, 38 | nheads=nheads, 39 | # dropout=dropout, 40 | dim_feedforward=dim_feedforward, 41 | # enc_layers=enc_layers, 42 | dec_layers=dec_layers, 43 | pre_norm=pre_norm, 44 | # deep_supervision=deep_supervision, 45 | mask_dim=mask_dim, 46 | enforce_input_project=enforce_input_project, 47 | ) 48 | mask_classification = True 49 | self.mask_classification = mask_classification 50 | # output FFNs 51 | if self.mask_classification: 52 | self.class_embed = MLP( 53 | hidden_dim, embed_hidden_dim, embedding_dim, embed_layers 54 | ) 55 | 56 | def freeze_pretrained(self): 57 | for name, module in self.named_children(): 58 | if name not in ["class_embed"]: 59 | for param in module.parameters(): 60 | param.requires_grad = False 61 | 62 | @classmethod 63 | def from_config(cls, cfg, in_channels, mask_classification): 64 | ret = {} 65 | ret["in_channels"] = in_channels 66 | ret["mask_classification"] = mask_classification 67 | 68 | ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM 69 | ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM 70 | ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS 71 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 72 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 73 | # Transformer parameters: 74 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 75 | # ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 76 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 77 | # ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 78 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 79 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 80 | # ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 81 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 82 | 83 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 84 | 85 | return ret 86 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /scan/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /scan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .events import setup_wandb, WandbWriter 3 | from .predictor import VisualizationDemo -------------------------------------------------------------------------------- /scan/utils/events.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import os 5 | import wandb 6 | from detectron2.utils import comm 7 | from detectron2.utils.events import EventWriter, get_event_storage 8 | 9 | 10 | def setup_wandb(cfg, args): 11 | if comm.is_main_process(): 12 | init_args = { 13 | k.lower(): v 14 | for k, v in cfg.WANDB.items() 15 | if isinstance(k, str) and k not in ["config", "name"] 16 | } 17 | # only include most related part to avoid too big table 18 | # TODO: add configurable params to select which part of `cfg` should be saved in config 19 | if "config_exclude_keys" in init_args: 20 | init_args["config"] = cfg 21 | init_args["config"]["cfg_file"] = args.config_file 22 | else: 23 | init_args["config"] = { 24 | "model": cfg.MODEL, 25 | "solver": cfg.SOLVER, 26 | "cfg_file": args.config_file, 27 | } 28 | if ("name" not in init_args) or (init_args["name"] is None): 29 | init_args["name"] = os.path.basename(args.config_file) 30 | # wandb.init(**init_args) 31 | 32 | 33 | class BaseRule(object): 34 | def __call__(self, target): 35 | return target 36 | 37 | 38 | class IsIn(BaseRule): 39 | def __init__(self, keyword: str): 40 | self.keyword = keyword 41 | 42 | def __call__(self, target): 43 | return self.keyword in target 44 | 45 | 46 | class Prefix(BaseRule): 47 | def __init__(self, keyword: str): 48 | self.keyword = keyword 49 | 50 | def __call__(self, target): 51 | return "/".join([self.keyword, target]) 52 | 53 | 54 | class WandbWriter(EventWriter): 55 | """ 56 | Write all scalars to a tensorboard file. 57 | """ 58 | 59 | def __init__(self): 60 | """ 61 | Args: 62 | log_dir (str): the directory to save the output events 63 | kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` 64 | """ 65 | self._last_write = -1 66 | self._group_rules = [ 67 | (IsIn("/"), BaseRule()), 68 | (IsIn("loss"), Prefix("train")), 69 | ] 70 | 71 | def write(self): 72 | 73 | storage = get_event_storage() 74 | 75 | def _group_name(scalar_name): 76 | for (rule, op) in self._group_rules: 77 | if rule(scalar_name): 78 | return op(scalar_name) 79 | return scalar_name 80 | 81 | stats = { 82 | _group_name(name): scalars[0] 83 | for name, scalars in storage.latest().items() 84 | if scalars[1] > self._last_write 85 | } 86 | if len(stats) > 0: 87 | self._last_write = max([v[1] for k, v in storage.latest().items()]) 88 | 89 | # storage.put_{image,histogram} is only meant to be used by 90 | # tensorboard writer. So we access its internal fields directly from here. 91 | if len(storage._vis_data) >= 1: 92 | stats["image"] = [ 93 | wandb.Image(img, caption=img_name) 94 | for img_name, img, step_num in storage._vis_data 95 | ] 96 | # Storage stores all image data and rely on this writer to clear them. 97 | # As a result it assumes only one writer will use its image data. 98 | # An alternative design is to let storage store limited recent 99 | # data (e.g. only the most recent image) that all writers can access. 100 | # In that case a writer may not see all image data if its period is long. 
101 | storage.clear_images() 102 | 103 | if len(storage._histograms) >= 1: 104 | 105 | def create_bar(tag, bucket_limits, bucket_counts, **kwargs): 106 | data = [ 107 | [label, val] for (label, val) in zip(bucket_limits, bucket_counts) 108 | ] 109 | table = wandb.Table(data=data, columns=["label", "value"]) 110 | return wandb.plot.bar(table, "label", "value", title=tag) 111 | 112 | stats["hist"] = [create_bar(**params) for params in storage._histograms] 113 | 114 | storage.clear_histograms() 115 | 116 | if len(stats) == 0: 117 | return 118 | # wandb.log(stats, step=storage.iter) 119 | 120 | def close(self): 121 | wandb.finish() 122 | -------------------------------------------------------------------------------- /scan/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | 17 | def _max_by_axis(the_list): 18 | # type: (List[List[int]]) -> List[int] 19 | maxes = the_list[0] 20 | for sublist in the_list[1:]: 21 | for index, item in enumerate(sublist): 22 | maxes[index] = max(maxes[index], item) 23 | return maxes 24 | 25 | 26 | class NestedTensor(object): 27 | def __init__(self, tensors, mask: Optional[Tensor]): 28 | self.tensors = tensors 29 | self.mask = mask 30 | 31 | def to(self, device): 32 | # type: (Device) -> NestedTensor # noqa 33 | cast_tensor = self.tensors.to(device) 34 | mask = self.mask 35 | if mask is not None: 36 | assert mask is not None 37 | cast_mask = mask.to(device) 38 | else: 39 | cast_mask = None 40 | return NestedTensor(cast_tensor, cast_mask) 41 | 42 | def decompose(self): 43 | return self.tensors, self.mask 44 | 45 | def __repr__(self): 46 | return str(self.tensors) 47 | 48 | 49 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 50 | # TODO make this more general 51 | if tensor_list[0].ndim == 3: 52 | if torchvision._is_tracing(): 53 | # nested_tensor_from_tensor_list() does not export well to ONNX 54 | # call _onnx_nested_tensor_from_tensor_list() instead 55 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 56 | 57 | # TODO make it support different-sized images 58 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 59 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 60 | batch_shape = [len(tensor_list)] + max_size 61 | b, c, h, w = batch_shape 62 | dtype = tensor_list[0].dtype 63 | device = tensor_list[0].device 64 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 65 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 66 | for img, pad_img, m in zip(tensor_list, tensor, mask): 67 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 68 | m[: img.shape[1], : img.shape[2]] = False 69 | else: 70 | raise ValueError("not supported") 71 | return NestedTensor(tensor, mask) 72 | 73 | 74 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 75 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
76 | @torch.jit.unused 77 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 78 | max_size = [] 79 | for i in range(tensor_list[0].dim()): 80 | max_size_i = torch.max( 81 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 82 | ).to(torch.int64) 83 | max_size.append(max_size_i) 84 | max_size = tuple(max_size) 85 | 86 | # work around for 87 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 88 | # m[: img.shape[1], :img.shape[2]] = False 89 | # which is not yet supported in onnx 90 | padded_imgs = [] 91 | padded_masks = [] 92 | for img in tensor_list: 93 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 94 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 95 | padded_imgs.append(padded_img) 96 | 97 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 98 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 99 | padded_masks.append(padded_mask.to(torch.bool)) 100 | 101 | tensor = torch.stack(padded_imgs) 102 | mask = torch.stack(padded_masks) 103 | 104 | return NestedTensor(tensor, mask=mask) 105 | 106 | 107 | def is_dist_avail_and_initialized(): 108 | if not dist.is_available(): 109 | return False 110 | if not dist.is_initialized(): 111 | return False 112 | return True 113 | 114 | def get_gt_binary_masks(gt_semseg): 115 | mask_ids = torch.unique(gt_semseg) 116 | gt_masks = [] 117 | for id in mask_ids: 118 | if id != 255: 119 | gt_masks.append(gt_semseg == id) 120 | gt_masks = torch.stack(gt_masks).float() 121 | return gt_masks -------------------------------------------------------------------------------- /scan/utils/post_process_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import torch 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | try: 9 | import pydensecrf.densecrf as dcrf 10 | from pydensecrf.utils import ( 11 | unary_from_softmax, 12 | unary_from_labels, 13 | create_pairwise_bilateral, 14 | create_pairwise_gaussian, 15 | ) 16 | except: 17 | dcrf = None 18 | 19 | 20 | def dense_crf_post_process( 21 | logits, 22 | image, 23 | n_labels=None, 24 | max_iters=5, 25 | pos_xy_std=(3, 3), 26 | pos_w=3, 27 | bi_xy_std=(80, 80), 28 | bi_rgb_std=(13, 13, 13), 29 | bi_w=10, 30 | ): 31 | """ 32 | logits : [C,H,W] 33 | image : [3,H,W] 34 | """ 35 | if dcrf is None: 36 | raise FileNotFoundError( 37 | "pydensecrf is required to perform dense crf inference." 38 | ) 39 | if isinstance(logits, torch.Tensor): 40 | logits = F.softmax(logits, dim=0).detach().cpu().numpy() 41 | U = unary_from_softmax(logits) 42 | n_labels = logits.shape[0] 43 | elif logits.ndim == 3: 44 | U = unary_from_softmax(logits) 45 | n_labels = logits.shape[0] 46 | else: 47 | assert n_labels is not None 48 | U = unary_from_labels(logits, n_labels, zero_unsure=False) 49 | 50 | d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels) 51 | 52 | d.setUnaryEnergy(U) 53 | 54 | # This adds the color-independent term, features are the locations only. 55 | d.addPairwiseGaussian( 56 | sxy=pos_xy_std, 57 | compat=pos_w, 58 | kernel=dcrf.DIAG_KERNEL, 59 | normalization=dcrf.NORMALIZE_SYMMETRIC, 60 | ) 61 | 62 | # This adds the color-dependent term, i.e. features are (x,y,r,g,b). 
63 | d.addPairwiseBilateral( 64 | sxy=bi_xy_std, 65 | srgb=bi_rgb_std, 66 | rgbim=image, 67 | compat=bi_w, 68 | kernel=dcrf.DIAG_KERNEL, 69 | normalization=dcrf.NORMALIZE_SYMMETRIC, 70 | ) 71 | # Run five inference steps. 72 | logits = d.inference(max_iters) 73 | logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1])) 74 | return torch.from_numpy(logits) 75 | -------------------------------------------------------------------------------- /tools/convert-pretrained-clip-model-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | 23 | def transform(path): 24 | model = torch.load(path, map_location="cpu") 25 | print(f"loading {path}......") 26 | state_dict = model["model"] 27 | state_dict = { 28 | k.replace("visual_model.", ""): v 29 | for k, v in state_dict.items() 30 | if k.startswith("visual_model") 31 | } 32 | source_keys = [k for k in state_dict.keys() if "relative_coords" in k] 33 | for k in source_keys: 34 | state_dict[ 35 | k.replace("relative_coords", "relative_position_index") 36 | ] = state_dict[k] 37 | del state_dict[k] 38 | 39 | source_keys = [k for k in state_dict.keys() if "atten_mask_matrix" in k] 40 | for k in source_keys: 41 | state_dict[k.replace("atten_mask_matrix", "attn_mask")] = state_dict[k] 42 | del state_dict[k] 43 | 44 | source_keys = [k for k in state_dict.keys() if "rel_pos_embed_table" in k] 45 | for k in source_keys: 46 | state_dict[ 47 | k.replace("rel_pos_embed_table", "relative_position_bias_table") 48 | ] = state_dict[k] 49 | del state_dict[k] 50 | 51 | source_keys = [k for k in state_dict.keys() if "channel_reduction" in k] 52 | for k in source_keys: 53 | state_dict[k.replace("channel_reduction", "reduction")] = state_dict[k] 54 | del state_dict[k] 55 | return { 56 | k if k.startswith("backbone.") else "backbone." + k: v 57 | for k, v in state_dict.items() 58 | } 59 | 60 | 61 | if __name__ == "__main__": 62 | input = sys.argv[1] 63 | res = { 64 | "model": transform(input), 65 | "__author__": "third_party", 66 | "matching_heuristics": True, 67 | } 68 | with open(sys.argv[2], "wb") as f: 69 | pkl.dump(res, f) 70 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | These models typically produce slightly worse results than the 26 | pre-trained ResNets we use in official configs, which are the 27 | original ResNet models released by MSRA. 28 | """ 29 | 30 | if __name__ == "__main__": 31 | input = sys.argv[1] 32 | 33 | obj = torch.load(input, map_location="cpu") 34 | 35 | newmodel = {} 36 | for k in list(obj.keys()): 37 | old_k = k 38 | if "layer" not in k: 39 | k = "stem." + k 40 | for t in [1, 2, 3, 4]: 41 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 42 | for t in [1, 2, 3]: 43 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 44 | k = k.replace("downsample.0", "shortcut") 45 | k = k.replace("downsample.1", "shortcut.norm") 46 | print(old_k, "->", k) 47 | newmodel[k] = obj.pop(old_k).detach().numpy() 48 | 49 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 50 | 51 | with open(sys.argv[2], "wb") as f: 52 | pkl.dump(res, f) 53 | if obj: 54 | print("Unconverted keys:", obj.keys()) 55 | -------------------------------------------------------------------------------- /tools/replace_clip.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import torch 5 | from collections import OrderedDict 6 | 7 | 8 | # Path to the finetuned CLIP model 9 | clip_ckpt = torch.load('CS_CLIP.pt') 10 | 11 | new_model = OrderedDict() 12 | state_dict = clip_ckpt['state_dict'] 13 | 14 | for k, v in state_dict.items(): 15 | if 'clip_model' in k: 16 | new_key = k.replace('module.clip_model.','') 17 | new_model[new_key] = v 18 | 19 | # Path to the trained MaskFormer model 20 | ovseg_model = torch.load('Seg_model.pth', map_location='cpu') 21 | 22 | for k, v in new_model.items(): 23 | new_k = 'clip_adapter.clip_model.' + k 24 | if new_k in ovseg_model['model'].keys(): 25 | ovseg_model['model'][new_k] = v 26 | else: 27 | print(f'{new_k} does not exist in ckpt') 28 | try: 29 | ovseg_model['model']['clip_adapter.clip_model.visual.mask_embedding'] = new_model['visual.mask_embedding'] 30 | print('clip_ckpt has mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD True during evaluation') 31 | except KeyError: 32 | print('clip_ckpt does not have mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False during evaluation') 33 | 34 | torch.save(ovseg_model, 'SCAN.pth') 35 | --------------------------------------------------------------------------------
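A minimal sanity check for the checkpoint produced by tools/replace_clip.py is sketched below. It is not part of the repository; it simply reloads the merged SCAN.pth written above (the file names mirror the paths hard-coded in the script) and confirms that the transplanted CLIP weights, and optionally the mask_embedding, are present.

import torch

merged = torch.load('SCAN.pth', map_location='cpu')
# Keys written by replace_clip.py all live under the clip_adapter.clip_model prefix.
clip_keys = [k for k in merged['model'] if k.startswith('clip_adapter.clip_model.')]
print(f'{len(clip_keys)} CLIP tensors found in the merged checkpoint')
print('mask_embedding present:',
      'clip_adapter.clip_model.visual.mask_embedding' in merged['model'])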