├── .gitignore
├── INSTALL.md
├── LICENSE
├── README.md
├── configs
├── scan_vitB.yaml
├── scan_vitL.yaml
└── scan_vitL_demo.yaml
├── datasets
├── DATASETS.md
├── prepare_ade20k_full_sem_seg.py
├── prepare_ade20k_sem_seg.py
├── prepare_coco_stuff_sem_seg.py
├── prepare_pascal_context.py
└── prepare_voc_sem_seg.py
├── demo.py
├── imgs
├── cs.png
├── pipeline.png
├── results.png
└── visual.png
├── open_clip_training
├── .github
│ └── workflows
│ │ ├── ci.yml
│ │ ├── clear-cache.yml
│ │ └── python-publish.yml
├── .gitignore
├── CITATION.cff
├── HISTORY.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
│ ├── Interacting_with_open_clip.ipynb
│ ├── Interacting_with_open_coca.ipynb
│ ├── LOW_ACC.md
│ ├── PRETRAINED.md
│ ├── clip_conceptual_captions.md
│ ├── clipa.md
│ ├── datacomp_models.md
│ ├── openclip_results.csv
│ └── script_examples
│ │ ├── clipa
│ │ ├── vit_b16
│ │ │ ├── i50_t16_finetune.sh
│ │ │ └── i50_t16_pretrain.sh
│ │ └── vit_l16
│ │ │ ├── i17_t16_finetune.sh
│ │ │ ├── i17_t16_pretrain.sh
│ │ │ ├── i37_t8_finetune.sh
│ │ │ └── i37_t8_pretrain.sh
│ │ ├── clipav2
│ │ └── vit_h14
│ │ │ ├── i257_t32_finetunex4.sh
│ │ │ ├── i50_t8_pretrain.sh
│ │ │ └── i577_t32_finetunex1.sh
│ │ └── stability_example.sh
├── pytest.ini
├── scripts
│ ├── clipav1_vit_l16_i37_t8.sh
│ ├── clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh
│ ├── h14_224_32_finetune.sh
│ └── h14_84_8_pretrain.sh
├── setup.py
├── src
│ ├── clip_adapter
│ │ └── clip_adapter.py
│ ├── open_clip
│ │ ├── __init__.py
│ │ ├── big_vision.py
│ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ ├── coca_model.py
│ │ ├── constants.py
│ │ ├── factory.py
│ │ ├── generation_utils.py
│ │ ├── hf_configs.py
│ │ ├── hf_model.py
│ │ ├── loss.py
│ │ ├── model.py
│ │ ├── model_configs
│ │ │ ├── EVA01-g-14-plus.json
│ │ │ ├── EVA01-g-14.json
│ │ │ ├── EVA02-B-16.json
│ │ │ ├── EVA02-E-14-plus.json
│ │ │ ├── EVA02-E-14.json
│ │ │ ├── EVA02-L-14-336.json
│ │ │ ├── EVA02-L-14.json
│ │ │ ├── RN101-quickgelu.json
│ │ │ ├── RN101.json
│ │ │ ├── RN50-quickgelu.json
│ │ │ ├── RN50.json
│ │ │ ├── RN50x16.json
│ │ │ ├── RN50x4.json
│ │ │ ├── RN50x64.json
│ │ │ ├── ViT-B-16-SigLIP-256.json
│ │ │ ├── ViT-B-16-SigLIP-384.json
│ │ │ ├── ViT-B-16-SigLIP-512.json
│ │ │ ├── ViT-B-16-SigLIP-i18n-256.json
│ │ │ ├── ViT-B-16-SigLIP.json
│ │ │ ├── ViT-B-16-plus-240.json
│ │ │ ├── ViT-B-16-plus.json
│ │ │ ├── ViT-B-16-quickgelu.json
│ │ │ ├── ViT-B-16.json
│ │ │ ├── ViT-B-32-256.json
│ │ │ ├── ViT-B-32-plus-256.json
│ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ ├── ViT-B-32.json
│ │ │ ├── ViT-H-14-378-quickgelu.json
│ │ │ ├── ViT-H-14-CLIPA-336.json
│ │ │ ├── ViT-H-14-CLIPA.json
│ │ │ ├── ViT-H-14-quickgelu.json
│ │ │ ├── ViT-H-14.json
│ │ │ ├── ViT-H-16.json
│ │ │ ├── ViT-L-14-280.json
│ │ │ ├── ViT-L-14-336.json
│ │ │ ├── ViT-L-14-CLIPA-336.json
│ │ │ ├── ViT-L-14-CLIPA.json
│ │ │ ├── ViT-L-14-quickgelu.json
│ │ │ ├── ViT-L-14.json
│ │ │ ├── ViT-L-16-320.json
│ │ │ ├── ViT-L-16-SigLIP-256.json
│ │ │ ├── ViT-L-16-SigLIP-384.json
│ │ │ ├── ViT-L-16.json
│ │ │ ├── ViT-M-16-alt.json
│ │ │ ├── ViT-M-16.json
│ │ │ ├── ViT-M-32-alt.json
│ │ │ ├── ViT-M-32.json
│ │ │ ├── ViT-S-16-alt.json
│ │ │ ├── ViT-S-16.json
│ │ │ ├── ViT-S-32-alt.json
│ │ │ ├── ViT-S-32.json
│ │ │ ├── ViT-SO400M-14-SigLIP-384.json
│ │ │ ├── ViT-SO400M-14-SigLIP.json
│ │ │ ├── ViT-bigG-14-CLIPA-336.json
│ │ │ ├── ViT-bigG-14-CLIPA.json
│ │ │ ├── ViT-bigG-14.json
│ │ │ ├── ViT-e-14.json
│ │ │ ├── ViT-g-14.json
│ │ │ ├── coca_ViT-B-32.json
│ │ │ ├── coca_ViT-L-14.json
│ │ │ ├── coca_base.json
│ │ │ ├── coca_roberta-ViT-B-32.json
│ │ │ ├── convnext_base.json
│ │ │ ├── convnext_base_w.json
│ │ │ ├── convnext_base_w_320.json
│ │ │ ├── convnext_large.json
│ │ │ ├── convnext_large_d.json
│ │ │ ├── convnext_large_d_320.json
│ │ │ ├── convnext_small.json
│ │ │ ├── convnext_tiny.json
│ │ │ ├── convnext_xlarge.json
│ │ │ ├── convnext_xxlarge.json
│ │ │ ├── convnext_xxlarge_320.json
│ │ │ ├── mt5-base-ViT-B-32.json
│ │ │ ├── mt5-xl-ViT-H-14.json
│ │ │ ├── nllb-clip-base-siglip.json
│ │ │ ├── nllb-clip-base.json
│ │ │ ├── nllb-clip-large-siglip.json
│ │ │ ├── nllb-clip-large.json
│ │ │ ├── roberta-ViT-B-32.json
│ │ │ ├── swin_base_patch4_window7_224.json
│ │ │ ├── vit_medium_patch16_gap_256.json
│ │ │ ├── vit_relpos_medium_patch16_cls_224.json
│ │ │ ├── xlm-roberta-base-ViT-B-32.json
│ │ │ └── xlm-roberta-large-ViT-H-14.json
│ │ ├── modified_resnet.py
│ │ ├── openai.py
│ │ ├── pos_embed.py
│ │ ├── pretrained.py
│ │ ├── push_to_hf_hub.py
│ │ ├── timm_model.py
│ │ ├── tokenizer.py
│ │ ├── transform.py
│ │ ├── transformer.py
│ │ ├── utils.py
│ │ ├── version.py
│ │ ├── zero_shot_classifier.py
│ │ └── zero_shot_metadata.py
│ ├── scripts
│ │ ├── 1cap_finetune_VitL.sh
│ │ └── finetune_VitL_with_mask.sh
│ └── training
│ │ ├── .gitignore
│ │ ├── __init__.py
│ │ ├── ade150_zeroshot_data.py
│ │ ├── data.py
│ │ ├── distributed.py
│ │ ├── file_utils.py
│ │ ├── main.py
│ │ ├── params.py
│ │ ├── precision.py
│ │ ├── profiler.py
│ │ ├── scheduler.py
│ │ ├── train.py
│ │ └── zero_shot.py
├── tests
│ ├── test_download_pretrained.py
│ ├── test_hf_model.py
│ ├── test_inference.py
│ ├── test_inference_simple.py
│ ├── test_num_shards.py
│ ├── test_training_simple.py
│ ├── test_wds.py
│ └── util_test.py
└── tutorials
│ └── int8_tutorial.ipynb
├── requirements.txt
├── scan
├── __init__.py
├── config.py
├── data
│ ├── __init__.py
│ ├── augmentations.py
│ ├── build.py
│ ├── dataset_mappers
│ │ ├── __init__.py
│ │ └── mask_former_semantic_dataset_mapper.py
│ └── datasets
│ │ ├── __init__.py
│ │ ├── csv_data.py
│ │ ├── register_ade20k_full.py
│ │ ├── register_cc3m.py
│ │ ├── register_coco_stuff.py
│ │ ├── register_pascal_context.py
│ │ └── register_voc_seg.py
├── evaluation
│ ├── __init__.py
│ └── generalized_sem_seg_evaluation.py
├── frequency.py
├── maskformer_model.py
├── modeling
│ ├── __init__.py
│ ├── backbone
│ │ ├── __init__.py
│ │ ├── clip_resnet.py
│ │ └── swin.py
│ ├── clip_adapter
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── text_template.py
│ │ └── utils.py
│ ├── criterion.py
│ ├── matcher.py
│ ├── meta_arch
│ │ ├── __init__.py
│ │ ├── mask_former_head.py
│ │ └── per_pixel_baseline.py
│ ├── pixel_decoder
│ │ ├── __init__.py
│ │ ├── fpn.py
│ │ ├── msdeformattn.py
│ │ └── ops
│ │ │ ├── functions
│ │ │ ├── __init__.py
│ │ │ └── ms_deform_attn_func.py
│ │ │ ├── make.sh
│ │ │ ├── modules
│ │ │ ├── __init__.py
│ │ │ └── ms_deform_attn.py
│ │ │ ├── setup.py
│ │ │ ├── src
│ │ │ ├── cpu
│ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ ├── cuda
│ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ ├── ms_deform_attn.h
│ │ │ └── vision.cpp
│ │ │ └── test.py
│ └── transformer_decoder
│ │ ├── __init__.py
│ │ ├── mask2former_transformer_decoder.py
│ │ ├── maskformer_transformer_decoder.py
│ │ ├── open_vocab_mask2former_predictor.py
│ │ ├── position_encoding.py
│ │ └── transformer.py
├── ovseg_model.py
├── test_time_augmentation.py
└── utils
│ ├── __init__.py
│ ├── events.py
│ ├── misc.py
│ ├── post_process_utils.py
│ └── predictor.py
├── tools
├── convert-pretrained-clip-model-to-d2.py
├── convert-pretrained-swin-model-to-d2.py
├── convert-torchvision-to-d2.py
└── replace_clip.py
└── train_net.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # output dir
2 | output
3 | outputs
4 | instant_test_output
5 | inference_test_output
6 |
7 |
8 |
9 | *.diff
10 | *.jpg
11 | !/projects/DensePose/doc/images/*.jpg
12 |
13 | # compilation and distribution
14 | __pycache__
15 | _ext
16 | *.pyc
17 | *.pyd
18 | *.so
19 | *.dll
20 | *.egg-info/
21 | build/
22 | dist/
23 | wheels/
24 |
25 | # pytorch/python/numpy formats
26 | *.pth
27 | *.pkl
28 | *.npy
29 | *.ts
30 | model_ts*.txt
31 |
32 | # ipython/jupyter notebooks
33 | **/.ipynb_checkpoints/
34 |
35 | # Editor temporaries
36 | *.swn
37 | *.swo
38 | *.swp
39 | *~
40 |
41 | # editor settings
42 | .idea
43 | .vscode
44 | _darcs
45 |
46 | # project dirs
47 | /detectron2/model_zoo/configs
48 | /datasets/*
49 | !/datasets/*.*
50 | /projects/*/datasets
51 | /models
52 | /snippet
53 |
54 | # vs code
55 | .history
56 |
57 | amlt
58 | thirdparty
59 | wandb
60 | weights
61 |
62 |
63 | *.zip
64 | *.tar
65 | /output
66 | *.pth
67 | *.pt
68 |
69 | *.png
70 | !imgs/*.png
71 | *.txt
72 | !requirements.txt
73 |
74 | results/
75 |
76 | openclip_data/
77 | logs/
78 |
79 | data
80 | !scan/data
81 |
82 | *log*
83 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 |
3 | ### Requirements
4 | - Linux with Python ≥ 3.8
5 | - PyTorch ≥ 1.10 and torchvision that matches the PyTorch installation.
 6 |   Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check
 7 |   that the PyTorch version matches the one required by Detectron2.
8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
9 |
10 | ### Usage
11 |
12 | Install required packages.
13 |
14 | ```bash
15 | conda create -n scan python=3.8
16 | conda activate scan
17 | conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge -y
18 | pip install -r requirements.txt
19 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
20 | ```
21 |
22 |
23 |
24 | Compile the deformable attention ops used by the pixel decoder.
25 |
26 | ```bash
27 | cd scan/modeling/pixel_decoder/ops
28 | sh make.sh
29 | ```
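
Optionally, verify the environment afterwards. This is a minimal sketch; the extension module name `MultiScaleDeformableAttention` is assumed from the Mask2Former-style ops bundled under `scan/modeling/pixel_decoder/ops`.

```python
# Minimal environment check (a sketch; the extension name is an assumption
# based on the bundled Mask2Former-style ops).
import torch
import detectron2

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("detectron2:", detectron2.__version__)

try:
    import MultiScaleDeformableAttention  # compiled by make.sh
    print("MSDeformAttn ops: OK")
except ImportError as err:
    print("MSDeformAttn ops not found, re-run make.sh:", err)
```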
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Open-Vocabulary Segmentation with Semantic-Assisted Calibration [CVPR 2024]
2 | Yong Liu*, Sule Bai*, Guanbin Li, Yitong Wang, Yansong Tang
3 | (*equal contribution)
4 |
5 | The repository contains the official implementation of "Open-Vocabulary Segmentation with Semantic-Assisted Calibration"
6 |
7 | [Paper](https://arxiv.org/abs/2312.04089)
8 |
9 |
10 |
11 |
12 |
13 | ---
14 | ## 📖 Pipeline & Results
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
 26 | ### Table of Contents
27 | - [Installation](#1)
28 | - [Data Preparation](#2)
29 | - [Usage](#3)
30 | - [Training](#5)
31 | - [Evaluation](#4)
32 | - [Cite](#6)
33 |
34 |
35 |
36 |
 37 | If you find any bugs caused by carelessness on our part in organizing the code, feel free to contact us and point them out!
38 |
39 | ### Installation
40 | Please see [installation guide](./INSTALL.md).
41 |
42 |
43 |
44 |
45 | ### Data Preparation
 46 | Please follow the instructions of [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the training and test data. The data should be organized as follows:
47 | ```
48 | $DETECTRON2_DATASETS/
49 | coco/ # COCOStuff-171
50 | ADEChallengeData2016/ # ADE20K-150
51 | ADE20K_2021_17_01/ # ADE20K-847
52 | VOCdevkit/
53 | VOC2012/ # PASCALVOC-20
54 | VOC2010/ # PASCALContext-59, PASCALContext-459
55 | ```
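
As a quick check before running the preparation scripts, the sketch below verifies that the expected folders exist (it assumes the layout above; `DETECTRON2_DATASETS` falls back to `datasets/`, matching the scripts in `datasets/`):

```python
import os
from pathlib import Path

# Root of the dataset layout shown above.
root = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
expected = [
    "coco",                  # COCOStuff-171
    "ADEChallengeData2016",  # ADE20K-150
    "ADE20K_2021_17_01",     # ADE20K-847
    "VOCdevkit/VOC2012",     # PASCALVOC-20
    "VOCdevkit/VOC2010",     # PASCALContext-59, PASCALContext-459
]
for rel in expected:
    status = "ok" if (root / rel).is_dir() else "MISSING"
    print(f"{status:7s} {root / rel}")
```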
56 |
57 |
58 |
59 |
60 | ### Usage
61 |
62 | - #### Pretrained Weight
63 | We have provided the pretrained SCAN-VitL weights and the finetuned Contextual-shifted CLIP weights. Please download them from [here](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link).
64 |
65 |
66 |
67 | #### Evaluation
68 |
69 |
70 | ```
 71 | python train_net.py --eval-only --config-file [CONFIG_FILE] --num-gpus [NUM_GPUS] OUTPUT_DIR [OUTPUT_DIR] MODEL.WEIGHTS [MODEL_WEIGHTS]
72 | ```
73 | - Here is an example:
74 | ```
 75 | python train_net.py --num-gpus 8 --eval-only --config-file configs/scan_vitL.yaml MODEL.WEIGHTS ./SCAN.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\) MODEL.CLIP_ADAPTER.REPLACE_RATIO 0.05 MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.75 MODEL.CLIP_ADAPTER.MASK_THR 0.55
76 | ```
77 |
78 |
79 | #### Training
80 | 1. Train the segmentation model:
81 | ```
 82 | python train_net.py --config-file [CONFIG_FILE] --num-gpus [NUM_GPUS]
83 | ```
84 |
85 | - Here is an example:
86 |
87 | ```
 88 | python train_net.py --num-gpus 8 --config-file configs/scan_vitL.yaml
89 | ```
90 |
 91 | 2. Fuse the segmentation model with the finetuned CLIP.
 92 | 
 93 | We have provided the [finetuned CLIP weights](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link). You can directly fuse these finetuned CLIP weights with the segmentation model to get the final model. The fusion command is:
94 | ```
95 | cd tools
96 | python replace_clip.py
97 | ```
 98 | You need to set "clip_ckpt" and "ovseg_model" in that script to your finetuned CLIP checkpoint path and your segmentation model path, respectively.
99 |
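Conceptually, the fusion loads both checkpoints and overwrites the CLIP branch of the segmentation model with the finetuned CLIP weights. The sketch below only illustrates that idea; the key prefix `clip_adapter.clip_model.` and the checkpoint layout are assumptions, so refer to `tools/replace_clip.py` for the actual variable names and key mapping.

```python
import torch

# Paths to set, mirroring the "clip_ckpt" and "ovseg_model" variables in replace_clip.py.
clip_ckpt = "/path/to/finetuned_clip.pt"
ovseg_model = "/path/to/segmentation_model.pth"
output_path = "/path/to/SCAN_fused.pth"

seg = torch.load(ovseg_model, map_location="cpu")
seg_sd = seg["model"] if "model" in seg else seg   # Detectron2 checkpoints usually wrap weights in "model"
clip_sd = torch.load(clip_ckpt, map_location="cpu")
clip_sd = clip_sd.get("state_dict", clip_sd)       # unwrap a common training wrapper, if present

# Hypothetical prefix: copy each finetuned CLIP tensor into the matching
# CLIP-adapter key of the segmentation checkpoint.
prefix = "clip_adapter.clip_model."
replaced = 0
for k, v in clip_sd.items():
    target = prefix + k
    if target in seg_sd and seg_sd[target].shape == v.shape:
        seg_sd[target] = v
        replaced += 1
print(f"replaced {replaced} tensors")

torch.save(seg, output_path)
```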
100 |
101 | (Optional) If you want to finetune the CLIP model from scratch, please follow [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the corresponding data. The finetuning command is:
102 |
103 | ```
104 | cd open_clip_training
105 | cd src
106 | bash scripts/finetune_VitL_with_mask.sh
107 | ```
108 |
109 |
110 |
111 |
112 | ### Cite
113 |
114 | If you find our work helpful, we'd appreciate it if you could cite our paper in your work.
115 | ```
116 | @article{liu2023open,
117 | title={Open-Vocabulary Segmentation with Semantic-Assisted Calibration},
118 | author={Liu, Yong and Bai, Sule and Li, Guanbin and Wang, Yitong and Tang, Yansong},
119 | journal={arXiv preprint arXiv:2312.04089},
120 | year={2023}
121 | }
122 | ```
123 |
--------------------------------------------------------------------------------
/configs/scan_vitB.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "SCAN"
3 | BACKBONE:
4 | FREEZE_AT: 0
5 | NAME: "D2SwinTransformer"
6 | SWIN:
7 | EMBED_DIM: 128
8 | DEPTHS: [2, 2, 18, 2]
9 | NUM_HEADS: [4, 8, 16, 32]
10 | WINDOW_SIZE: 12
11 | APE: False
12 | DROP_PATH_RATE: 0.3
13 | PATCH_NORM: True
14 | PRETRAIN_IMG_SIZE: 384
15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16 | PIXEL_MEAN: [123.675, 116.280, 103.530]
17 | PIXEL_STD: [58.395, 57.120, 57.375]
18 | SELECT_ORI_CLIP_ID: [6, 9, 12]
19 | FREQUENCY_SIGMA: [9, 7, 3]
20 | CLIP_VISION_DIM: 768
21 | SCAN_DIM: 512
22 | PATCH_SIZE: 14
23 | SEM_SEG_HEAD:
24 | NAME: "OpenVocaMask2FormerHead"
25 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
26 | IGNORE_VALUE: 255
27 | NUM_CLASSES: 171 # number of categories in training set
28 | EMBEDDING_DIM: 512
29 | EMBED_LAYERS: 2
30 | COMMON_STRIDE: 4 # not used, hard-coded
31 | LOSS_WEIGHT: 1.0
32 | CONVS_DIM: 256
33 | MASK_DIM: 256
34 | NORM: "GN"
35 | # pixel decoder
36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
37 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
39 | COMMON_STRIDE: 4
40 | TRANSFORMER_ENC_LAYERS: 6
41 | MASK_FORMER:
42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
44 | DEEP_SUPERVISION: True
45 | NO_OBJECT_WEIGHT: 0.1
46 | CLASS_WEIGHT: 2.0
47 | MASK_WEIGHT: 5.0
48 | DICE_WEIGHT: 5.0
49 | HIDDEN_DIM: 256
50 | NUM_OBJECT_QUERIES: 100
51 | NHEADS: 8
52 | DROPOUT: 0.0
53 | DIM_FEEDFORWARD: 2048
54 | ENC_LAYERS: 0
55 | PRE_NORM: False
56 | ENFORCE_INPUT_PROJ: False
57 | SIZE_DIVISIBILITY: 32
58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
59 | TRAIN_NUM_POINTS: 12544
60 | OVERSAMPLE_RATIO: 3.0
61 | IMPORTANCE_SAMPLE_RATIO: 0.75
62 | CLIP_ADAPTER:
63 | TEXT_TEMPLATES: "vild"
64 | CLIP_MODEL_NAME: "ViT-B-16"
65 | MASK_FILL: "mean"
66 | MASK_EXPAND_RATIO: 1.0
67 | MASK_MATTING: False # use soft background, default not used
68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224
69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches
70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings:
71 | # a847: [0.25, 0.75] a150: [0.4, 0.7] pc459: [0.25, 0.7] pc59: [0.25, 0.35] voc20: [0.2, 0.45]
72 | MASK_THR: 0.4
73 | CLIP_ENSEMBLE_WEIGHT: 0.7
74 | # For the REPLACE_RATIO, we have the following settings:
75 | # a847: 0.05 a150: 0.05 pc459: 0.05 pc59: 0.05 voc20: 0.1
76 | REPLACE_RATIO: 0.15
77 | REPLACE_LAYER: [1, 3, 5]
78 | DATASETS:
79 | TRAIN: ("coco_2017_train_stuff_sem_seg",)
80 | TEST: ("ade20k_sem_seg_val",)
81 | SOLVER:
82 | IMS_PER_BATCH: 32
83 | BASE_LR: 0.00006
84 | MAX_ITER: 120000
85 | WARMUP_FACTOR: 1e-6
86 | WARMUP_ITERS: 1500
87 | LR_SCHEDULER_NAME: "WarmupPolyLR"
88 | WEIGHT_DECAY: 0.01
89 | WEIGHT_DECAY_NORM: 0.0
90 | WEIGHT_DECAY_EMBED: 0.0
91 | BACKBONE_MULTIPLIER: 1.0
92 | TEST_IMS_PER_BATCH: 1
93 | CLIP_GRADIENTS:
94 | ENABLED: True
95 | CLIP_TYPE: "full_model"
96 | CLIP_VALUE: 0.01
97 | NORM_TYPE: 2.0
98 | INPUT:
99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
100 | MIN_SIZE_TRAIN_SAMPLING: "choice"
101 | MIN_SIZE_TEST: 640
102 | MAX_SIZE_TRAIN: 2560
103 | MAX_SIZE_TEST: 2560
104 | CROP:
105 | ENABLED: True
106 | TYPE: "absolute"
107 | SIZE: (640, 640)
108 | SINGLE_CATEGORY_MAX_AREA: 1.0
109 | COLOR_AUG_SSD: True
110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
111 | FORMAT: "RGB"
112 | TEST:
113 | EVAL_PERIOD: 5000
114 | # SEMANTIC_ON: True
115 | # INSTANCE_ON: False
116 | # PANOPTIC_ON: False
117 | AUG:
118 | ENABLED: False
119 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
120 | MAX_SIZE: 3584
121 | FLIP: True
122 | DATALOADER:
123 | FILTER_EMPTY_ANNOTATIONS: True
124 | NUM_WORKERS: 16
125 | VERSION: 2
126 | METRIC: 'Vanilla' # Vanilla or SG-IoU
127 | OUTPUT_DIR: output/SCAN-VitB
--------------------------------------------------------------------------------
/configs/scan_vitL.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "SCAN"
3 | BACKBONE:
4 | FREEZE_AT: 0
5 | NAME: "D2SwinTransformer"
6 | SWIN:
7 | EMBED_DIM: 128
8 | DEPTHS: [2, 2, 18, 2]
9 | NUM_HEADS: [4, 8, 16, 32]
10 | WINDOW_SIZE: 12
11 | APE: False
12 | DROP_PATH_RATE: 0.3
13 | PATCH_NORM: True
14 | PRETRAIN_IMG_SIZE: 384
15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16 | PIXEL_MEAN: [123.675, 116.280, 103.530]
17 | PIXEL_STD: [58.395, 57.120, 57.375]
18 | SELECT_ORI_CLIP_ID: [12, 18, 24]
19 | FREQUENCY_SIGMA: [9, 7, 3]
20 | CLIP_VISION_DIM: 1024
21 | SCAN_DIM: 768
22 | PATCH_SIZE: 16
23 | SEM_SEG_HEAD:
24 | NAME: "OpenVocaMask2FormerHead"
25 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
26 | IGNORE_VALUE: 255
27 | NUM_CLASSES: 171 # number of categories in training set
28 | EMBEDDING_DIM: 768
29 | EMBED_LAYERS: 2
30 | COMMON_STRIDE: 4 # not used, hard-coded
31 | LOSS_WEIGHT: 1.0
32 | CONVS_DIM: 256
33 | MASK_DIM: 256
34 | NORM: "GN"
35 | # pixel decoder
36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
37 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
39 | COMMON_STRIDE: 4
40 | TRANSFORMER_ENC_LAYERS: 6
41 | MASK_FORMER:
42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
44 | DEEP_SUPERVISION: True
45 | NO_OBJECT_WEIGHT: 0.1
46 | CLASS_WEIGHT: 2.0
47 | MASK_WEIGHT: 5.0
48 | DICE_WEIGHT: 5.0
49 | HIDDEN_DIM: 256
50 | NUM_OBJECT_QUERIES: 100
51 | NHEADS: 8
52 | DROPOUT: 0.0
53 | DIM_FEEDFORWARD: 2048
54 | ENC_LAYERS: 0
55 | PRE_NORM: False
56 | ENFORCE_INPUT_PROJ: False
57 | SIZE_DIVISIBILITY: 32
58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
59 | TRAIN_NUM_POINTS: 12544
60 | OVERSAMPLE_RATIO: 3.0
61 | IMPORTANCE_SAMPLE_RATIO: 0.75
62 | CLIP_ADAPTER:
63 | TEXT_TEMPLATES: "vild"
64 | CLIP_MODEL_NAME: "ViT-L-14"
65 | MASK_FILL: "mean"
66 | MASK_EXPAND_RATIO: 1.0
67 | MASK_MATTING: False # use soft background, default not used
68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224
69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches
70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings:
71 | # a847: [0.3, 0.75] a150: [0.55, 0.75] pc459: [0.25, 0.65] pc59: [0.5, 0.5] voc20: [0.2, 0.65]
72 | MASK_THR: 0.4
73 | CLIP_ENSEMBLE_WEIGHT: 0.7
74 | # For the REPLACE_RATIO, we have the following settings:
75 | # a847: 0.15 a150: 0.05 pc459: 0.05 pc59: 0.05 voc20: 0.1
76 | REPLACE_RATIO: 0.15
77 | REPLACE_LAYER: [1, 3, 5, 7, 9]
78 | DATASETS:
79 | TRAIN: ("coco_2017_train_stuff_sem_seg",)
80 | TEST: ("ade20k_sem_seg_val",)
81 | SOLVER:
82 | IMS_PER_BATCH: 32
83 | BASE_LR: 0.00006
84 | MAX_ITER: 120000
85 | WARMUP_FACTOR: 1e-6
86 | WARMUP_ITERS: 1500
87 | LR_SCHEDULER_NAME: "WarmupPolyLR"
88 | WEIGHT_DECAY: 0.01
89 | WEIGHT_DECAY_NORM: 0.0
90 | WEIGHT_DECAY_EMBED: 0.0
91 | BACKBONE_MULTIPLIER: 1.0
92 | TEST_IMS_PER_BATCH: 1
93 | CLIP_GRADIENTS:
94 | ENABLED: True
95 | CLIP_TYPE: "full_model"
96 | CLIP_VALUE: 0.01
97 | NORM_TYPE: 2.0
98 | INPUT:
99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
100 | MIN_SIZE_TRAIN_SAMPLING: "choice"
101 | MIN_SIZE_TEST: 640
102 | MAX_SIZE_TRAIN: 2560
103 | MAX_SIZE_TEST: 2560
104 | CROP:
105 | ENABLED: True
106 | TYPE: "absolute"
107 | SIZE: (640, 640)
108 | SINGLE_CATEGORY_MAX_AREA: 1.0
109 | COLOR_AUG_SSD: True
110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
111 | FORMAT: "RGB"
112 | TEST:
113 | EVAL_PERIOD: 5000
114 | # SEMANTIC_ON: True
115 | # INSTANCE_ON: False
116 | # PANOPTIC_ON: False
117 | AUG:
118 | ENABLED: False
119 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
120 | MAX_SIZE: 3584
121 | FLIP: True
122 | DATALOADER:
123 | FILTER_EMPTY_ANNOTATIONS: True
124 | NUM_WORKERS: 16
125 | VERSION: 2
126 | METRIC: 'Vanilla' # Vanilla or SG-IoU
127 | OUTPUT_DIR: output/SCAN-VitL
--------------------------------------------------------------------------------
/configs/scan_vitL_demo.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "SCANDEMO"
3 | BACKBONE:
4 | FREEZE_AT: 0
5 | NAME: "D2SwinTransformer"
6 | SWIN:
7 | EMBED_DIM: 128
8 | DEPTHS: [2, 2, 18, 2]
9 | NUM_HEADS: [4, 8, 16, 32]
10 | WINDOW_SIZE: 12
11 | APE: False
12 | DROP_PATH_RATE: 0.3
13 | PATCH_NORM: True
14 | PRETRAIN_IMG_SIZE: 384
15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16 | PIXEL_MEAN: [123.675, 116.280, 103.530]
17 | PIXEL_STD: [58.395, 57.120, 57.375]
18 | SELECT_ORI_CLIP_ID: [12, 18, 24]
19 | FREQUENCY_SIGMA: [9, 7, 3]
20 | SEM_SEG_HEAD:
21 | NAME: "OpenVocaMask2FormerHead"
22 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
23 | IGNORE_VALUE: 255
24 | NUM_CLASSES: 171 # number of categories in training set
25 | EMBEDDING_DIM: 768
26 | EMBED_LAYERS: 2
27 | COMMON_STRIDE: 4 # not used, hard-coded
28 | LOSS_WEIGHT: 1.0
29 | CONVS_DIM: 256
30 | MASK_DIM: 256
31 | NORM: "GN"
32 | # pixel decoder
33 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
34 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
35 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
36 | COMMON_STRIDE: 4
37 | TRANSFORMER_ENC_LAYERS: 6
38 | MASK_FORMER:
39 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
40 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
41 | DEEP_SUPERVISION: True
42 | NO_OBJECT_WEIGHT: 0.1
43 | CLASS_WEIGHT: 2.0
44 | MASK_WEIGHT: 5.0
45 | DICE_WEIGHT: 5.0
46 | HIDDEN_DIM: 256
47 | NUM_OBJECT_QUERIES: 100
48 | NHEADS: 8
49 | DROPOUT: 0.0
50 | DIM_FEEDFORWARD: 2048
51 | ENC_LAYERS: 0
52 | PRE_NORM: False
53 | ENFORCE_INPUT_PROJ: False
54 | SIZE_DIVISIBILITY: 32
55 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
56 | TRAIN_NUM_POINTS: 12544
57 | OVERSAMPLE_RATIO: 3.0
58 | IMPORTANCE_SAMPLE_RATIO: 0.75
59 | CLIP_ADAPTER:
60 | TEXT_TEMPLATES: "vild"
61 | CLIP_MODEL_NAME: "ViT-L/14"
62 | MASK_FILL: "mean"
63 | MASK_EXPAND_RATIO: 1.0
64 | MASK_THR: 0.4 # choose the foreground objects
65 | MASK_MATTING: False # use soft background, default not used
66 | REGION_RESIZED: True # resize to the input of clip, e.g., 224
67 | CLIP_ENSEMBLE: True # use ensemble of two classification branches
68 | CLIP_ENSEMBLE_WEIGHT: 0.7
69 | REPLACE_RATIO: 0.15
70 | REPLACE_LAYER: [1, 3, 5, 7, 9]
71 | DATASETS:
72 | TRAIN: ("coco_2017_train_stuff_sem_seg",)
73 | TEST: ("ade20k_sem_seg_val",)
74 | SOLVER:
75 | IMS_PER_BATCH: 32
76 | BASE_LR: 0.00006
77 | MAX_ITER: 120000
78 | WARMUP_FACTOR: 1e-6
79 | WARMUP_ITERS: 1500
80 | WEIGHT_DECAY: 0.01
81 | WEIGHT_DECAY_NORM: 0.0
82 | WEIGHT_DECAY_EMBED: 0.0
83 | BACKBONE_MULTIPLIER: 1.0
84 | TEST_IMS_PER_BATCH: 1
85 | CLIP_GRADIENTS:
86 | ENABLED: True
87 | CLIP_TYPE: "full_model"
88 | CLIP_VALUE: 0.01
89 | NORM_TYPE: 2.0
90 | INPUT:
91 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
92 | MIN_SIZE_TRAIN_SAMPLING: "choice"
93 | MIN_SIZE_TEST: 640
94 | MAX_SIZE_TRAIN: 2560
95 | MAX_SIZE_TEST: 2560
96 | CROP:
97 | ENABLED: True
98 | TYPE: "absolute"
99 | SIZE: (640, 640)
100 | SINGLE_CATEGORY_MAX_AREA: 1.0
101 | COLOR_AUG_SSD: True
102 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
103 | FORMAT: "RGB"
104 | TEST:
105 | EVAL_PERIOD: 5000
106 | AUG:
107 | ENABLED: False
108 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
109 | MAX_SIZE: 3584
110 | FLIP: True
111 | DATALOADER:
112 | FILTER_EMPTY_ANNOTATIONS: True
113 | NUM_WORKERS: 16
114 | VERSION: 2
--------------------------------------------------------------------------------
/datasets/prepare_ade20k_sem_seg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import os
5 | from pathlib import Path
6 |
7 | import numpy as np
8 | import tqdm
9 | from PIL import Image
10 |
11 |
12 | def convert(input, output, index=None):
13 | img = np.asarray(Image.open(input))
14 | assert img.dtype == np.uint8
15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
16 | if index is not None:
17 | mapping = {i: k for k, i in enumerate(index)}
18 | img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)(
19 |             img.astype(float)  # np.float was removed from NumPy; use the builtin float
20 | ).astype(np.uint8)
21 | Image.fromarray(img).save(output)
22 |
23 |
24 | if __name__ == "__main__":
25 | dataset_dir = (
26 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
27 | )
28 | print('Caution: we only generate the validation set!')
29 | for name in ["validation"]:
30 | annotation_dir = dataset_dir / "annotations" / name
31 | output_dir = dataset_dir / "annotations_detectron2" / name
32 | output_dir.mkdir(parents=True, exist_ok=True)
33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
34 | output_file = output_dir / file.name
35 | convert(file, output_file)
36 |
--------------------------------------------------------------------------------
/datasets/prepare_pascal_context.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import tqdm
5 | import os
6 | import os.path as osp
7 | from pathlib import Path
8 |
9 | import numpy as np
10 | from PIL import Image
11 | import scipy.io
12 |
13 | def convert_pc59(mask_path, new_mask_path, pc59_dict):
14 | mat = scipy.io.loadmat(mask_path)
15 | mask = mat['LabelMap']
16 |
17 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
18 | for trID, clsID in pc59_dict.items():
19 | mask_copy[mask == clsID] = trID
20 |
21 | min_value = np.amin(mask_copy)
22 | assert min_value >= 0, print(min_value)
23 | Image.fromarray(mask_copy).save(new_mask_path, "PNG")
24 |
25 | def convert_pc459(mask_path, new_mask_path):
26 | mat = scipy.io.loadmat(mask_path)
27 | mask = mat['LabelMap']
28 | mask = mask - 1
29 | min_value = np.amin(mask)
30 | assert min_value >= 0, print(min_value)
31 | Image.fromarray(mask).save(new_mask_path, "TIFF")
32 |
33 |
34 | if __name__ == "__main__":
35 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
36 | print('Caution: we only generate the validation set!')
37 | pc_path = dataset_dir / "VOCdevkit/VOC2010"
38 |
39 | val_list = open(pc_path / "pascalcontext_val.txt", "r")
40 | pc459_labels = open(pc_path / "labels.txt", "r")
41 | pc59_labels = open(pc_path / "59_labels.txt", "r")
42 |
43 | pc459_dict = {}
44 | for line in pc459_labels.readlines():
45 | if ':' in line:
46 | idx, name = line.split(':')
47 | idx = int(idx.strip())
48 | name = name.strip()
49 | pc459_dict[name] = idx
50 |
51 | pc59_dict = {}
52 | for i, line in enumerate(pc59_labels.readlines()):
53 | name = line.split(':')[-1].strip()
54 |         if name != '':  # compare strings by value, not identity
55 | pc59_dict[i] = pc459_dict[name]
56 |
57 | pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
58 | pc459_dir.mkdir(parents=True, exist_ok=True)
59 | pc59_dir = pc_path / "annotations_detectron2" / "pc59_val"
60 | pc59_dir.mkdir(parents=True, exist_ok=True)
61 |
62 | for line in tqdm.tqdm(val_list.readlines()):
63 | fileid = line.strip()
64 | ori_mask = f'{pc_path}/trainval/{fileid}.mat'
65 | pc459_dst = f'{pc459_dir}/{fileid}.tif'
66 | pc59_dst = f'{pc59_dir}/{fileid}.png'
67 | if osp.exists(ori_mask):
68 | convert_pc459(ori_mask, pc459_dst)
69 | convert_pc59(ori_mask, pc59_dst, pc59_dict)
70 |
--------------------------------------------------------------------------------
/datasets/prepare_voc_sem_seg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 | # Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_voc_sem_seg.py
4 |
5 | import os
6 | import os.path as osp
7 | from pathlib import Path
8 | import tqdm
9 |
10 | import numpy as np
11 | from PIL import Image
12 |
13 |
14 | clsID_to_trID = {
15 | 0: 255,
16 | 1: 0,
17 | 2: 1,
18 | 3: 2,
19 | 4: 3,
20 | 5: 4,
21 | 6: 5,
22 | 7: 6,
23 | 8: 7,
24 | 9: 8,
25 | 10: 9,
26 | 11: 10,
27 | 12: 11,
28 | 13: 12,
29 | 14: 13,
30 | 15: 14,
31 | 16: 15,
32 | 17: 16,
33 | 18: 17,
34 | 19: 18,
35 | 20: 19,
36 | 255: 255,
37 | }
38 |
39 | def convert_to_trainID(
40 | maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
41 | ):
42 | mask = np.array(Image.open(maskpath))
43 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
44 | for clsID, trID in clsID_to_trID.items():
45 | mask_copy[mask == clsID] = trID
46 | seg_filename = (
47 | osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath))
48 | if is_train
49 | else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath))
50 | )
51 | if len(np.unique(mask_copy)) == 1 and np.unique(mask_copy)[0] == 255:
52 | return
53 | Image.fromarray(mask_copy).save(seg_filename, "PNG")
54 |
55 |
56 |
57 | if __name__ == "__main__":
58 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
59 | print('Caution: we only generate the validation set!')
60 | voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
61 | out_mask_dir = voc_path / "annotations_detectron2"
62 | out_image_dir = voc_path / "images_detectron2"
63 | for name in ["val"]:
64 | os.makedirs((out_mask_dir / name), exist_ok=True)
65 | os.makedirs((out_image_dir / name), exist_ok=True)
66 | val_list = [
67 | osp.join(voc_path, "SegmentationClassAug", f + ".png")
68 |         for f in np.loadtxt(osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str).tolist()  # np.str was removed from NumPy; use builtin str
69 | ]
70 | for file in tqdm.tqdm(val_list):
71 | convert_to_trainID(file, out_mask_dir, is_train=False)
72 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import argparse
5 | import glob
6 | import multiprocessing as mp
7 | import os
8 | import time
9 | import cv2
10 | import tqdm
11 |
12 | from detectron2.config import get_cfg
13 |
14 | from detectron2.projects.deeplab import add_deeplab_config
15 | from detectron2.data.detection_utils import read_image
16 | from detectron2.utils.logger import setup_logger
17 | from scan import add_ovseg_config
18 |
19 | from scan.utils import VisualizationDemo
20 |
21 | # constants
22 | WINDOW_NAME = "Open vocabulary segmentation"
23 |
24 |
25 | def setup_cfg(args):
26 | # load config from file and command-line arguments
27 | cfg = get_cfg()
28 | # for poly lr schedule
29 | add_deeplab_config(cfg)
30 | add_ovseg_config(cfg)
31 | cfg.merge_from_file(args.config_file)
32 | cfg.merge_from_list(args.opts)
33 | cfg.freeze()
34 | return cfg
35 |
36 |
37 | def get_parser():
38 | parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation")
39 | parser.add_argument(
40 | "--config-file",
 41 |         default="configs/scan_vitL_demo.yaml",
42 | metavar="FILE",
43 | help="path to config file",
44 | )
45 | parser.add_argument(
46 | "--input",
47 | nargs="+",
48 | help="A list of space separated input images; "
49 | "or a single glob pattern such as 'directory/*.jpg'",
50 | default='./data/ADEChallengeData2016/images/validation/*.jpg'
51 | )
52 | parser.add_argument(
53 | "--class-names",
54 | nargs="+",
 55 |         default=["building"],  # keep a list so the default matches nargs="+"
56 | help="A list of user-defined class_names"
57 | )
58 | parser.add_argument(
59 | "--output",
60 | default='./pred',
61 | help="A file or directory to save output visualizations. "
62 | "If not given, will show output in an OpenCV window.",
63 | )
64 | parser.add_argument(
65 | "--opts",
66 | help="Modify config options using the command-line 'KEY VALUE' pairs",
67 | default=['MODEL.WEIGHTS', '/opt/tiger/ljyaronld/OVSeg/ckpt/SwinB-Mask2Former-openclip_datacomp-frequency_121824-aux-split_query_only_crossattn-final.pth'],
68 | nargs=argparse.REMAINDER,
69 | )
70 | return parser
71 |
72 |
73 | if __name__ == "__main__":
74 | mp.set_start_method("spawn", force=True)
75 | args = get_parser().parse_args()
76 | setup_logger(name="fvcore")
77 | logger = setup_logger()
78 | logger.info("Arguments: " + str(args))
79 |
80 | cfg = setup_cfg(args)
81 |
82 | demo = VisualizationDemo(cfg)
 83 |     # Class names for open-vocabulary inference come from --class-names.
 84 |     # To use a fixed category list instead (e.g. the 150 ADE20K names,
 85 |     # one per line in a text file such as a_150.txt), replace the
 86 |     # assignment below with:
 87 |     #     class_names = [l.strip() for l in open('a_150.txt') if l.strip()]
 88 |     class_names = args.class_names
89 | if args.input:
90 | if len(args.input) == 1:
91 | args.input = glob.glob(os.path.expanduser(args.input[0]))
92 | assert args.input, "The input path(s) was not found"
93 | for path in tqdm.tqdm(args.input, disable=not args.output):
94 | # use PIL, to be consistent with evaluation
95 | img = read_image(path, format="BGR")
96 | start_time = time.time()
97 | predictions, visualized_output = demo.run_on_image(img, class_names)
98 | logger.info(
99 | "{}: {} in {:.2f}s".format(
100 | path,
101 | "detected {} instances".format(len(predictions["instances"]))
102 | if "instances" in predictions
103 | else "finished",
104 | time.time() - start_time,
105 | )
106 | )
107 |
108 | if args.output:
109 | if os.path.isdir(args.output):
110 | assert os.path.isdir(args.output), args.output
111 | out_filename = os.path.join(args.output, os.path.basename(path))
112 | else:
113 | assert len(args.input) == 1, "Please specify a directory with args.output"
114 | out_filename = args.output
115 | visualized_output.save(out_filename)
116 | else:
117 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
118 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
119 | if cv2.waitKey(0) == 27:
120 | break # esc to quit
121 | else:
122 | raise NotImplementedError
--------------------------------------------------------------------------------
/imgs/cs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/cs.png
--------------------------------------------------------------------------------
/imgs/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/pipeline.png
--------------------------------------------------------------------------------
/imgs/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/results.png
--------------------------------------------------------------------------------
/imgs/visual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/visual.png
--------------------------------------------------------------------------------
/open_clip_training/.github/workflows/clear-cache.yml:
--------------------------------------------------------------------------------
1 | name: Clear cache
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | permissions:
7 | actions: write
8 |
9 | jobs:
10 | clear-cache:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - name: Clear cache
14 | uses: actions/github-script@v6
15 | with:
16 | script: |
17 | const caches = await github.rest.actions.getActionsCacheList({
18 | owner: context.repo.owner,
19 | repo: context.repo.repo,
20 | })
21 | for (const cache of caches.data.actions_caches) {
22 | console.log(cache)
23 | await github.rest.actions.deleteActionsCacheById({
24 | owner: context.repo.owner,
25 | repo: context.repo.repo,
26 | cache_id: cache.id,
27 | })
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/open_clip_training/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - uses: actions-ecosystem/action-regex-match@v2
13 | id: regex-match
14 | with:
15 | text: ${{ github.event.head_commit.message }}
16 | regex: '^Release ([^ ]+)'
17 | - name: Set up Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: '3.8'
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install setuptools wheel twine
25 | - name: Release
26 | if: ${{ steps.regex-match.outputs.match != '' }}
27 | uses: softprops/action-gh-release@v1
28 | with:
29 | tag_name: v${{ steps.regex-match.outputs.group1 }}
30 | - name: Build and publish
31 | if: ${{ steps.regex-match.outputs.match != '' }}
32 | env:
33 | TWINE_USERNAME: __token__
34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
35 | run: |
36 | python setup.py sdist bdist_wheel
37 | twine upload dist/*
38 |
--------------------------------------------------------------------------------
/open_clip_training/.gitignore:
--------------------------------------------------------------------------------
1 | logs/
2 | wandb/
3 | models/
4 | features/
5 | results/
6 |
7 | tests/data/
8 | *.pt
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | pip-wheel-metadata/
33 | share/python-wheels/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 | MANIFEST
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .nox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | *.py,cover
60 | .hypothesis/
61 | .pytest_cache/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | target/
85 |
86 | # Jupyter Notebook
87 | .ipynb_checkpoints
88 |
89 | # IPython
90 | profile_default/
91 | ipython_config.py
92 |
93 | # pyenv
94 | .python-version
95 |
96 | # pipenv
97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
100 | # install all needed dependencies.
101 | #Pipfile.lock
102 |
103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104 | __pypackages__/
105 |
106 | # Celery stuff
107 | celerybeat-schedule
108 | celerybeat.pid
109 |
110 | # SageMath parsed files
111 | *.sage.py
112 |
113 | # Environments
114 | .env
115 | .venv
116 | env/
117 | venv/
118 | ENV/
119 | env.bak/
120 | venv.bak/
121 |
122 | # Spyder project settings
123 | .spyderproject
124 | .spyproject
125 |
126 | # Rope project settings
127 | .ropeproject
128 |
129 | # mkdocs documentation
130 | /site
131 |
132 | # mypy
133 | .mypy_cache/
134 | .dmypy.json
135 | dmypy.json
136 |
137 | # Pyre type checker
138 | .pyre/
139 | sync.sh
140 | gpu1sync.sh
141 | .idea
142 | *.pdf
143 | **/._*
144 | **/*DS_*
145 | **.jsonl
146 | src/sbatch
147 | src/misc
148 | .vscode
149 | src/debug
150 | core.*
151 |
152 | # Allow
153 | !src/evaluation/misc/results_dbs/*
--------------------------------------------------------------------------------
/open_clip_training/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.1.0
2 | message: If you use this software, please cite it as below.
3 | authors:
4 | - family-names: Ilharco
5 | given-names: Gabriel
6 | - family-names: Wortsman
7 | given-names: Mitchell
8 | - family-names: Wightman
9 | given-names: Ross
10 | - family-names: Gordon
11 | given-names: Cade
12 | - family-names: Carlini
13 | given-names: Nicholas
14 | - family-names: Taori
15 | given-names: Rohan
16 | - family-names: Dave
17 | given-names: Achal
18 | - family-names: Shankar
19 | given-names: Vaishaal
20 | - family-names: Namkoong
21 | given-names: Hongseok
22 | - family-names: Miller
23 | given-names: John
24 | - family-names: Hajishirzi
25 | given-names: Hannaneh
26 | - family-names: Farhadi
27 | given-names: Ali
28 | - family-names: Schmidt
29 | given-names: Ludwig
30 | title: OpenCLIP
31 | version: v0.1
32 | doi: 10.5281/zenodo.5143773
33 | date-released: 2021-07-28
34 |
--------------------------------------------------------------------------------
/open_clip_training/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
2 | Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
3 | John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
4 | Ludwig Schmidt
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining
7 | a copy of this software and associated documentation files (the
8 | "Software"), to deal in the Software without restriction, including
9 | without limitation the rights to use, copy, modify, merge, publish,
10 | distribute, sublicense, and/or sell copies of the Software, and to
11 | permit persons to whom the Software is furnished to do so, subject to
12 | the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 |
--------------------------------------------------------------------------------
/open_clip_training/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/open_clip/bpe_simple_vocab_16e6.txt.gz
2 | include src/open_clip/model_configs/*.json
3 |
4 |
--------------------------------------------------------------------------------
/open_clip_training/Makefile:
--------------------------------------------------------------------------------
1 | install: ## [Local development] Upgrade pip, install requirements, install package.
2 | python -m pip install -U pip
3 | python -m pip install -e .
4 |
5 | install-training:
6 | python -m pip install -r requirements-training.txt
7 |
8 | install-test: ## [Local development] Install test requirements
9 | python -m pip install -r requirements-test.txt
10 |
11 | test: ## [Local development] Run unit tests
12 | python -m pytest -x -s -v tests
13 |
--------------------------------------------------------------------------------
/open_clip_training/docs/LOW_ACC.md:
--------------------------------------------------------------------------------
1 | As we describe in more detail below, CLIP models in a medium accuracy regime already allow us to draw conclusions about the robustness of larger CLIP models since the models follow reliable scaling laws.
2 |
 3 | [Cherti et al., 2022](https://arxiv.org/abs/2212.07143) and [Gadre et al., 2023](https://arxiv.org/abs/2304.14108) provide further discussion of the scaling behavior of CLIP models.
4 |
5 | ## Scaling trends
6 |
7 | The plot below shows how zero-shot performance of CLIP models varies as we scale the number of samples used for training. Zero-shot performance increases steadily for both ImageNet and [ImageNetV2](https://arxiv.org/abs/1902.10811), and is far from saturated at ~15M samples.
8 |
9 |
10 |
11 | ## Why are low-accuracy CLIP models interesting?
12 |
13 | **TL;DR:** CLIP models have high effective robustness, even at small scales.
14 |
15 | CLIP models are particularly intriguing because they are more robust to natural distribution shifts (see Section 3.3 in the [CLIP paper](https://arxiv.org/abs/2103.00020)).
16 | This phenomenon is illustrated by the figure below, with ImageNet accuracy on the x-axis
17 | and [ImageNetV2](https://arxiv.org/abs/1902.10811) (a reproduction of the ImageNet validation set with distribution shift) accuracy on the y-axis.
18 | Standard training denotes training on the ImageNet train set and the CLIP zero-shot models
19 | are shown as stars.
20 |
21 | 
22 |
23 | As observed by [Taori et al., 2020](https://arxiv.org/abs/2007.00644) and [Miller et al., 2021](https://arxiv.org/abs/2107.04649), the in-distribution
24 | and out-of-distribution accuracies of models trained on ImageNet follow a predictable linear trend (the red line in the above plot). *Effective robustness*
25 | quantifies robustness as accuracy beyond this baseline, i.e., how far a model lies above the red line. Ideally a model would not suffer from distribution shift and fall on the y = x line ([trained human labelers are within a percentage point of the y = x line](http://proceedings.mlr.press/v119/shankar20c.html)).
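
As a toy illustration (not the exact procedure of the papers above, which fit the trend after a logit transform of accuracy), effective robustness is the vertical gap between a model's out-of-distribution accuracy and the baseline fitted on standard ImageNet-trained models:

```python
import numpy as np

# (ImageNet acc, ImageNetV2 acc) for standard ImageNet-trained models -- made-up numbers for illustration.
baseline = np.array([[60.0, 47.0], [70.0, 57.5], [76.0, 64.0], [80.0, 68.5]])
slope, intercept = np.polyfit(baseline[:, 0], baseline[:, 1], 1)  # the "red line"

def effective_robustness(in_acc, out_acc):
    """Accuracy above the fitted in-distribution -> out-of-distribution trend."""
    return out_acc - (slope * in_acc + intercept)

# A hypothetical zero-shot model with the same ImageNet accuracy but higher
# ImageNetV2 accuracy than the baseline has positive effective robustness.
print(effective_robustness(70.0, 62.0))
```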
26 |
27 | Even though the CLIP models trained with
28 | this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same
29 | trend of improved effective robustness (the purple line). Therefore, we can study what makes
30 | CLIP robust without requiring industrial-scale compute.
31 |
32 | For more information on effective robustness, please see:
33 |
34 | - [Recht et al., 2019](https://arxiv.org/abs/1902.10811).
35 | - [Taori et al., 2020](https://arxiv.org/abs/2007.00644).
36 | - [Miller et al., 2021](https://arxiv.org/abs/2107.04649).
37 |
38 | To know more about the factors that contribute to CLIP's robustness refer to [Fang et al., 2022](https://arxiv.org/abs/2205.01397).
--------------------------------------------------------------------------------
/open_clip_training/docs/clip_conceptual_captions.md:
--------------------------------------------------------------------------------
1 | ## Additional training curves for CLIP on Conceptual Captions
2 |
 3 | ### Zero shot accuracy
4 | 
5 |
 6 | ### Training loss curve
7 | 
8 |
 9 | ### Validation loss curve
10 | 
11 |
12 | ### Validation recall
13 | 
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_finetune.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "2.56e-5" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 3072 \
11 | --wd 0.2 \
12 | --batch-size 1024 \
13 | --aug-cfg scale='(0.4, 1.0)' \
14 | --epochs 1 \
15 | --train-num-samples 131072000 \
16 | --workers 6 \
17 | --model ViT-B-16-CL16 \
18 | --pretrained '/path/to/ckpt' \
19 | --precision 'amp_bf16' \
20 | --ddp-static-graph \
21 | --local-loss \
22 | --gather-with-grad \
23 | --grad-checkpointing \
24 | --log-every-n-steps 256 \
25 | --seed 0 \
26 | --logs ./logs/ \
27 | --imagenet-val '/path/to/imagenet/val'
28 |
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_pretrain.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "2.048e-3" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 782 \
11 | --wd 0.2 \
12 | --batch-size 8192 \
13 | --aug-cfg scale='(0.4, 1.0)' \
14 | --epochs 6 \
15 | --workers 6 \
16 | --model ViT-B-16-CL16 \
17 | --precision 'amp_bf16' \
18 | --ddp-static-graph \
19 | --local-loss \
20 | --gather-with-grad \
21 | --force-image-size 112 \
22 | --grad-checkpointing \
23 | --log-every-n-steps 32 \
24 | --seed 0 \
25 | --logs ./logs/ \
26 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_finetune.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "2.24e-5" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 3571 \
11 | --wd 0.2 \
12 | --batch-size 896 \
13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
14 | --epochs 1 \
15 | --train-num-samples 131072000 \
16 | --workers 6 \
17 | --model ViT-L-16-CL16-GAP \
18 | --pretrained '/path/to/ckpt' \
19 | --precision 'amp_bf16' \
20 | --ddp-static-graph \
21 | --local-loss \
22 | --gather-with-grad \
23 | --grad-checkpointing \
24 | --log-every-n-steps 293 \
25 | --seed 0 \
26 | --logs ./logs/ \
27 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_pretrain.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "1.024e-3" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 1563 \
11 | --wd 0.2 \
12 | --batch-size 4096 \
13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
14 | --epochs 6 \
15 | --workers 6 \
16 | --model ViT-L-16-CL16-GAP \
17 | --precision 'amp_bf16' \
18 | --ddp-static-graph \
19 | --local-loss \
20 | --gather-with-grad \
21 | --force-image-size 64 \
22 | --grad-checkpointing \
23 | --log-every-n-steps 64 \
24 | --seed 0 \
25 | --logs ./logs/ \
26 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_finetune.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "2.24e-5" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 3571 \
11 | --wd 0.2 \
12 | --batch-size 896 \
13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
14 | --epochs 1 \
15 | --train-num-samples 131072000 \
16 | --workers 6 \
17 | --model ViT-L-16-CL32-GAP \
18 | --pretrained '/path/to/ckpt' \
19 | --precision 'amp_bf16' \
20 | --ddp-static-graph \
21 | --local-loss \
22 | --gather-with-grad \
23 | --grad-checkpointing \
24 | --log-every-n-steps 293 \
25 | --seed 0 \
26 | --logs ./logs/ \
27 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_pretrain.sh:
--------------------------------------------------------------------------------
1 | torchrun --nproc_per_node 8 -m training.main \
2 | --save-frequency 1 \
3 | --save-most-recent \
4 | --zeroshot-frequency 1 \
5 | --train-data '/path/to/laion-400m' \
6 | --dataset-type webdataset \
7 | --lr "1.024e-3" \
8 | --beta1 0.9 \
9 | --beta2 0.95 \
10 | --warmup 1563 \
11 | --wd 0.2 \
12 | --batch-size 4096 \
13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
14 | --epochs 6 \
15 | --workers 6 \
16 | --model ViT-L-16-CL8-Syntax-GAP \
17 | --precision 'amp_bf16' \
18 | --ddp-static-graph \
19 | --local-loss \
20 | --gather-with-grad \
21 | --force-image-size 96 \
22 | --grad-checkpointing \
23 | --log-every-n-steps 64 \
24 | --seed 0 \
25 | --logs ./logs/ \
26 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipav2/vit_h14/i257_t32_finetunex4.sh:
--------------------------------------------------------------------------------
1 | # have not been tested. use it at your own discretion
2 | # the original experiment was run on tpu v3-256.
3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups.
4 | torchrun --nproc_per_node 8 -m training.main \
5 | --save-frequency 1 \
6 | --save-most-recent \
7 | --zeroshot-frequency 1 \
8 | --train-data '/path/to/laion2b_or_datacomp1b' \
9 | --train-num-samples 131072000 \
10 | --dataset-type webdataset \
11 | --lr "5.12e-5" \
12 | --beta1 0.9 \
13 | --beta2 0.95 \
14 | --warmup 800 \
15 | --wd 0.2 \
16 | --batch-size 4096 \
17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
18 | --epochs 4 \
19 | --workers 6 \
20 | --model ViT-H-14-CL32-GAP \
21 | --pretrained '/path/to/pretrain84_ckpt' \
22 | --precision 'amp_bf16' \
23 | --ddp-static-graph \
24 | --local-loss \
25 | --gather-with-grad \
26 | --force-image-size 224 \
27 | --force-patch-dropout 0.3 \
28 | --grad-checkpointing \
29 | --log-every-n-steps 64 \
30 | --seed 0 \
31 | --logs ./logs/ \
32 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipav2/vit_h14/i50_t8_pretrain.sh:
--------------------------------------------------------------------------------
1 | # have not been tested. use it at your own discretion
2 | # the original experiment was run on tpu v3-256.
3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups.
4 | torchrun --nproc_per_node 8 -m training.main \
5 | --save-frequency 1 \
6 | --save-most-recent \
7 | --zeroshot-frequency 1 \
8 | --train-data '/path/to/laion2b_or_datacomp1b' \
9 | --train-num-samples 4e8 \
10 | --dataset-type webdataset \
11 | --lr "2.048e-3" \
12 | --beta1 0.9 \
13 | --beta2 0.95 \
14 | --warmup 3200 \
15 | --wd 0.2 \
16 | --batch-size 8192 \
17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
18 | --epochs 32 \
19 | --workers 6 \
20 | --model ViT-H-14-CL8-Syntax-GAP \
21 | --precision 'amp_bf16' \
22 | --ddp-static-graph \
23 | --local-loss \
24 | --gather-with-grad \
25 | --force-image-size 84 \
26 | --grad-checkpointing \
27 | --log-every-n-steps 32 \
28 | --seed 0 \
29 | --logs ./logs/ \
30 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/clipav2/vit_h14/i577_t32_finetunex1.sh:
--------------------------------------------------------------------------------
1 | # This script has not been tested; use it at your own discretion.
2 | # The original experiment was run on a TPU v3-256 pod.
3 | # This example script assumes 8 GPUs, each with large memory; tune batch size, warmup, and lr accordingly if your machine setup differs.
4 | torchrun --nproc_per_node 8 -m training.main \
5 | --save-frequency 1 \
6 | --save-most-recent \
7 | --zeroshot-frequency 1 \
8 | --train-data '/path/to/laion2b_or_datacomp1b' \
9 | --train-num-samples 131072000 \
10 | --dataset-type webdataset \
11 | --lr "6.4e-6" \
12 | --beta1 0.9 \
13 | --beta2 0.95 \
14 | --warmup 1600 \
15 | --wd 0.2 \
16 | --batch-size 2048 \
17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
18 | --epochs 1 \
19 | --workers 6 \
20 | --model ViT-H-14-CL32-GAP \
21 | --pretrained '/path/to/finetune224_ckpt' \
22 | --precision 'amp_bf16' \
23 | --ddp-static-graph \
24 | --local-loss \
25 | --gather-with-grad \
26 | --force-image-size 336 \
27 | --force-patch-dropout 0.4 \
28 | --grad-checkpointing \
29 | --log-every-n-steps 64 \
30 | --seed 0 \
31 | --logs ./logs/ \
32 | --imagenet-val '/path/to/imagenet/val'
--------------------------------------------------------------------------------
/open_clip_training/docs/script_examples/stability_example.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --partition=g40423
3 | #SBATCH --job-name=testopenclip
4 | #SBATCH --nodes 30
5 | #SBATCH --ntasks-per-node=8
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --output=%x_%j.out
8 | #SBATCH --comment=laion
9 | #SBATCH --open-mode=append
10 | #SBATCH --exclusive
11 |
12 | module load openmpi
13 | module load cuda/11.7
14 |
15 | export MASTER_ADDR=`hostname`
16 | export MASTER_PORT=12802
17 | export NCCL_PROTO=simple
18 | export FI_EFA_FORK_SAFE=1
19 | export FI_LOG_LEVEL=1
20 | export FI_EFA_USE_DEVICE_RDMA=1
21 | export NCCL_DEBUG=info
22 |
23 | export PYTHONFAULTHANDLER=1
24 |
25 | export CUDA_LAUNCH_BLOCKING=0
26 | export OMPI_MCA_mtl_base_verbose=1
27 | export FI_EFA_ENABLE_SHM_TRANSFER=0
28 | export FI_PROVIDER=efa
29 | export FI_EFA_TX_MIN_CREDITS=64
30 | export NCCL_TREE_THRESHOLD=0
31 |
32 | cd /admin/home-mitchellw/open_clip/src
33 | export PYTHONPATH="$PYTHONPATH:/admin/home-mitchellw/open_clip/src"
34 |
35 | EXP_NAME="test-B-32-laion5b-lr1e-3-bs90k"
36 |
37 | srun --comment laion --cpu_bind=v --accel-bind=gn python -m training.main \
38 | --save-frequency 1 \
39 | --train-data="pipe:aws s3 cp s3://s-datasets/laion5b/{laion2B-data/{000000..231349}.tar,laion2B-multi-data/{000000..226687}.tar,laion1B-nolang-data/{000000..127231}.tar} -" \
40 | --train-num-samples 135646078 \
41 | --dataset-type webdataset \
42 | --dataset-resampled \
43 | --warmup 2000 \
44 | --batch-size=375 \
45 | --epochs=97 \
46 | --lr 1e-3 \
47 | --workers=8 \
48 | --report-to wandb \
49 | --name ${EXP_NAME} \
50 | --logs /scratch/logs/ \
51 | --model ViT-B-32 \
52 | --seed 0 \
53 | --ddp-static-graph \
54 | --local-loss \
55 | --gather-with-grad \
56 | --grad-checkpointing \
57 | --precision amp_bfloat16 \
58 | --wandb-project-name open_clip6 \
59 | --resume "latest" \
60 | --remote-sync s3://s-laion/mitchellw/logs
61 |
--------------------------------------------------------------------------------
/open_clip_training/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | regression_test
4 |
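The `regression_test` marker registered above is what test modules and `pytest -m` selection key on. A minimal sketch of a test opting into the marker (the test name and body below are illustrative, not taken from the repo):

import pytest

@pytest.mark.regression_test
def test_pretrained_outputs_match_reference():
    # Placeholder body; real regression tests compare model outputs against stored references.
    assert True

# Select only these tests with `pytest -m regression_test`,
# or exclude them with `pytest -m "not regression_test"`.
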
--------------------------------------------------------------------------------
/open_clip_training/scripts/clipav1_vit_l16_i37_t8.sh:
--------------------------------------------------------------------------------
1 | # eval on a single gpu
2 | CUDA_VISIBLE_DEVICES=2 TORCH_CUDNN_V8_API_ENABLED=1 TFDS_PREFETCH_SIZE=8192 python3 -m training.main \
3 | --model ViT-L-16-CL32-GAP \
4 | --pretrained "/path/to/clipa_vit_l16_i37_t8.pt" \
5 | --seed 0 \
6 | --imagenet-val '/path/to/ImageNet/val'
--------------------------------------------------------------------------------
/open_clip_training/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=1 python3 -m training.main \
2 | --model ViT-H-14-CL32-GAP-BigVision \
3 | --pretrained "/path/to/vit_h14_i84_224_336_cl32_gap_datacomp1b.pt" \
4 | --force-image-size 336 \
5 | --square-resize-only \
6 | --interpolation 'bilinear' \
7 | --image-mean 0.485 0.456 0.406 \
8 | --image-std 0.229 0.224 0.225 \
9 | --seed 0 \
10 | --imagenet-val '/path/to/ImageNet/val'
11 |
--------------------------------------------------------------------------------
/open_clip_training/scripts/h14_224_32_finetune.sh:
--------------------------------------------------------------------------------
1 | # 64k batchsize for 2.048e-3 lr
2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \
3 | --save-frequency 1 \
4 | --save-most-recent \
5 | --zeroshot-frequency 1 \
6 | --train-data '/path/to/laion' \
7 | --dataset-type webdataset \
8 | --lr "2.048e-3" \
9 | --beta1 0.9 \
10 | --beta2 0.95 \
11 | --warmup 782 \
12 | --wd 0.2 \
13 | --batch-size 4096 \
14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
15 | --epochs=7 \
16 | --workers=6 \
17 | --model ViT-H-14-CL32-GAP \
18 | --precision 'amp_bf16' \
19 | --local-loss \
20 | --gather-with-grad \
21 | --force-image-size 224 \
22 | --grad-checkpointing \
23 | --log-every-n-steps 32 \
24 | --seed 0 \
25 | --logs ./logs/ \
26 | --imagenet-val '/path/to/ImageNet/val' \
27 | --name 'name' \
28 | --report-to "wandb" \
29 | --wandb-project-name "project_name"
30 |
31 |
32 |
--------------------------------------------------------------------------------
/open_clip_training/scripts/h14_84_8_pretrain.sh:
--------------------------------------------------------------------------------
1 | # 64k batchsize for 2.048e-3 lr
2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \
3 | --save-frequency 1 \
4 | --save-most-recent \
5 | --zeroshot-frequency 1 \
6 | --train-data '/path/to/laion' \
7 | --dataset-type webdataset \
8 | --lr "2.048e-3" \
9 | --beta1 0.9 \
10 | --beta2 0.95 \
11 | --warmup 782 \
12 | --wd 0.2 \
13 | --batch-size 4096 \
14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
15 | --epochs=7 \
16 | --workers=6 \
17 | --model ViT-H-14-CL8-SyntaxMask-GAP \
18 | --precision 'amp_bf16' \
19 | --local-loss \
20 | --gather-with-grad \
21 | --force-image-size 84 \
22 | --grad-checkpointing \
23 | --log-every-n-steps 32 \
24 | --seed 0 \
25 | --logs ./logs/ \
26 | --imagenet-val '/path/to/ImageNet/val' \
27 | --name 'name' \
28 | --report-to "wandb" \
29 | --wandb-project-name "project_name"
30 |
31 |
32 |
--------------------------------------------------------------------------------
/open_clip_training/setup.py:
--------------------------------------------------------------------------------
1 | """ Setup
2 | """
3 | from setuptools import setup, find_packages
4 | from codecs import open
5 | from os import path
6 |
7 | here = path.abspath(path.dirname(__file__))
8 |
9 | # Get the long description from the README file
10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f:
11 | long_description = f.read()
12 |
13 | def _read_reqs(relpath):
14 | fullpath = path.join(path.dirname(__file__), relpath)
15 | with open(fullpath) as f:
16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))]
17 |
18 | REQUIREMENTS = _read_reqs("requirements.txt")
19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt")
20 |
21 | exec(open('src/open_clip/version.py').read())
22 | setup(
23 | name='open_clip_torch',
24 | version=__version__,
25 | description='OpenCLIP',
26 | long_description=long_description,
27 | long_description_content_type='text/markdown',
28 | url='https://github.com/mlfoundations/open_clip',
29 | author='',
30 | author_email='',
31 | classifiers=[
32 | # How mature is this project? Common values are
33 | # 3 - Alpha
34 | # 4 - Beta
35 | # 5 - Production/Stable
36 | 'Development Status :: 3 - Alpha',
37 | 'Intended Audience :: Education',
38 | 'Intended Audience :: Science/Research',
39 | 'License :: OSI Approved :: Apache Software License',
40 | 'Programming Language :: Python :: 3.7',
41 | 'Programming Language :: Python :: 3.8',
42 | 'Programming Language :: Python :: 3.9',
43 | 'Programming Language :: Python :: 3.10',
44 | 'Topic :: Scientific/Engineering',
45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
46 | 'Topic :: Software Development',
47 | 'Topic :: Software Development :: Libraries',
48 | 'Topic :: Software Development :: Libraries :: Python Modules',
49 | ],
50 |
51 | # Note that this is a string of words separated by whitespace, not a list.
52 | keywords='CLIP pretrained',
53 | package_dir={'': 'src'},
54 | packages=find_packages(where='src'),
55 | include_package_data=True,
56 | install_requires=REQUIREMENTS,
57 | extras_require={
58 | "training": TRAINING_REQUIREMENTS,
59 | },
60 | python_requires='>=3.7',
61 | )
62 |
--------------------------------------------------------------------------------
/open_clip_training/src/clip_adapter/clip_adapter.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, Union, Callable, Optional, List
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn.functional as F
6 | from torch import nn
7 |
8 | from open_clip.factory import create_model_and_transforms
9 | import copy
10 |
11 | class ClipAdapter(nn.Module):
12 | def __init__(self, args, device):
13 | super().__init__()
14 | model, preprocess_train, preprocess_val, preprocess_val_entire = create_model_and_transforms(
15 | args.model,
16 | args.pretrained,
17 | precision=args.precision,
18 | device=device,
19 | jit=args.torchscript,
20 | force_quick_gelu=args.force_quick_gelu,
21 | force_custom_text=args.force_custom_text,
22 | force_patch_dropout=args.force_patch_dropout,
23 | force_image_size=args.force_image_size,
24 | image_mean=args.image_mean,
25 | image_std=args.image_std,
26 | image_interpolation=args.image_interpolation,
27 | image_resize_mode=args.image_resize_mode, # only effective for inference
28 | aug_cfg=args.aug_cfg,
29 | pretrained_image=args.pretrained_image,
30 | output_dict=True,
31 | with_mask=args.with_mask,
32 | mask_emb_depth=args.mask_emb_depth
33 | )
34 |
35 | self.clip_model = model
36 | self.preprocess_train = preprocess_train
37 | self.preprocess_val = preprocess_val
38 | self.preprocess_val_entire = preprocess_val_entire
39 |
40 | self.original_clip_visual = copy.deepcopy(model.visual)
41 | for _, param in self.original_clip_visual.named_parameters():
42 | param.requires_grad = False
43 |
44 | def forward(self, original_image, image, text, mask=None):
45 | if image is None:
46 | return self.encode_text(text)
47 | elif text is None:
48 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True)
49 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768]
50 |
51 | image_features = F.normalize(image_features, dim=-1) # [32, 768]
52 | return {'image_features': image_features}
53 |
54 | if mask is None:
55 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True)
56 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features) # [32, 768]
57 | else:
58 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True)
59 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768]
60 |
61 | image_features = F.normalize(image_features, dim=-1) # [32, 768]
62 |
63 | text_features = self.clip_model.encode_text(text)
64 | text_features = F.normalize(text_features, dim=-1) # [32, 768]
65 |
66 | # return image_features, text_features, self.clip_model.logit_scale.exp()
67 | out_dict = {
68 | "image_features": image_features,
69 | "text_features": text_features,
70 | "logit_scale": self.clip_model.logit_scale.exp()
71 | }
72 | if self.clip_model.logit_bias is not None:
73 | out_dict['logit_bias'] = self.clip_model.logit_bias
74 | return out_dict
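A hypothetical driver for the ClipAdapter above, given purely as a sketch: the args fields mirror those read in __init__, but every value here (model name, precision, the with_mask/mask_emb_depth settings, the empty pretrained path) is a placeholder assumption, and the dummy tensors stand in for batches produced by the returned preprocess_* transforms.

from types import SimpleNamespace

import torch

from clip_adapter.clip_adapter import ClipAdapter  # assumes src/ is on PYTHONPATH

args = SimpleNamespace(
    model='ViT-L-14', pretrained='', precision='fp32',  # pretrained='' skips checkpoint loading
    torchscript=False, force_quick_gelu=False, force_custom_text=False,
    force_patch_dropout=None, force_image_size=None,
    image_mean=None, image_std=None, image_interpolation=None,
    image_resize_mode=None, aug_cfg=None, pretrained_image=False,
    with_mask=True, mask_emb_depth=0,
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
adapter = ClipAdapter(args, device)

# Dummy batch: two images plus their tokenized captions.
original_image = torch.randn(2, 3, 224, 224, device=device)
image = torch.randn(2, 3, 224, 224, device=device)
text = torch.randint(0, 49408, (2, 77), device=device, dtype=torch.long)
out = adapter(original_image, image, text)
print(out['image_features'].shape, out['text_features'].shape, out['logit_scale'])
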
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .coca_model import CoCa
2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint
5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss
6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
8 | get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
9 | from .openai import load_openai_model, list_openai_models
10 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
11 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
12 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
13 | from .tokenizer import SimpleTokenizer, tokenize, decode
14 | from .transform import image_transform, AugmentationCfg
15 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
16 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
17 |
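A minimal sketch of the factory API re-exported here. The four-value unpacking follows the call seen in clip_adapter.py above (upstream open_clip returns three values); the model name and prompts are placeholders.

import torch
import open_clip

model, preprocess_train, preprocess_val, preprocess_val_entire = \
    open_clip.create_model_and_transforms('ViT-B-32')
tokenizer = open_clip.get_tokenizer('ViT-B-32')

text = tokenizer(['a photo of a cat', 'a photo of a dog'])
with torch.no_grad():
    text_features = model.encode_text(text)
print(text_features.shape)  # (2, embed_dim)
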
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/constants.py:
--------------------------------------------------------------------------------
1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3 | IMAGENET_MEAN = (0.485, 0.456, 0.406)
4 | IMAGENET_STD = (0.229, 0.224, 0.225)
5 | INCEPTION_MEAN = (0.5, 0.5, 0.5)
6 | INCEPTION_STD = (0.5, 0.5, 0.5)
7 |
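A short sketch of how these normalization constants are typically consumed, assuming torchvision-style preprocessing of the kind open_clip's image_transform builds:

from torchvision import transforms

from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])
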
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/generation_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/generation_utils.py
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/hf_configs.py:
--------------------------------------------------------------------------------
1 | # HF architecture dict:
2 | arch_dict = {
3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4 | "roberta": {
5 | "config_names": {
6 | "context_length": "max_position_embeddings",
7 | "vocab_size": "vocab_size",
8 | "width": "hidden_size",
9 | "heads": "num_attention_heads",
10 | "layers": "num_hidden_layers",
11 | "layer_attr": "layer",
12 | "token_embeddings_attr": "embeddings"
13 | },
14 | "pooler": "mean_pooler",
15 | },
16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17 | "xlm-roberta": {
18 | "config_names": {
19 | "context_length": "max_position_embeddings",
20 | "vocab_size": "vocab_size",
21 | "width": "hidden_size",
22 | "heads": "num_attention_heads",
23 | "layers": "num_hidden_layers",
24 | "layer_attr": "layer",
25 | "token_embeddings_attr": "embeddings"
26 | },
27 | "pooler": "mean_pooler",
28 | },
29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30 | "mt5": {
31 | "config_names": {
32 | # unlimited seqlen
33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35 | "context_length": "",
36 | "vocab_size": "vocab_size",
37 | "width": "d_model",
38 | "heads": "num_heads",
39 | "layers": "num_layers",
40 | "layer_attr": "block",
41 | "token_embeddings_attr": "embed_tokens"
42 | },
43 | "pooler": "mean_pooler",
44 | },
45 | # https://huggingface.co/docs/transformers/model_doc/bert
46 | "bert": {
47 | "config_names": {
48 | "context_length": "max_position_embeddings",
49 | "vocab_size": "vocab_size",
50 | "width": "hidden_size",
51 | "heads": "num_attention_heads",
52 | "layers": "num_hidden_layers",
53 | },
54 | "pooler": "cls_pooler",
55 | },
56 | # https://huggingface.co/docs/transformers/model_doc/m2m_100
57 | "m2m_100": {
58 | "config_names": {
59 | "context_length": "max_position_embeddings",
60 | "vocab_size": "vocab_size",
61 | "width": "d_model",
62 | "heads": "encoder_attention_heads",
63 | "layers": "encoder_layers",
64 | },
65 | "pooler": "cls_pooler",
66 | },
67 | }
68 |
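A sketch of how arch_dict is meant to be read when wrapping a Hugging Face text tower; the AutoConfig lookup below is an illustrative assumption, not code from this repo.

from transformers import AutoConfig

from open_clip.hf_configs import arch_dict

cfg = AutoConfig.from_pretrained('roberta-base')
names = arch_dict['roberta']['config_names']
width = getattr(cfg, names['width'])                    # hidden_size -> 768 for roberta-base
context_length = getattr(cfg, names['context_length'])  # max_position_embeddings
pooler = arch_dict['roberta']['pooler']                 # 'mean_pooler'
print(width, context_length, pooler)
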
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA01-g-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
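The JSON files in model_configs are resolved by name through the factory (filename minus .json), per standard open_clip behavior. A quick sketch of inspecting this config programmatically:

import open_clip

assert 'EVA01-g-14-plus' in open_clip.list_models()
cfg = open_clip.get_model_config('EVA01-g-14-plus')
print(cfg['embed_dim'], cfg['vision_cfg']['timm_model_name'], cfg['text_cfg']['layers'])
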
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA01-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA02-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_base_patch16_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA02-E-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1280,
14 | "heads": 20,
15 | "layers": 32
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA02-E-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA02-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "timm_model_name": "eva02_large_patch14_clip_336",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/EVA02-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_large_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/RN50x64.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": [
6 | 3,
7 | 15,
8 | 36,
9 | 10
10 | ],
11 | "width": 128,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 1024,
18 | "heads": 16,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 256,
7 | "timm_model_name": "vit_base_patch16_siglip_256",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 384,
7 | "timm_model_name": "vit_base_patch16_siglip_384",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 512,
7 | "timm_model_name": "vit_base_patch16_siglip_512",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 256,
7 | "timm_model_name": "vit_base_patch16_siglip_256",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 250000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 224,
7 | "timm_model_name": "vit_base_patch16_siglip_224",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 240,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-32-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-14-378-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 378,
6 | "layers": 32,
7 | "width": 1280,
8 | "head_width": 80,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14,
9 | "no_ln_pre": true,
10 | "pool_type": "avg",
11 | "final_ln_after_pool": true
12 | },
13 | "text_cfg": {
14 | "context_length": 32,
15 | "vocab_size": 32000,
16 | "hf_tokenizer_name": "bert-base-uncased",
17 | "tokenizer_kwargs": {
18 | "strip_sep_token": true
19 | },
20 | "width": 1024,
21 | "heads": 16,
22 | "layers": 24,
23 | "pool_type": "last",
24 | "no_causal_mask": true
25 | }
26 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14,
9 | "no_ln_pre": true,
10 | "pool_type": "avg",
11 | "final_ln_after_pool": true
12 | },
13 | "text_cfg": {
14 | "context_length": 32,
15 | "vocab_size": 32000,
16 | "hf_tokenizer_name": "bert-base-uncased",
17 | "tokenizer_kwargs": {
18 | "strip_sep_token": true
19 | },
20 | "width": 1024,
21 | "heads": 16,
22 | "layers": 24,
23 | "pool_type": "last",
24 | "no_causal_mask": true
25 | }
26 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-14-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 32,
7 | "width": 1280,
8 | "head_width": 80,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-H-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14-280.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 280,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14,
8 | "no_ln_pre": true,
9 | "pool_type": "avg",
10 | "final_ln_after_pool": true
11 | },
12 | "text_cfg": {
13 | "context_length": 32,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "bert-base-uncased",
16 | "tokenizer_kwargs": {
17 | "strip_sep_token": true
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "pool_type": "last",
23 | "no_causal_mask": true
24 | }
25 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14,
8 | "no_ln_pre": true,
9 | "pool_type": "avg",
10 | "final_ln_after_pool": true
11 | },
12 | "text_cfg": {
13 | "context_length": 32,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "bert-base-uncased",
16 | "tokenizer_kwargs": {
17 | "strip_sep_token": true
18 | },
19 | "width": 768,
20 | "heads": 12,
21 | "layers": 12,
22 | "pool_type": "last",
23 | "no_causal_mask": true
24 | }
25 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 24,
7 | "width": 1024,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-16-320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 320,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 256,
7 | "timm_model_name": "vit_large_patch16_siglip_256",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 1024,
20 | "heads": 16,
21 | "layers": 24,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 384,
7 | "timm_model_name": "vit_large_patch16_siglip_384",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 1024,
20 | "heads": 16,
21 | "layers": 24,
22 | "no_causal_mask": true,
23 | "proj_bias": true,
24 | "pool_type": "last",
25 | "norm_kwargs":{
26 | "eps": 1e-6
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-L-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-M-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16,
8 | "ls_init_value": 1e-4
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 384,
14 | "heads": 6,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-M-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-M-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-M-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-S-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-S-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-S-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-S-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1152,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 384,
7 | "timm_model_name": "vit_so400m_patch14_siglip_384",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 64,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 1152,
20 | "heads": 16,
21 | "layers": 27,
22 | "mlp_ratio": 3.7362,
23 | "no_causal_mask": true,
24 | "proj_bias": true,
25 | "pool_type": "last",
26 | "norm_kwargs":{
27 | "eps": 1e-6
28 | }
29 | }
30 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1152,
3 | "init_logit_bias": -10,
4 | "custom_text": true,
5 | "vision_cfg": {
6 | "image_size": 224,
7 | "timm_model_name": "vit_so400m_patch14_siglip_224",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "context_length": 16,
14 | "vocab_size": 32000,
15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 | "tokenizer_kwargs": {
17 | "clean": "canonicalize"
18 | },
19 | "width": 1152,
20 | "heads": 16,
21 | "layers": 27,
22 | "mlp_ratio": 3.7362,
23 | "no_causal_mask": true,
24 | "proj_bias": true,
25 | "pool_type": "last",
26 | "norm_kwargs":{
27 | "eps": 1e-6
28 | }
29 | }
30 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 48,
6 | "width": 1664,
7 | "head_width": 104,
8 | "mlp_ratio": 4.9231,
9 | "patch_size": 14,
10 | "no_ln_pre": true,
11 | "pool_type": "avg",
12 | "final_ln_after_pool": true
13 | },
14 | "text_cfg": {
15 | "context_length": 32,
16 | "vocab_size": 32000,
17 | "hf_tokenizer_name": "bert-base-uncased",
18 | "tokenizer_kwargs": {
19 | "strip_sep_token": true
20 | },
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "pool_type": "last",
25 | "no_causal_mask": true
26 | }
27 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 1664,
7 | "head_width": 104,
8 | "mlp_ratio": 4.9231,
9 | "patch_size": 14,
10 | "no_ln_pre": true,
11 | "pool_type": "avg",
12 | "final_ln_after_pool": true
13 | },
14 | "text_cfg": {
15 | "context_length": 32,
16 | "vocab_size": 32000,
17 | "hf_tokenizer_name": "bert-base-uncased",
18 | "tokenizer_kwargs": {
19 | "strip_sep_token": true
20 | },
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "pool_type": "last",
25 | "no_causal_mask": true
26 | }
27 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-bigG-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 1664,
7 | "head_width": 104,
8 | "mlp_ratio": 4.9231,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 32
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-e-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 56,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.5715,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 36
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/ViT-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/coca_ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 512,
25 | "heads": 8,
26 | "layers": 12,
27 | "attn_pooler_heads": 8
28 | },
29 | "custom_text": true
30 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/coca_ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 768,
25 | "heads": 12,
26 | "layers": 12,
27 | "attn_pooler_heads": 12
28 | },
29 | "custom_text": true
30 | }
31 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/coca_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "multimodal_cfg": {
4 | "width": 768,
5 | "context_length": 76,
6 | "vocab_size": 64000,
7 | "mlp_ratio": 4,
8 | "layers": 12,
9 | "dim_head": 64,
10 | "heads": 12,
11 | "n_queries": 256,
12 | "attn_pooler_heads": 8
13 | },
14 | "vision_cfg": {
15 | "image_size": 288,
16 | "layers": 12,
17 | "width": 768,
18 | "patch_size": 18,
19 | "output_tokens": true
20 | },
21 | "text_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 64000,
24 | "layers": 12,
25 | "heads": 12,
26 | "width": 768,
27 | "embed_cls": true,
28 | "output_tokens": true
29 | },
30 | "custom_text": true
31 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/coca_roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "output_tokens": true
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "hf_proj_type": "linear",
14 | "width": 768,
15 | "output_tokens": true
16 | },
17 | "multimodal_cfg": {
18 | "context_length": 76,
19 | "width": 768,
20 | "heads": 8,
21 | "layers": 12
22 | },
23 | "custom_text": true
24 | }
25 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
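
Unlike the ViT configs, the convnext_* configs hand the visual tower to timm through the timm_model_name / timm_pool / timm_proj keys, so timm must be installed; the text tower is still the built-in transformer. A minimal sketch (the output width follows embed_dim above):

    import torch
    import open_clip

    # the visual tower is created with timm.create_model("convnext_base", ...) internally
    model, _, preprocess = open_clip.create_model_and_transforms("convnext_base", pretrained=None)
    with torch.no_grad():
        feats = model.encode_image(torch.randn(1, 3, 224, 224))
    print(feats.shape)  # torch.Size([1, 512]) == embed_dim
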
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_base_w.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_base_w_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_large_d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_large_d_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_small.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_small",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_tiny",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_xlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 20
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_xxlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/convnext_xxlarge_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/mt5-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "google/mt5-base",
11 | "hf_tokenizer_name": "google/mt5-base",
12 | "hf_pooler_type": "mean_pooler"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/mt5-xl-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "google/mt5-xl",
12 | "hf_tokenizer_name": "google/mt5-xl",
13 | "hf_pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/nllb-clip-base-siglip.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "custom_text": true,
4 | "init_logit_bias": -10,
5 | "vision_cfg": {
6 | "image_size": 384,
7 | "timm_model_name": "vit_base_patch16_siglip_384",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "hf_model_name": "facebook/nllb-200-distilled-600M",
14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M",
15 | "hf_proj_type": "linear",
16 | "hf_pooler_type": "cls_pooler"
17 | }
18 | }
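
The *-siglip configs set custom_text and init_logit_bias: SigLIP-style models score image-text pairs with a pairwise sigmoid rather than a batch softmax, and the learnable bias initialized at -10 keeps the loss from being dominated by the many negative pairs early in training. A conceptual sketch of that loss (illustration only, not this repo's loss.py implementation):

    import torch
    import torch.nn.functional as F

    def sigmoid_contrastive_loss(img, txt, logit_scale, logit_bias):
        # img, txt: L2-normalized (N, D) features
        logits = logit_scale * img @ txt.T + logit_bias
        labels = 2 * torch.eye(img.shape[0]) - 1       # +1 on matched pairs, -1 elsewhere
        return -F.logsigmoid(labels * logits).sum() / img.shape[0]

    img = F.normalize(torch.randn(4, 768), dim=-1)
    txt = F.normalize(torch.randn(4, 768), dim=-1)
    print(sigmoid_contrastive_loss(img, txt, logit_scale=10.0, logit_bias=-10.0))
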
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/nllb-clip-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "facebook/nllb-200-distilled-600M",
11 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M",
12 | "hf_proj_type": "linear",
13 | "hf_pooler_type": "cls_pooler"
14 | }
15 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/nllb-clip-large-siglip.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1152,
3 | "custom_text": true,
4 | "init_logit_bias": -10,
5 | "vision_cfg": {
6 | "image_size": 384,
7 | "timm_model_name": "vit_so400m_patch14_siglip_384",
8 | "timm_model_pretrained": false,
9 | "timm_pool": "map",
10 | "timm_proj": "none"
11 | },
12 | "text_cfg": {
13 | "hf_model_name": "facebook/nllb-200-distilled-1.3B",
14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B",
15 | "hf_proj_type": "linear",
16 | "hf_pooler_type": "cls_pooler"
17 | }
18 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/nllb-clip-large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "facebook/nllb-200-distilled-1.3B",
12 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B",
13 | "hf_proj_type": "linear",
14 | "hf_pooler_type": "cls_pooler"
15 | }
16 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "hf_pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "swin_base_patch4_window7_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 640,
14 | "heads": 10,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/vit_medium_patch16_gap_256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_medium_patch16_gap_256",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 256
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "xlm-roberta-base",
11 | "hf_tokenizer_name": "xlm-roberta-base",
12 | "hf_pooler_type": "mean_pooler"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "xlm-roberta-large",
12 | "hf_tokenizer_name": "xlm-roberta-large",
13 | "hf_pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/openai.py:
--------------------------------------------------------------------------------
1 | """ OpenAI pretrained model functions
2 |
3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4 | """
5 |
6 | import os
7 | import warnings
8 | from typing import List, Optional, Union
9 |
10 | import torch
11 |
12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
14 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
15 |
16 | __all__ = ["list_openai_models", "load_openai_model"]
17 |
18 |
19 | def list_openai_models() -> List[str]:
20 | """Returns the names of available CLIP models"""
21 | return list_pretrained_models_by_tag('openai')
22 |
23 |
24 | def load_openai_model(
25 | name: str,
26 | precision: Optional[str] = None,
27 | device: Optional[Union[str, torch.device]] = None,
28 | cache_dir: Optional[str] = None,
29 | ):
30 | """Load a CLIP model
31 |
32 | Parameters
33 | ----------
34 | name : str
35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
36 | precision: str
37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38 | device : Union[str, torch.device]
39 | The device to put the loaded model
40 | cache_dir : Optional[str]
41 | The directory to cache the downloaded model weights
42 |
43 | Returns
44 | -------
45 |     model : torch.nn.Module
46 |         The CLIP model. Note that only the model is returned; the matching
47 |         preprocessing transform is not built by this function (the image_mean /
48 |         image_std attributes attached below can be used to construct one).
49 | """
50 | if device is None:
51 | device = "cuda" if torch.cuda.is_available() else "cpu"
52 | if precision is None:
53 | precision = 'fp32' if device == 'cpu' else 'fp16'
54 |
55 | if get_pretrained_url(name, 'openai'):
56 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
57 | elif os.path.isfile(name):
58 | model_path = name
59 | else:
60 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
61 |
62 | try:
63 | # loading JIT archive
64 | model = torch.jit.load(model_path, map_location="cpu").eval()
65 | state_dict = None
66 | except RuntimeError:
67 | # loading saved state dict
68 | state_dict = torch.load(model_path, map_location="cpu")
69 |
70 | # Build a non-jit model from the OpenAI jitted model state dict
71 | cast_dtype = get_cast_dtype(precision)
72 | try:
73 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
74 | except KeyError:
75 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
76 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
77 |
78 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
79 | model = model.to(device)
80 | # FIXME support pure fp16/bf16 precision modes
81 | if precision != 'fp16':
82 | model.float()
83 | if precision == 'bf16':
84 | # for bf16, convert back to low-precision
85 | convert_weights_to_lp(model, dtype=torch.bfloat16)
86 |
87 | # add mean / std attributes for consistency with OpenCLIP models
88 | model.visual.image_mean = OPENAI_DATASET_MEAN
89 | model.visual.image_std = OPENAI_DATASET_STD
90 | return model
91 |
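
A short usage sketch for the loader above (the checkpoint is downloaded on first use, so this needs network access; any name from list_openai_models works):

    import torch
    from open_clip.openai import list_openai_models, load_openai_model

    print(list_openai_models())                    # e.g. ['RN50', 'RN101', ..., 'ViT-L-14-336']
    model = load_openai_model("ViT-B-32", precision="fp32", device="cpu")
    with torch.no_grad():
        feats = model.encode_image(torch.randn(1, 3, 224, 224))
    print(feats.shape, model.visual.image_mean)    # embedding width and the mean attached above
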
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/pos_embed.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | # --------------------------------------------------------
7 | # Position embedding utils
8 | # --------------------------------------------------------
9 |
10 | import numpy as np
11 |
12 | import torch
13 |
14 | # --------------------------------------------------------
15 | # 2D sine-cosine position embedding
16 | # References:
17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
18 | # MoCo v3: https://github.com/facebookresearch/moco-v3
19 | # --------------------------------------------------------
20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
21 | """
22 | grid_size: int of the grid height and width
23 | return:
24 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
25 | """
26 | grid_h = np.arange(grid_size, dtype=np.float32)
27 | grid_w = np.arange(grid_size, dtype=np.float32)
28 | grid = np.meshgrid(grid_w, grid_h) # here w goes first
29 | grid = np.stack(grid, axis=0)
30 |
31 | grid = grid.reshape([2, 1, grid_size, grid_size])
32 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
33 | if cls_token:
34 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
35 | return pos_embed
36 |
37 |
38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
39 | assert embed_dim % 2 == 0
40 |
41 | # use half of dimensions to encode grid_h
42 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
43 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
44 |
45 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
46 | return emb
47 |
48 |
49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
50 | """
51 | embed_dim: output dimension for each position
52 | pos: a list of positions to be encoded: size (M,)
53 | out: (M, D)
54 | """
55 | assert embed_dim % 2 == 0
56 | omega = np.arange(embed_dim // 2, dtype=float)
57 | omega /= embed_dim / 2.
58 | omega = 1. / 10000**omega # (D/2,)
59 |
60 | pos = pos.reshape(-1) # (M,)
61 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
62 |
63 | emb_sin = np.sin(out) # (M, D/2)
64 | emb_cos = np.cos(out) # (M, D/2)
65 |
66 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
67 | return emb
68 |
69 |
70 | # --------------------------------------------------------
71 | # Interpolate position embeddings for high-resolution
72 | # References:
73 | # DeiT: https://github.com/facebookresearch/deit
74 | # --------------------------------------------------------
75 | def interpolate_pos_embed(model, checkpoint_model):
76 | if 'pos_embed' in checkpoint_model:
77 | pos_embed_checkpoint = checkpoint_model['pos_embed']
78 | embedding_size = pos_embed_checkpoint.shape[-1]
79 | num_patches = model.patch_embed.num_patches
80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81 | # height (== width) for the checkpoint position embedding
82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83 | # height (== width) for the new position embedding
84 | new_size = int(num_patches ** 0.5)
85 | # class_token and dist_token are kept unchanged
86 | if orig_size != new_size:
87 | print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
88 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
89 | # only the position tokens are interpolated
90 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
91 | pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
92 | pos_tokens = torch.nn.functional.interpolate(
93 | pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
94 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
95 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
96 | checkpoint_model['pos_embed'] = new_pos_embed
97 |
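
A small worked example of the helpers above: for a 7x7 patch grid and width 8, each row holds sin/cos features for one grid axis in its first half and for the other axis in its second half, and the optional cls row is all zeros.

    import numpy as np

    pe = get_2d_sincos_pos_embed(embed_dim=8, grid_size=7, cls_token=True)
    print(pe.shape)   # (50, 8): one cls slot + 7*7 grid positions
    print(pe[0])      # all zeros: the cls token carries no positional signal
    print(pe[1])      # [0. 0. 1. 1. 0. 0. 1. 1.]: sin(0), cos(0) on both axes for grid position (0, 0)
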
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/utils.py:
--------------------------------------------------------------------------------
1 | from itertools import repeat
2 | import collections.abc
3 |
4 | import torch
5 | from torch import nn as nn
6 | from torchvision.ops.misc import FrozenBatchNorm2d
7 |
8 |
9 | def freeze_batch_norm_2d(module, module_match={}, name=''):
10 | """
11 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
12 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
13 | returned. Otherwise, the module is walked recursively and submodules are converted in place.
14 |
15 | Args:
16 | module (torch.nn.Module): Any PyTorch module.
17 | module_match (dict): Dictionary of full module names to freeze (all if empty)
18 | name (str): Full module name (prefix)
19 |
20 | Returns:
21 | torch.nn.Module: Resulting module
22 |
23 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
24 | """
25 | res = module
26 | is_match = True
27 | if module_match:
28 | is_match = name in module_match
29 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
30 | res = FrozenBatchNorm2d(module.num_features)
31 | res.num_features = module.num_features
32 | res.affine = module.affine
33 | if module.affine:
34 | res.weight.data = module.weight.data.clone().detach()
35 | res.bias.data = module.bias.data.clone().detach()
36 | res.running_mean.data = module.running_mean.data
37 | res.running_var.data = module.running_var.data
38 | res.eps = module.eps
39 | else:
40 | for child_name, child in module.named_children():
41 | full_child_name = '.'.join([name, child_name]) if name else child_name
42 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
43 | if new_child is not child:
44 | res.add_module(child_name, new_child)
45 | return res
46 |
47 |
48 | # From PyTorch internals
49 | def _ntuple(n):
50 | def parse(x):
51 | if isinstance(x, collections.abc.Iterable):
52 | return x
53 | return tuple(repeat(x, n))
54 | return parse
55 |
56 |
57 | to_1tuple = _ntuple(1)
58 | to_2tuple = _ntuple(2)
59 | to_3tuple = _ntuple(3)
60 | to_4tuple = _ntuple(4)
61 | to_ntuple = lambda n, x: _ntuple(n)(x)
62 |
63 | # Replaces all linear layers with linear_replacement
64 | # TODO: add int8 support for other linear layers including attn and convnets
65 | def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True):
66 | for name, module in model.named_children():
67 | if len(list(module.children())) > 0:
68 | replace_linear(module, linear_replacement, include_modules, copy_weights)
69 |
70 | if isinstance(module, torch.nn.Linear) and name in include_modules:
71 | old_module = model._modules[name]
72 | model._modules[name] = linear_replacement(
73 | module.in_features,
74 | module.out_features,
75 | module.bias is not None,
76 | )
77 | if copy_weights:
78 | model._modules[name].weight.data.copy_(old_module.weight.data)
79 | if model._modules[name].bias is not None:
80 | model._modules[name].bias.data.copy_(old_module.bias)
81 |
82 | return model
83 |
84 | def convert_int8_model_to_inference_mode(model):
85 | for m in model.modules():
86 | if hasattr(m, 'prepare_for_eval'):
87 | int8_original_dtype = m.weight.dtype
88 | m.prepare_for_eval()
89 | m.int8_original_dtype = int8_original_dtype
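
Two quick illustrations of the helpers above, assuming they are imported from this module: the _ntuple wrappers broadcast scalars to fixed-length tuples, and freeze_batch_norm_2d swaps BatchNorm / SyncBatchNorm submodules for frozen copies in place.

    import torch
    from torch import nn

    print(to_2tuple(7), to_2tuple((3, 5)))   # (7, 7) (3, 5)

    net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    net = freeze_batch_norm_2d(net)
    print(type(net[1]).__name__)             # FrozenBatchNorm2d, statistics and affine params copied over
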
--------------------------------------------------------------------------------
/open_clip_training/src/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.22.0'
2 |
--------------------------------------------------------------------------------
/open_clip_training/src/scripts/1cap_finetune_VitL.sh:
--------------------------------------------------------------------------------
1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \
2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \
3 | --train-num-samples 442117 \
4 | --lr 0.000005 \
5 | --warmup 100 \
6 | --force-quick-gelu \
7 | --dataset-type csv \
8 | --batch-size 32 \
9 | --precision amp \
10 | --workers 8 \
11 | --model ViT-L-14 \
12 | --lock-text \
13 | --zeroshot-frequency 1 \
14 | --save-frequency 1 \
15 | --epochs 10 \
16 | --pretrained datacomp_xl_s13b_b90k \
17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val
--------------------------------------------------------------------------------
/open_clip_training/src/scripts/finetune_VitL_with_mask.sh:
--------------------------------------------------------------------------------
1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \
2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \
3 | --train-num-samples 442117 \
4 | --lr 0.000005 \
5 | --warmup 100 \
6 | --force-quick-gelu \
7 | --dataset-type csv \
8 | --batch-size 32 \
9 | --precision amp \
10 | --workers 8 \
11 | --model ViT-L-14 \
12 | --lock-text \
13 | --zeroshot-frequency 1 \
14 | --save-frequency 1 \
15 | --epochs 10 \
16 | --pretrained datacomp_xl_s13b_b90k \
17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val \
18 | --with-mask
--------------------------------------------------------------------------------
/open_clip_training/src/training/.gitignore:
--------------------------------------------------------------------------------
1 | logs/
2 |
--------------------------------------------------------------------------------
/open_clip_training/src/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/training/__init__.py
--------------------------------------------------------------------------------
/open_clip_training/src/training/file_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import multiprocessing
4 | import subprocess
5 | import time
6 | import fsspec
7 | import torch
8 | from tqdm import tqdm
9 |
10 | def remote_sync_s3(local_dir, remote_dir):
11 | # skip epoch_latest which can change during sync.
12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
13 | if result.returncode != 0:
14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}")
15 | return False
16 |
17 | logging.info(f"Successfully synced with S3 bucket")
18 | return True
19 |
20 | def remote_sync_fsspec(local_dir, remote_dir):
21 | # FIXME currently this is slow and not recommended. Look into speeding up.
22 | a = fsspec.get_mapper(local_dir)
23 | b = fsspec.get_mapper(remote_dir)
24 |
25 | for k in a:
26 | # skip epoch_latest which can change during sync.
27 | if 'epoch_latest.pt' in k:
28 | continue
29 |
30 | logging.info(f'Attempting to sync {k}')
31 | if k in b and len(a[k]) == len(b[k]):
32 | logging.debug(f'Skipping remote sync for {k}.')
33 | continue
34 |
35 | try:
36 |             b[k] = a[k]
37 |             logging.info(f'Successful sync for {k}.')
38 | except Exception as e:
39 | logging.info(f'Error during remote sync for {k}: {e}')
40 | return False
41 |
42 | return True
43 |
44 | def remote_sync(local_dir, remote_dir, protocol):
45 | logging.info('Starting remote sync.')
46 | if protocol == 's3':
47 | return remote_sync_s3(local_dir, remote_dir)
48 | elif protocol == 'fsspec':
49 | return remote_sync_fsspec(local_dir, remote_dir)
50 | else:
51 | logging.error('Remote protocol not known')
52 | return False
53 |
54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol):
55 | while True:
56 | time.sleep(sync_every)
57 | remote_sync(local_dir, remote_dir, protocol)
58 |
59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol):
60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol))
61 | return p
62 |
63 | # Note: we are not currently using this save function.
64 | def pt_save(pt_obj, file_path):
65 | of = fsspec.open(file_path, "wb")
66 | with of as f:
67 |         torch.save(pt_obj, f)  # write through the fsspec handle so remote paths work
68 |
69 | def pt_load(file_path, map_location=None):
70 | if file_path.startswith('s3'):
71 | logging.info('Loading remote checkpoint, which may take a bit.')
72 | of = fsspec.open(file_path, "rb")
73 | with of as f:
74 | out = torch.load(f, map_location=map_location)
75 | return out
76 |
77 | def check_exists(file_path):
78 | try:
79 | with fsspec.open(file_path):
80 | pass
81 | except FileNotFoundError:
82 | return False
83 | return True
84 |
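
A sketch of how these sync helpers are typically wired together during training: one initial sync, then a background process that mirrors the checkpoint directory every few minutes. The paths below are placeholders, and the s3 protocol assumes a configured aws CLI.

    local_dir = "./logs/run1/checkpoints"            # placeholder paths
    remote_dir = "s3://my-bucket/run1/checkpoints"

    remote_sync(local_dir, remote_dir, protocol="s3")
    p = start_sync_process(300, local_dir, remote_dir, protocol="s3")
    p.start()

    # checkpoints can be read back through fsspec, local or remote
    ckpt = pt_load(remote_dir + "/epoch_1.pt", map_location="cpu")
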
--------------------------------------------------------------------------------
/open_clip_training/src/training/precision.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from contextlib import suppress
3 |
4 |
5 | def get_autocast(precision):
6 | if precision == 'amp':
7 | return torch.cuda.amp.autocast
8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16':
9 | # amp_bfloat16 is more stable than amp float16 for clip training
10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16)
11 | else:
12 | return suppress
13 |
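
get_autocast returns a context-manager factory, so training and evaluation code can stay precision-agnostic; a small sketch:

    import torch

    autocast = get_autocast("amp_bf16")    # "amp", "amp_bf16"/"amp_bfloat16", anything else -> no-op
    device = "cuda" if torch.cuda.is_available() else "cpu"
    a, b = torch.randn(4, 8, device=device), torch.randn(8, 2, device=device)
    with autocast():
        out = a @ b
    print(out.dtype)   # torch.bfloat16 on CUDA under amp_bf16, torch.float32 otherwise
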
--------------------------------------------------------------------------------
/open_clip_training/src/training/scheduler.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def assign_learning_rate(optimizer, new_lr):
5 | for param_group in optimizer.param_groups:
6 | param_group["lr"] = new_lr
7 |
8 |
9 | def _warmup_lr(base_lr, warmup_length, step):
10 | return base_lr * (step + 1) / warmup_length
11 |
12 |
13 | def const_lr(optimizer, base_lr, warmup_length, steps):
14 | def _lr_adjuster(step):
15 | if step < warmup_length:
16 | lr = _warmup_lr(base_lr, warmup_length, step)
17 | else:
18 | lr = base_lr
19 | assign_learning_rate(optimizer, lr)
20 | return lr
21 | return _lr_adjuster
22 |
23 |
24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.):
25 | def _lr_adjuster(step):
26 | start_cooldown_step = steps - cooldown_steps
27 | if step < warmup_length:
28 | lr = _warmup_lr(base_lr, warmup_length, step)
29 | else:
30 | if step < start_cooldown_step:
31 | lr = base_lr
32 | else:
33 | e = step - start_cooldown_step
34 | es = steps - start_cooldown_step
35 | # linear decay if power == 1; polynomial decay otherwise;
36 | decay = (1 - (e/es)) ** cooldown_power
37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr
38 | assign_learning_rate(optimizer, lr)
39 | return lr
40 | return _lr_adjuster
41 |
42 |
43 | def cosine_lr(optimizer, base_lr, warmup_length, steps):
44 | def _lr_adjuster(step):
45 | if step < warmup_length:
46 | lr = _warmup_lr(base_lr, warmup_length, step)
47 | else:
48 | e = step - warmup_length
49 | es = steps - warmup_length
50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
51 | assign_learning_rate(optimizer, lr)
52 | return lr
53 | return _lr_adjuster
54 |
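
A worked example of cosine_lr above: with warmup_length=10 and steps=100, the rate ramps linearly to base_lr over the first 10 steps, then follows half a cosine down to (nearly) zero.

    import torch

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.0)
    adjust = cosine_lr(opt, base_lr=1e-3, warmup_length=10, steps=100)
    for step in (0, 5, 10, 55, 99):
        print(step, f"{adjust(step):.2e}")
    # 0  -> 1.00e-04   warmup: base_lr * (0 + 1) / 10
    # 5  -> 6.00e-04
    # 10 -> 1.00e-03   cosine phase starts at cos(0)
    # 55 -> 5.00e-04   halfway through the cosine (e/es = 45/90)
    # 99 -> 3.05e-07   cos(pi * 89/90) is just above -1
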
--------------------------------------------------------------------------------
/open_clip_training/src/training/zero_shot.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import torch
4 | from tqdm import tqdm
5 |
6 | from open_clip import get_input_dtype, get_tokenizer, build_zero_shot_classifier, \
7 | IMAGENET_CLASSNAMES, OPENAI_IMAGENET_TEMPLATES
8 | from .precision import get_autocast
9 | from .ade150_zeroshot_data import ade150_classnames
10 | from torchmetrics import Accuracy
11 |
12 | def accuracy(output, target, topk=(1,)):
13 | pred = output.topk(max(topk), 1, True, True)[1].t()
14 | correct = pred.eq(target.view(1, -1).expand_as(pred))
15 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk], pred[0]
16 |
17 |
18 | def run(model, classifier, dataloader, args):
19 | autocast = get_autocast(args.precision)
20 | input_dtype = get_input_dtype(args.precision)
21 |
22 | with torch.no_grad():
23 | top1, top5, n = 0., 0., 0.
24 | preds = []
25 | targets = []
26 | macc = Accuracy('multiclass', num_classes=150, average='macro').cuda()
27 | for images, target, entire_images in tqdm(dataloader, unit_scale=args.batch_size):
28 | if args.with_mask:
29 | images, masks = images
30 | masks = masks.to(device=args.device, dtype=input_dtype)
31 | else:
32 | images = images
33 | masks=None
34 | images = images.to(device=args.device, dtype=input_dtype)
35 | target = target.to(args.device)
36 | entire_images = entire_images.to(device=args.device, dtype=input_dtype)
37 |
38 | with autocast():
39 | # predict
40 | output = model(original_image=entire_images, image=images, mask=masks, text=None)
41 | image_features = output['image_features'] if isinstance(output, dict) else output[0]
42 | logits = 100. * image_features @ classifier
43 |
44 | # measure accuracy
45 | (acc1, acc5), pred = accuracy(logits, target, topk=(1, 5))
46 | preds.append(pred)
47 | targets.append(target)
48 | top1 += acc1
49 | top5 += acc5
50 | n += images.size(0)
51 | preds = torch.cat(preds)
52 | targets = torch.cat(targets)
53 | top1 = (top1 / n)
54 | top5 = (top5 / n)
55 | return top1, top5, macc(preds, targets).item()
56 |
57 |
58 | def zero_shot_eval(model, data, epoch, args, tokenizer=None):
59 | if 'imagenet-val' not in data and 'imagenet-v2' not in data and 'ade-val' not in data:
60 | return {}
61 | if args.zeroshot_frequency == 0:
62 | return {}
63 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs:
64 | return {}
65 | if args.distributed and not args.horovod:
66 | model = model.module
67 |
68 |     logging.info('Starting zero-shot evaluation.')
69 | if tokenizer is None:
70 | tokenizer = get_tokenizer(args.model)
71 |
72 | logging.info('Building zero-shot classifier')
73 | autocast = get_autocast(args.precision)
74 | with autocast():
75 | if 'ade-val' in data:
76 | classifier = build_zero_shot_classifier(
77 | model,
78 | tokenizer=tokenizer,
79 | classnames=ade150_classnames,
80 | templates=OPENAI_IMAGENET_TEMPLATES,
81 | num_classes_per_batch=10,
82 | device=args.device,
83 | use_tqdm=True,
84 | )
85 | else:
86 | classifier = build_zero_shot_classifier(
87 | model,
88 | tokenizer=tokenizer,
89 | classnames=IMAGENET_CLASSNAMES,
90 | templates=OPENAI_IMAGENET_TEMPLATES,
91 | num_classes_per_batch=10,
92 | device=args.device,
93 | use_tqdm=True,
94 | )
95 |
96 | logging.info('Using classifier')
97 | results = {}
98 | if 'imagenet-val' in data:
99 |         top1, top5, _ = run(model, classifier, data['imagenet-val'].dataloader, args)
100 | results['imagenet-zeroshot-val-top1'] = top1
101 | results['imagenet-zeroshot-val-top5'] = top5
102 | if 'imagenet-v2' in data:
103 |         top1, top5, _ = run(model, classifier, data['imagenet-v2'].dataloader, args)
104 | results['imagenetv2-zeroshot-val-top1'] = top1
105 | results['imagenetv2-zeroshot-val-top5'] = top5
106 | if 'ade-val' in data:
107 | top1, top5, macc = run(model, classifier, data['ade-val'].dataloader, args)
108 | results['ade150-zeroshot-val-top1'] = top1
109 | results['ade150-zeroshot-val-top5'] = top5
110 |
111 |     logging.info('Finished zero-shot evaluation.')
112 |
113 | return results
114 |
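
A small worked example of the accuracy helper above: it returns correct-prediction counts (not rates) for each k, plus the top-1 predictions that feed the macro accuracy.

    import torch

    logits = torch.tensor([[0.1, 0.7, 0.2],
                           [0.6, 0.3, 0.1]])
    target = torch.tensor([1, 2])
    (top1, top2), pred = accuracy(logits, target, topk=(1, 2))
    print(top1, top2, pred)   # 1.0 1.0 tensor([1, 0]): only the first sample is correct, even within top-2
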
--------------------------------------------------------------------------------
/open_clip_training/tests/test_hf_model.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import torch
4 | from open_clip.hf_model import _POOLERS, HFTextEncoder
5 | from transformers import AutoConfig
6 | from transformers.modeling_outputs import BaseModelOutput
7 | # test poolers
8 | def test_poolers():
9 | bs, sl, d = 2, 10, 5
10 | h = torch.arange(sl).repeat(bs).reshape(bs, sl)[..., None] * torch.linspace(0.2, 1., d)
11 | mask = torch.ones(bs, sl, dtype=torch.bool)
12 | mask[:2, 6:] = False
13 | x = BaseModelOutput(h)
14 | for name, cls in _POOLERS.items():
15 | pooler = cls()
16 | res = pooler(x, mask)
17 | assert res.shape == (bs, d), f"{name} returned wrong shape"
18 |
19 | # test HFTextEncoder
20 | @pytest.mark.parametrize("model_id", ["arampacha/roberta-tiny", "roberta-base", "xlm-roberta-base", "google/mt5-base"])
21 | def test_pretrained_text_encoder(model_id):
22 | bs, sl, d = 2, 10, 64
23 | cfg = AutoConfig.from_pretrained(model_id)
24 | model = HFTextEncoder(model_id, d, proj_type='linear')
25 | x = torch.randint(0, cfg.vocab_size, (bs, sl))
26 | with torch.no_grad():
27 | emb = model(x)
28 |
29 | assert emb.shape == (bs, d)
30 |
--------------------------------------------------------------------------------
/open_clip_training/tests/test_inference_simple.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | from open_clip.factory import get_tokenizer
4 | import pytest
5 | import open_clip
6 | import os
7 | os.environ["CUDA_VISIBLE_DEVICES"] = ""
8 |
9 | if hasattr(torch._C, '_jit_set_profiling_executor'):
10 | # legacy executor is too slow to compile large models for unit tests
11 | # no need for the fusion performance here
12 | torch._C._jit_set_profiling_executor(True)
13 | torch._C._jit_set_profiling_mode(False)
14 |
15 |
16 | test_simple_models = [
17 | # model, pretrained, jit, force_custom_text
18 | ("ViT-B-32", "laion2b_s34b_b79k", False, False),
19 | ("ViT-B-32", "laion2b_s34b_b79k", True, False),
20 | ("ViT-B-32", "laion2b_s34b_b79k", True, True),
21 | ("roberta-ViT-B-32", "laion2b_s12b_b32k", False, False),
22 | ]
23 |
24 |
25 | @pytest.mark.parametrize("model_type,pretrained,jit,force_custom_text", test_simple_models)
26 | def test_inference_simple(
27 | model_type,
28 | pretrained,
29 | jit,
30 | force_custom_text,
31 | ):
32 | model, _, preprocess = open_clip.create_model_and_transforms(
33 | model_type,
34 | pretrained=pretrained,
35 | jit=jit,
36 | force_custom_text=force_custom_text,
37 | )
38 | tokenizer = get_tokenizer(model_type)
39 |
40 | current_dir = os.path.dirname(os.path.realpath(__file__))
41 |
42 | image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0)
43 | text = tokenizer(["a diagram", "a dog", "a cat"])
44 |
45 | with torch.no_grad():
46 | image_features = model.encode_image(image)
47 | text_features = model.encode_text(text)
48 |
49 | text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
50 |
51 | assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0]
52 |
--------------------------------------------------------------------------------
/open_clip_training/tests/test_num_shards.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from training.data import get_dataset_size
4 |
5 | @pytest.mark.parametrize(
6 | "shards,expected_size",
7 | [
8 | ('/path/to/shard.tar', 1),
9 | ('/path/to/shard_{000..000}.tar', 1),
10 | ('/path/to/shard_{000..009}.tar', 10),
11 | ('/path/to/shard_{000..009}_{000..009}.tar', 100),
12 | ('/path/to/shard.tar::/path/to/other_shard_{000..009}.tar', 11),
13 | ('/path/to/shard_{000..009}.tar::/path/to/other_shard_{000..009}.tar', 20),
14 | (['/path/to/shard.tar'], 1),
15 | (['/path/to/shard.tar', '/path/to/other_shard.tar'], 2),
16 | ]
17 | )
18 | def test_num_shards(shards, expected_size):
19 | _, size = get_dataset_size(shards)
20 | assert size == expected_size, f'Expected {expected_size} for {shards} but found {size} instead.'
21 |
--------------------------------------------------------------------------------
/open_clip_training/tests/test_training_simple.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import sys
4 | import pytest
5 | from PIL import Image
6 | import torch
7 | from training.main import main
8 |
9 | os.environ["CUDA_VISIBLE_DEVICES"] = ""
10 |
11 | if hasattr(torch._C, '_jit_set_profiling_executor'):
12 | # legacy executor is too slow to compile large models for unit tests
13 | # no need for the fusion performance here
14 | torch._C._jit_set_profiling_executor(True)
15 | torch._C._jit_set_profiling_mode(False)
16 |
17 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals")
18 | def test_training():
19 | main([
20 | '--save-frequency', '1',
21 | '--zeroshot-frequency', '1',
22 | '--dataset-type', "synthetic",
23 | '--train-num-samples', '16',
24 | '--warmup', '1',
25 | '--batch-size', '4',
26 | '--lr', '1e-3',
27 | '--wd', '0.1',
28 | '--epochs', '1',
29 | '--workers', '2',
30 | '--model', 'RN50'
31 | ])
32 |
33 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals")
34 | def test_training_coca():
35 | main([
36 | '--save-frequency', '1',
37 | '--zeroshot-frequency', '1',
38 | '--dataset-type', "synthetic",
39 | '--train-num-samples', '16',
40 | '--warmup', '1',
41 | '--batch-size', '4',
42 | '--lr', '1e-3',
43 | '--wd', '0.1',
44 | '--epochs', '1',
45 | '--workers', '2',
46 | '--model', 'coca_ViT-B-32'
47 | ])
48 |
49 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals")
50 | def test_training_mt5():
51 | main([
52 | '--save-frequency', '1',
53 | '--zeroshot-frequency', '1',
54 | '--dataset-type', "synthetic",
55 | '--train-num-samples', '16',
56 | '--warmup', '1',
57 | '--batch-size', '4',
58 | '--lr', '1e-3',
59 | '--wd', '0.1',
60 | '--epochs', '1',
61 | '--workers', '2',
62 | '--model', 'mt5-base-ViT-B-32',
63 | '--lock-text',
64 | '--lock-text-unlocked-layers', '2'
65 | ])
66 |
67 |
68 |
69 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals")
70 | def test_training_unfreezing_vit():
71 | main([
72 | '--save-frequency', '1',
73 | '--zeroshot-frequency', '1',
74 | '--dataset-type', "synthetic",
75 | '--train-num-samples', '16',
76 | '--warmup', '1',
77 | '--batch-size', '4',
78 | '--lr', '1e-3',
79 | '--wd', '0.1',
80 | '--epochs', '1',
81 | '--workers', '2',
82 | '--model', 'ViT-B-32',
83 | '--lock-image',
84 | '--lock-image-unlocked-groups', '5',
85 | '--accum-freq', '2'
86 | ])
87 |
88 |
89 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals")
90 | def test_training_clip_with_jit():
91 | main([
92 | '--save-frequency', '1',
93 | '--zeroshot-frequency', '1',
94 | '--dataset-type', "synthetic",
95 | '--train-num-samples', '16',
96 | '--warmup', '1',
97 | '--batch-size', '4',
98 | '--lr', '1e-3',
99 | '--wd', '0.1',
100 | '--epochs', '1',
101 | '--workers', '2',
102 | '--model', 'ViT-B-32',
103 | '--torchscript'
104 | ])
105 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | shapely
4 | timm
5 | h5py
6 | wandb
7 | fire
8 | opencv-python
9 | pandas
10 | braceexpand
11 | torch-ema
12 | torchmetrics==0.11.4
13 | setuptools==59.5.0
14 | webdataset>=0.2.5
15 | numpy==1.23.0
--------------------------------------------------------------------------------
/scan/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import data # register all new datasets
3 | from . import modeling
4 |
5 | # config
6 | # from .config import add_maskformer2_config
7 | from .config import add_ovseg_config
8 |
9 | # dataset loading
10 | # from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
11 | # from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
12 | # from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
13 | # MaskFormerInstanceDatasetMapper,
14 | # )
15 | # from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
16 | # MaskFormerPanopticDatasetMapper,
17 | # )
18 | # from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
19 | # MaskFormerSemanticDatasetMapper,
20 | # )
21 |
22 | # models
23 | # from .maskformer_model import MaskFormer
24 | from .test_time_augmentation import SemanticSegmentorWithTTA
25 |
26 | # evaluation
27 | # from .evaluation.instance_evaluation import InstanceSegEvaluator
28 | from .ovseg_model import SCAN, SCANDEMO
--------------------------------------------------------------------------------
/scan/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | from .dataset_mappers import *
5 | from . import datasets
6 | from .build import (
7 | build_detection_train_loader,
8 | build_detection_test_loader,
9 | )
10 |
--------------------------------------------------------------------------------
/scan/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
5 |
--------------------------------------------------------------------------------
/scan/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import register_coco_stuff, register_voc_seg
3 | from . import register_cc3m
4 | from . import register_ade20k_full
5 | from . import register_pascal_context
--------------------------------------------------------------------------------
/scan/data/datasets/register_voc_seg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 |
7 | PASCALVOC20_NAMES = (
8 | "aeroplane",
9 | "bicycle",
10 | "bird",
11 | "boat",
12 | "bottle",
13 | "bus",
14 | "car",
15 | "cat",
16 | "chair",
17 | "cow",
18 | "diningtable",
19 | "dog",
20 | "horse",
21 | "motorbike",
22 | "person",
23 | "pottedplant",
24 | "sheep",
25 | "sofa",
26 | "train",
27 | "tvmonitor",
28 | )
29 |
30 | def _get_voc_meta(cat_list):
31 | ret = {
32 | "stuff_classes": cat_list,
33 | }
34 | return ret
35 |
36 |
37 | def register_pascalvoc(root):
38 | root = os.path.join(root, "VOCdevkit/VOC2012")
39 | meta = _get_voc_meta(PASCALVOC20_NAMES)
40 |
41 | for name, image_dirname, sem_seg_dirname in [
42 | ("val", "JPEGImages", "annotations_detectron2/val"),
43 | ]:
44 | image_dir = os.path.join(root, image_dirname)
45 | gt_dir = os.path.join(root, sem_seg_dirname)
46 | all_name = f"pascalvoc20_sem_seg_{name}"
47 | DatasetCatalog.register(
48 | all_name,
49 | lambda x=image_dir, y=gt_dir: load_sem_seg(
50 | y, x, gt_ext="png", image_ext="jpg"
51 | ),
52 | )
53 | MetadataCatalog.get(all_name).set(
54 | image_root=image_dir,
55 | sem_seg_root=gt_dir,
56 | evaluator_type="sem_seg",
57 | ignore_label=255,
58 | **meta,
59 | )
60 |
61 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
62 | register_pascalvoc(_root)
63 |
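
Once this module is imported (scan.data.datasets does that on package import), the split is reachable through detectron2's catalogs; a sketch, assuming DETECTRON2_DATASETS points at a VOC2012 layout prepared as described in DATASETS.md:

    from detectron2.data import DatasetCatalog, MetadataCatalog

    records = DatasetCatalog.get("pascalvoc20_sem_seg_val")      # scans image/annotation dirs on disk
    meta = MetadataCatalog.get("pascalvoc20_sem_seg_val")
    print(len(records))                                          # number of (image, sem_seg) pairs found
    print(meta.stuff_classes[:3], meta.ignore_label)             # ('aeroplane', 'bicycle', 'bird') 255
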
--------------------------------------------------------------------------------
/scan/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | from .generalized_sem_seg_evaluation import GeneralizedSemSegEvaluator, SGIoU_SemSegEvaluator
5 |
--------------------------------------------------------------------------------
/scan/frequency.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class LFM(nn.Module):
6 | def __init__(self, num_channels):
7 | super(LFM, self).__init__()
8 | self.conv1 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0)
9 | self.conv2 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0)
10 |
11 | def make_gaussian(self, y_idx, x_idx, height, width, sigma=7):
12 | yv, xv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)])
13 |
14 | yv = yv.unsqueeze(0).float().cuda()
15 | xv = xv.unsqueeze(0).float().cuda()
16 |
17 |
18 | g = torch.exp(- ((yv - y_idx) ** 2 + (xv - x_idx) ** 2) / (2 * sigma ** 2))
19 |
20 | return g.unsqueeze(0) #1, 1, H, W
21 |
22 |
23 | def forward(self, x, sigma):
24 | b, c, h, w = x.shape
25 | x = x.float()
26 | y = torch.fft.fft2(x)
27 |
28 |
29 | h_idx, w_idx = h // 2, w // 2
30 | high_filter = self.make_gaussian(h_idx, w_idx, h, w, sigma=sigma)
31 | y = y * (1 - high_filter)
32 |
33 | y_imag = y.imag
34 | y_real = y.real
35 | y_f = torch.cat([y_real, y_imag], dim=1)
36 | y = F.relu(self.conv1(y_f))
37 |
38 | y = self.conv2(y).float()
39 | y_real, y_imag = torch.chunk(y, 2, dim=1)
40 | y = torch.complex(y_real, y_imag)
41 |
42 | y = torch.fft.ifft2(y, s=(h, w)).float()
43 | return x + y
44 |
45 | class MLP(nn.Module):
46 | def __init__(self, input_dim, output_dim):
47 | super(MLP, self).__init__()
48 | self.fc1 = nn.Linear(input_dim, output_dim)
49 | self.fc2 = nn.Linear(output_dim, output_dim)
50 |
51 | def forward(self, x):
52 | x = self.fc2(self.fc1(x))
53 | return x
54 |
55 |
56 | class CA(nn.Module):
57 | def __init__(self, input_dim, num):
58 | super(CA, self).__init__()
59 | self.num = num
60 | self.multiattn = nn.ModuleList()
61 | self.ln = nn.ModuleList()
62 | for i in range(num):
63 | self.multiattn.append(nn.MultiheadAttention(embed_dim=input_dim, num_heads=8, batch_first=True))
64 | if i != num - 1:
65 | self.ln.append(nn.LayerNorm(input_dim))
66 |
67 | def forward(self, tgt, memory):
68 | for i in range(self.num):
69 | tgt = tgt + self.multiattn[i](tgt, memory, memory)[0]
70 | if i != self.num - 1:
71 | tgt = self.ln[i](tgt)
72 | return tgt
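
A shape-level sketch of the modules above: LFM masks part of the frequency spectrum of a feature map with a Gaussian window, mixes the real/imaginary parts with 1x1 convolutions, and adds the inverse-transformed result back to the input; CA is a small stack of residual cross-attention layers. make_gaussian allocates on .cuda(), so the LFM part assumes a GPU, and all shapes here are illustrative.

    import torch

    if torch.cuda.is_available():
        lfm = LFM(num_channels=256).cuda()
        feat = torch.randn(2, 256, 32, 32).cuda()
        print(lfm(feat, sigma=7).shape)      # torch.Size([2, 256, 32, 32]); shape is preserved

    ca = CA(input_dim=512, num=3)
    tgt = torch.randn(2, 100, 512)           # e.g. query tokens
    memory = torch.randn(2, 196, 512)        # e.g. flattened image features
    print(ca(tgt, memory).shape)             # torch.Size([2, 100, 512])
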
--------------------------------------------------------------------------------
/scan/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer
3 | from .pixel_decoder.fpn import BasePixelDecoder
4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
5 | from .meta_arch.mask_former_head import OpenVocaMask2FormerHead
6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
7 |
--------------------------------------------------------------------------------
/scan/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
--------------------------------------------------------------------------------
/scan/modeling/clip_adapter/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | from .text_template import (
5 | PredefinedPromptExtractor,
6 | ImageNetPromptExtractor,
7 | VILDPromptExtractor,
8 | )
9 | from .adapter import ClipAdapter, MaskFormerClipAdapter
10 |
11 |
12 | def build_text_prompt(cfg):
13 | if cfg.TEXT_TEMPLATES == "predefined":
14 | text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES)
15 | elif cfg.TEXT_TEMPLATES == "imagenet":
16 | text_templates = ImageNetPromptExtractor()
17 | elif cfg.TEXT_TEMPLATES == "vild":
18 | text_templates = VILDPromptExtractor()
19 | else:
20 | raise NotImplementedError(
21 | "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES)
22 | )
23 | return text_templates
24 |
--------------------------------------------------------------------------------
/scan/modeling/clip_adapter/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | from typing import Tuple
5 | import numpy as np
6 | import torch
7 | from detectron2.utils.comm import get_local_rank, synchronize
8 |
9 |
10 | def expand_box(
11 | x1: float,
12 | y1: float,
13 | x2: float,
14 | y2: float,
15 | expand_ratio: float = 1.0,
16 | max_h: int = None,
17 | max_w: int = None,
18 | ):
19 | cx = 0.5 * (x1 + x2)
20 | cy = 0.5 * (y1 + y2)
21 | w = x2 - x1
22 | h = y2 - y1
23 | w = w * expand_ratio
24 | h = h * expand_ratio
25 | box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]
26 | if max_h is not None:
27 | box[1] = max(0, box[1])
28 | box[3] = min(max_h - 1, box[3])
29 | if max_w is not None:
30 | box[0] = max(0, box[0])
31 | box[2] = min(max_w - 1, box[2])
32 | return [int(b) for b in box]
33 |
34 |
35 | def mask2box(mask: torch.Tensor):
36 | # use naive way
37 | row = torch.nonzero(mask.sum(dim=0))[:, 0]
38 | if len(row) == 0:
39 | return None
40 | x1 = row.min()
41 | x2 = row.max()
42 |     col = torch.nonzero(mask.sum(dim=1))[:, 0]  # keep torch ops, matching the row computation above
43 | y1 = col.min()
44 | y2 = col.max()
45 | return x1, y1, x2 + 1, y2 + 1
46 |
47 |
48 | def crop_with_mask(
49 | image: torch.Tensor,
50 | mask: torch.Tensor,
51 | bbox: torch.Tensor,
52 | fill: Tuple[float, float, float] = (0, 0, 0),
53 | expand_ratio: float = 1.0,
54 | ):
55 | l, t, r, b = expand_box(*bbox, expand_ratio)
56 | _, h, w = image.shape
57 | l = max(l, 0)
58 | t = max(t, 0)
59 | r = min(r, w)
60 | b = min(b, h)
61 | new_image = torch.cat(
62 | [image.new_full((1, b - t, r - l), fill_value=val) for val in fill]
63 | )
64 | # return image[:, t:b, l:r], mask[None, t:b, l:r]
65 | return image[:, t:b, l:r] * mask[None, t:b, l:r] + (1 - mask[None, t:b, l:r]) * new_image, mask[None, t:b, l:r]
--------------------------------------------------------------------------------
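A small, self-contained sketch of how `mask2box` and `crop_with_mask` fit together, using a toy 8x8 mask (assuming the repository root is importable). The printed shapes follow directly from the code above.

    import torch
    from scan.modeling.clip_adapter.utils import mask2box, crop_with_mask

    # 8x8 binary mask with a 3x4 foreground block (rows 2-4, columns 1-4).
    mask = torch.zeros(8, 8)
    mask[2:5, 1:5] = 1.0

    bbox = mask2box(mask)                 # (x1, y1, x2, y2) = (1, 2, 5, 5), right/bottom exclusive
    image = torch.rand(3, 8, 8)
    crop, crop_mask = crop_with_mask(image, mask, bbox, fill=(0.0, 0.0, 0.0))
    print(crop.shape, crop_mask.shape)    # torch.Size([3, 3, 4]) torch.Size([1, 3, 4])
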
/scan/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 |
14 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | try:
22 | import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 | info_string = (
25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd scan/modeling/pixel_decoder/ops`\n"
27 | "\t`sh make.sh`\n"
28 | )
29 |     raise ModuleNotFoundError(info_string) from e
30 |
31 |
32 | class MSDeformAttnFunction(Function):
33 | @staticmethod
34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 | ctx.im2col_step = im2col_step
36 | output = MSDA.ms_deform_attn_forward(
37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 | return output
40 |
41 | @staticmethod
42 | @once_differentiable
43 | def backward(ctx, grad_output):
44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 | grad_value, grad_sampling_loc, grad_attn_weight = \
46 | MSDA.ms_deform_attn_backward(
47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 |
49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 |
51 |
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 | # for debug and test only,
54 | # need to use cuda version instead
55 | N_, S_, M_, D_ = value.shape
56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
58 | sampling_grids = 2 * sampling_locations - 1
59 | sampling_value_list = []
60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
65 | # N_*M_, D_, Lq_, P_
66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
67 | mode='bilinear', padding_mode='zeros', align_corners=False)
68 | sampling_value_list.append(sampling_value_l_)
69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
72 | return output.transpose(1, 2).contiguous()
73 |
--------------------------------------------------------------------------------
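A toy call to the pure-PyTorch reference `ms_deform_attn_core_pytorch` (debug path only; the CUDA op is the one used in training). The shapes below are illustrative, and importing this module requires the compiled MultiScaleDeformableAttention extension because of the try/except at the top of the file.

    import torch
    from scan.modeling.pixel_decoder.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

    N, M, D = 1, 2, 4                      # batch, attention heads, channels per head
    Lq, L, P = 3, 2, 4                     # queries, feature levels, sampling points per level
    spatial_shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    S = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # 6*4 + 3*2 = 30 flattened positions

    value = torch.rand(N, S, M, D)
    sampling_locations = torch.rand(N, Lq, M, L, P, 2)             # normalized to [0, 1]
    attention_weights = torch.softmax(torch.rand(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)

    out = ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
    print(out.shape)                       # torch.Size([1, 3, 8]) -> (N, Lq, M*D)
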
/scan/modeling/pixel_decoder/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 |
10 | # Copyright (c) Facebook, Inc. and its affiliates.
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
12 |
13 | python3 setup.py build install
14 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn import MSDeformAttn
13 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | import os
13 | import glob
14 |
15 | import torch
16 |
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 |
21 | from setuptools import find_packages
22 | from setuptools import setup
23 |
24 | requirements = ["torch", "torchvision"]
25 |
26 | def get_extensions():
27 | this_dir = os.path.dirname(os.path.abspath(__file__))
28 | extensions_dir = os.path.join(this_dir, "src")
29 |
30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
33 |
34 | sources = main_file + source_cpu
35 | extension = CppExtension
36 | extra_compile_args = {"cxx": []}
37 | define_macros = []
38 |
39 |     # FORCE_CUDA lets the CUDA extension build even when torch.cuda.is_available() is False (e.g. on a build node without a visible GPU).
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
41 | extension = CUDAExtension
42 | sources += source_cuda
43 | define_macros += [("WITH_CUDA", None)]
44 | extra_compile_args["nvcc"] = [
45 | "-DCUDA_HAS_FP16=1",
46 | "-D__CUDA_NO_HALF_OPERATORS__",
47 | "-D__CUDA_NO_HALF_CONVERSIONS__",
48 | "-D__CUDA_NO_HALF2_OPERATORS__",
49 | ]
50 | else:
51 | if CUDA_HOME is None:
52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
53 | else:
54 |             raise NotImplementedError('No CUDA runtime found. Set FORCE_CUDA=1 to build anyway, or check torch.cuda.is_available().')
55 |
56 | sources = [os.path.join(extensions_dir, s) for s in sources]
57 | include_dirs = [extensions_dir]
58 | ext_modules = [
59 | extension(
60 | "MultiScaleDeformableAttention",
61 | sources,
62 | include_dirs=include_dirs,
63 | define_macros=define_macros,
64 | extra_compile_args=extra_compile_args,
65 | )
66 | ]
67 | return ext_modules
68 |
69 | setup(
70 | name="MultiScaleDeformableAttention",
71 | version="1.0",
72 | author="Weijie Su",
73 | url="https://github.com/fundamentalvision/Deformable-DETR",
74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 | packages=find_packages(exclude=("configs", "tests",)),
76 | ext_modules=get_extensions(),
77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 |
21 |
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 | const at::Tensor &value,
25 | const at::Tensor &spatial_shapes,
26 | const at::Tensor &level_start_index,
27 | const at::Tensor &sampling_loc,
28 | const at::Tensor &attn_weight,
29 | const int im2col_step)
30 | {
31 |     AT_ERROR("Not implemented on the CPU");
32 | }
33 |
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 | const at::Tensor &value,
37 | const at::Tensor &spatial_shapes,
38 | const at::Tensor &level_start_index,
39 | const at::Tensor &sampling_loc,
40 | const at::Tensor &attn_weight,
41 | const at::Tensor &grad_output,
42 | const int im2col_step)
43 | {
44 |     AT_ERROR("Not implemented on the CPU");
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step);
27 |
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 | const at::Tensor &value,
31 | const at::Tensor &spatial_shapes,
32 | const at::Tensor &level_start_index,
33 | const at::Tensor &sampling_loc,
34 | const at::Tensor &attn_weight,
35 | const at::Tensor &grad_output,
36 | const int im2col_step);
37 |
38 |
39 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor ms_deform_attn_cuda_forward(
20 | const at::Tensor &value,
21 | const at::Tensor &spatial_shapes,
22 | const at::Tensor &level_start_index,
23 | const at::Tensor &sampling_loc,
24 | const at::Tensor &attn_weight,
25 | const int im2col_step);
26 |
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 | const at::Tensor &value,
29 | const at::Tensor &spatial_shapes,
30 | const at::Tensor &level_start_index,
31 | const at::Tensor &sampling_loc,
32 | const at::Tensor &attn_weight,
33 | const at::Tensor &grad_output,
34 | const int im2col_step);
35 |
36 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 |
18 | #include "cpu/ms_deform_attn_cpu.h"
19 |
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 |
24 |
25 | at::Tensor
26 | ms_deform_attn_forward(
27 | const at::Tensor &value,
28 | const at::Tensor &spatial_shapes,
29 | const at::Tensor &level_start_index,
30 | const at::Tensor &sampling_loc,
31 | const at::Tensor &attn_weight,
32 | const int im2col_step)
33 | {
34 | if (value.type().is_cuda())
35 | {
36 | #ifdef WITH_CUDA
37 | return ms_deform_attn_cuda_forward(
38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 | AT_ERROR("Not compiled with GPU support");
41 | #endif
42 | }
43 | AT_ERROR("Not implemented on the CPU");
44 | }
45 |
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 | const at::Tensor &value,
49 | const at::Tensor &spatial_shapes,
50 | const at::Tensor &level_start_index,
51 | const at::Tensor &sampling_loc,
52 | const at::Tensor &attn_weight,
53 | const at::Tensor &grad_output,
54 | const int im2col_step)
55 | {
56 | if (value.type().is_cuda())
57 | {
58 | #ifdef WITH_CUDA
59 | return ms_deform_attn_cuda_backward(
60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 | AT_ERROR("Not compiled with GPU support");
63 | #endif
64 | }
65 | AT_ERROR("Not implemented on the CPU");
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include "ms_deform_attn.h"
17 |
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 |
--------------------------------------------------------------------------------
/scan/modeling/pixel_decoder/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 |
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 |
23 |
24 | N, M, D = 1, 2, 2
25 | Lq, L, P = 2, 2, 2
26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
28 | S = sum([(H*W).item() for H, W in shapes])
29 |
30 |
31 | torch.manual_seed(3)
32 |
33 |
34 | @torch.no_grad()
35 | def check_forward_equal_with_pytorch_double():
36 | value = torch.rand(N, S, M, D).cuda() * 0.01
37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
40 | im2col_step = 2
41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
43 | fwdok = torch.allclose(output_cuda, output_pytorch)
44 | max_abs_err = (output_cuda - output_pytorch).abs().max()
45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
46 |
47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 |
49 |
50 | @torch.no_grad()
51 | def check_forward_equal_with_pytorch_float():
52 | value = torch.rand(N, S, M, D).cuda() * 0.01
53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
56 | im2col_step = 2
57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
60 | max_abs_err = (output_cuda - output_pytorch).abs().max()
61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
62 |
63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 |
65 |
66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
67 |
68 | value = torch.rand(N, S, M, channels).cuda() * 0.01
69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
72 | im2col_step = 2
73 | func = MSDeformAttnFunction.apply
74 |
75 | value.requires_grad = grad_value
76 | sampling_locations.requires_grad = grad_sampling_loc
77 | attention_weights.requires_grad = grad_attn_weight
78 |
79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
80 |
81 | print(f'* {gradok} check_gradient_numerical(D={channels})')
82 |
83 |
84 | if __name__ == '__main__':
85 | check_forward_equal_with_pytorch_double()
86 | check_forward_equal_with_pytorch_float()
87 |
88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
89 | check_gradient_numerical(channels, True, True, True)
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/scan/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 |
--------------------------------------------------------------------------------
/scan/modeling/transformer_decoder/open_vocab_mask2former_predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
3 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
4 |
5 | from torch import nn
6 | from detectron2.config import configurable
7 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder, MLP
8 |
9 |
10 | class OpenVocabMask2FormerPredictor(MultiScaleMaskedTransformerDecoder):
11 | @configurable
12 | def __init__(
13 | self,
14 | in_channels,
15 | mask_classification=True,
16 | *,
17 | embedding_dim: int,
18 | embed_hidden_dim: int,
19 | embed_layers: int,
20 | hidden_dim: int,
21 | num_queries: int,
22 | nheads: int,
23 | # dropout: float,
24 | dim_feedforward: int,
25 | # enc_layers: int,
26 | dec_layers: int,
27 | pre_norm: bool,
28 | # deep_supervision: bool,
29 | mask_dim: int,
30 | enforce_input_project: bool,
31 | ):
32 | super().__init__(
33 | in_channels,
34 | False,
35 | num_classes=embedding_dim,
36 | hidden_dim=hidden_dim,
37 | num_queries=num_queries,
38 | nheads=nheads,
39 | # dropout=dropout,
40 | dim_feedforward=dim_feedforward,
41 | # enc_layers=enc_layers,
42 | dec_layers=dec_layers,
43 | pre_norm=pre_norm,
44 | # deep_supervision=deep_supervision,
45 | mask_dim=mask_dim,
46 | enforce_input_project=enforce_input_project,
47 | )
48 | mask_classification = True
49 | self.mask_classification = mask_classification
50 | # output FFNs
51 | if self.mask_classification:
52 | self.class_embed = MLP(
53 | hidden_dim, embed_hidden_dim, embedding_dim, embed_layers
54 | )
55 |
56 | def freeze_pretrained(self):
57 | for name, module in self.named_children():
58 | if name not in ["class_embed"]:
59 | for param in module.parameters():
60 | param.requires_grad = False
61 |
62 | @classmethod
63 | def from_config(cls, cfg, in_channels, mask_classification):
64 | ret = {}
65 | ret["in_channels"] = in_channels
66 | ret["mask_classification"] = mask_classification
67 |
68 | ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM
69 | ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM
70 | ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS
71 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
72 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
73 | # Transformer parameters:
74 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
75 | # ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
76 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
77 | # ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
78 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
79 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
80 | # ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
81 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
82 |
83 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
84 |
85 | return ret
86 |
--------------------------------------------------------------------------------
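The only architectural change relative to the parent decoder is the class head: rather than per-class logits, each query is projected into an embedding space of size EMBEDDING_DIM so it can be matched against CLIP text embeddings. A shape-only sketch of that head with illustrative dimensions (the MLP signature follows the import at the top of the file):

    import torch
    from scan.modeling.transformer_decoder.mask2former_transformer_decoder import MLP

    # hidden_dim=256, embed_hidden_dim=256, embedding_dim=512, embed_layers=2 (illustrative)
    class_embed = MLP(256, 256, 512, 2)
    queries = torch.randn(100, 2, 256)     # (num_queries, batch, hidden_dim)
    print(class_embed(queries).shape)      # torch.Size([100, 2, 512]) -> per-query embeddings
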
/scan/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
3 | """
4 | Various positional encodings for the transformer.
5 | """
6 | import math
7 |
8 | import torch
9 | from torch import nn
10 |
11 |
12 | class PositionEmbeddingSine(nn.Module):
13 | """
14 | This is a more standard version of the position embedding, very similar to the one
15 | used by the Attention is all you need paper, generalized to work on images.
16 | """
17 |
18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
19 | super().__init__()
20 | self.num_pos_feats = num_pos_feats
21 | self.temperature = temperature
22 | self.normalize = normalize
23 | if scale is not None and normalize is False:
24 | raise ValueError("normalize should be True if scale is passed")
25 | if scale is None:
26 | scale = 2 * math.pi
27 | self.scale = scale
28 |
29 | def forward(self, x, mask=None):
30 | if mask is None:
31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
32 | not_mask = ~mask
33 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
34 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
35 | if self.normalize:
36 | eps = 1e-6
37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
39 |
40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
42 |
43 | pos_x = x_embed[:, :, :, None] / dim_t
44 | pos_y = y_embed[:, :, :, None] / dim_t
45 | pos_x = torch.stack(
46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
47 | ).flatten(3)
48 | pos_y = torch.stack(
49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
50 | ).flatten(3)
51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
52 | return pos
53 |
54 | def __repr__(self, _repr_indent=4):
55 | head = "Positional encoding " + self.__class__.__name__
56 | body = [
57 | "num_pos_feats: {}".format(self.num_pos_feats),
58 | "temperature: {}".format(self.temperature),
59 | "normalize: {}".format(self.normalize),
60 | "scale: {}".format(self.scale),
61 | ]
62 | # _repr_indent = 4
63 | lines = [head] + [" " * _repr_indent + line for line in body]
64 | return "\n".join(lines)
65 |
--------------------------------------------------------------------------------
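A quick shape check for `PositionEmbeddingSine`: the encoding depends only on the spatial size of the feature map, and the output channel count is 2 * num_pos_feats (the y and x encodings concatenated).

    import torch
    from scan.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine

    pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
    features = torch.randn(2, 256, 32, 32)   # (B, C, H, W); C is not used by the encoding
    pos = pos_enc(features)
    print(pos.shape)                         # torch.Size([2, 256, 32, 32]) -> (B, 2*num_pos_feats, H, W)
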
/scan/test_time_augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 | from itertools import count
5 |
6 | import numpy as np
7 | import torch
8 | from fvcore.transforms import HFlipTransform
9 | from torch import nn
10 | from torch.nn.parallel import DistributedDataParallel
11 |
12 | from detectron2.data.detection_utils import read_image
13 | from detectron2.modeling import DatasetMapperTTA
14 |
15 |
16 | __all__ = [
17 | "SemanticSegmentorWithTTA",
18 | ]
19 |
20 |
21 | class SemanticSegmentorWithTTA(nn.Module):
22 | """
23 | A SemanticSegmentor with test-time augmentation enabled.
24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
25 | """
26 |
27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
28 | """
29 | Args:
30 | cfg (CfgNode):
31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
32 | tta_mapper (callable): takes a dataset dict and returns a list of
33 | augmented versions of the dataset dict. Defaults to
34 | `DatasetMapperTTA(cfg)`.
35 | batch_size (int): batch the augmented images into this batch size for inference.
36 | """
37 | super().__init__()
38 | if isinstance(model, DistributedDataParallel):
39 | model = model.module
40 | self.cfg = cfg.clone()
41 |
42 | self.model = model
43 |
44 | if tta_mapper is None:
45 | tta_mapper = DatasetMapperTTA(cfg)
46 | self.tta_mapper = tta_mapper
47 | self.batch_size = batch_size
48 |
49 | def __call__(self, batched_inputs):
50 | """
51 | Same input/output format as :meth:`SemanticSegmentor.forward`
52 | """
53 |
54 | def _maybe_read_image(dataset_dict):
55 | ret = copy.copy(dataset_dict)
56 | if "image" not in ret:
57 | image = read_image(ret.pop("file_name"), self.model.input_format)
58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW
59 | ret["image"] = image
60 | if "height" not in ret and "width" not in ret:
61 | ret["height"] = image.shape[1]
62 | ret["width"] = image.shape[2]
63 | return ret
64 |
65 | processed_results = []
66 | for x in batched_inputs:
67 | result = self._inference_one_image(_maybe_read_image(x))
68 | processed_results.append(result)
69 | return processed_results
70 |
71 | def _inference_one_image(self, input):
72 | """
73 | Args:
74 | input (dict): one dataset dict with "image" field being a CHW tensor
75 | Returns:
76 | dict: one output dict
77 | """
78 | orig_shape = (input["height"], input["width"])
79 | augmented_inputs, tfms = self._get_augmented_inputs(input)
80 |
81 | final_predictions = None
82 | count_predictions = 0
83 | for input, tfm in zip(augmented_inputs, tfms):
84 | count_predictions += 1
85 | with torch.no_grad():
86 | if final_predictions is None:
87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2])
89 | else:
90 | final_predictions = self.model([input])[0].pop("sem_seg")
91 | else:
92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2])
94 | else:
95 | final_predictions += self.model([input])[0].pop("sem_seg")
96 |
97 | final_predictions = final_predictions / count_predictions
98 | return {"sem_seg": final_predictions}
99 |
100 | def _get_augmented_inputs(self, input):
101 | augmented_inputs = self.tta_mapper(input)
102 | tfms = [x.pop("transforms") for x in augmented_inputs]
103 | return augmented_inputs, tfms
104 |
--------------------------------------------------------------------------------
/scan/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .events import setup_wandb, WandbWriter
3 | from .predictor import VisualizationDemo
--------------------------------------------------------------------------------
/scan/utils/events.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import os
5 | import wandb
6 | from detectron2.utils import comm
7 | from detectron2.utils.events import EventWriter, get_event_storage
8 |
9 |
10 | def setup_wandb(cfg, args):
11 | if comm.is_main_process():
12 | init_args = {
13 | k.lower(): v
14 | for k, v in cfg.WANDB.items()
15 | if isinstance(k, str) and k not in ["config", "name"]
16 | }
17 |         # only include the most relevant parts of cfg to keep the wandb config table small
18 | # TODO: add configurable params to select which part of `cfg` should be saved in config
19 | if "config_exclude_keys" in init_args:
20 | init_args["config"] = cfg
21 | init_args["config"]["cfg_file"] = args.config_file
22 | else:
23 | init_args["config"] = {
24 | "model": cfg.MODEL,
25 | "solver": cfg.SOLVER,
26 | "cfg_file": args.config_file,
27 | }
28 | if ("name" not in init_args) or (init_args["name"] is None):
29 | init_args["name"] = os.path.basename(args.config_file)
30 | # wandb.init(**init_args)
31 |
32 |
33 | class BaseRule(object):
34 | def __call__(self, target):
35 | return target
36 |
37 |
38 | class IsIn(BaseRule):
39 | def __init__(self, keyword: str):
40 | self.keyword = keyword
41 |
42 | def __call__(self, target):
43 | return self.keyword in target
44 |
45 |
46 | class Prefix(BaseRule):
47 | def __init__(self, keyword: str):
48 | self.keyword = keyword
49 |
50 | def __call__(self, target):
51 | return "/".join([self.keyword, target])
52 |
53 |
54 | class WandbWriter(EventWriter):
55 | """
56 |     Write all scalars (and images/histograms) to Weights & Biases.
57 | """
58 |
59 | def __init__(self):
60 | """
61 |         Scalars whose names already contain "/" are kept as-is; scalars
62 |         whose names contain "loss" are grouped under the "train/" prefix
63 |         (see `_group_rules` below).
64 | """
65 | self._last_write = -1
66 | self._group_rules = [
67 | (IsIn("/"), BaseRule()),
68 | (IsIn("loss"), Prefix("train")),
69 | ]
70 |
71 | def write(self):
72 |
73 | storage = get_event_storage()
74 |
75 | def _group_name(scalar_name):
76 | for (rule, op) in self._group_rules:
77 | if rule(scalar_name):
78 | return op(scalar_name)
79 | return scalar_name
80 |
81 | stats = {
82 | _group_name(name): scalars[0]
83 | for name, scalars in storage.latest().items()
84 | if scalars[1] > self._last_write
85 | }
86 | if len(stats) > 0:
87 | self._last_write = max([v[1] for k, v in storage.latest().items()])
88 |
89 | # storage.put_{image,histogram} is only meant to be used by
90 | # tensorboard writer. So we access its internal fields directly from here.
91 | if len(storage._vis_data) >= 1:
92 | stats["image"] = [
93 | wandb.Image(img, caption=img_name)
94 | for img_name, img, step_num in storage._vis_data
95 | ]
96 |             # Storage keeps all image data and relies on this writer to clear it.
97 | # As a result it assumes only one writer will use its image data.
98 | # An alternative design is to let storage store limited recent
99 | # data (e.g. only the most recent image) that all writers can access.
100 | # In that case a writer may not see all image data if its period is long.
101 | storage.clear_images()
102 |
103 | if len(storage._histograms) >= 1:
104 |
105 | def create_bar(tag, bucket_limits, bucket_counts, **kwargs):
106 | data = [
107 | [label, val] for (label, val) in zip(bucket_limits, bucket_counts)
108 | ]
109 | table = wandb.Table(data=data, columns=["label", "value"])
110 | return wandb.plot.bar(table, "label", "value", title=tag)
111 |
112 | stats["hist"] = [create_bar(**params) for params in storage._histograms]
113 |
114 | storage.clear_histograms()
115 |
116 | if len(stats) == 0:
117 | return
118 | # wandb.log(stats, step=storage.iter)
119 |
120 | def close(self):
121 | wandb.finish()
122 |
--------------------------------------------------------------------------------
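A standalone illustration of how the `_group_rules` above rename scalars before they are logged (assuming the repository root is importable; the module also imports wandb and detectron2, so both must be installed). The small helper below mirrors `_group_name` inside `write`.

    from scan.utils.events import BaseRule, IsIn, Prefix

    rules = [(IsIn("/"), BaseRule()), (IsIn("loss"), Prefix("train"))]

    def group_name(name):
        for rule, op in rules:
            if rule(name):
                return op(name)
        return name

    print(group_name("val/mIoU"))    # 'val/mIoU'        already grouped, kept as-is
    print(group_name("loss_mask"))   # 'train/loss_mask' loss scalars are prefixed with train/
    print(group_name("lr"))          # 'lr'              everything else is left untouched
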
/scan/utils/misc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
3 | """
4 | Misc functions, including distributed helpers.
5 |
6 | Mostly copy-paste from torchvision references.
7 | """
8 | from typing import List, Optional
9 |
10 | import torch
11 | import torch.distributed as dist
12 | import torchvision
13 | from torch import Tensor
14 |
15 |
16 |
17 | def _max_by_axis(the_list):
18 | # type: (List[List[int]]) -> List[int]
19 | maxes = the_list[0]
20 | for sublist in the_list[1:]:
21 | for index, item in enumerate(sublist):
22 | maxes[index] = max(maxes[index], item)
23 | return maxes
24 |
25 |
26 | class NestedTensor(object):
27 | def __init__(self, tensors, mask: Optional[Tensor]):
28 | self.tensors = tensors
29 | self.mask = mask
30 |
31 | def to(self, device):
32 | # type: (Device) -> NestedTensor # noqa
33 | cast_tensor = self.tensors.to(device)
34 | mask = self.mask
35 | if mask is not None:
36 | assert mask is not None
37 | cast_mask = mask.to(device)
38 | else:
39 | cast_mask = None
40 | return NestedTensor(cast_tensor, cast_mask)
41 |
42 | def decompose(self):
43 | return self.tensors, self.mask
44 |
45 | def __repr__(self):
46 | return str(self.tensors)
47 |
48 |
49 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
50 | # TODO make this more general
51 | if tensor_list[0].ndim == 3:
52 | if torchvision._is_tracing():
53 | # nested_tensor_from_tensor_list() does not export well to ONNX
54 | # call _onnx_nested_tensor_from_tensor_list() instead
55 | return _onnx_nested_tensor_from_tensor_list(tensor_list)
56 |
57 | # TODO make it support different-sized images
58 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
59 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
60 | batch_shape = [len(tensor_list)] + max_size
61 | b, c, h, w = batch_shape
62 | dtype = tensor_list[0].dtype
63 | device = tensor_list[0].device
64 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
65 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
66 | for img, pad_img, m in zip(tensor_list, tensor, mask):
67 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
68 | m[: img.shape[1], : img.shape[2]] = False
69 | else:
70 | raise ValueError("not supported")
71 | return NestedTensor(tensor, mask)
72 |
73 |
74 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
75 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
76 | @torch.jit.unused
77 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
78 | max_size = []
79 | for i in range(tensor_list[0].dim()):
80 | max_size_i = torch.max(
81 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
82 | ).to(torch.int64)
83 | max_size.append(max_size_i)
84 | max_size = tuple(max_size)
85 |
86 | # work around for
87 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
88 | # m[: img.shape[1], :img.shape[2]] = False
89 | # which is not yet supported in onnx
90 | padded_imgs = []
91 | padded_masks = []
92 | for img in tensor_list:
93 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
94 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
95 | padded_imgs.append(padded_img)
96 |
97 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
98 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
99 | padded_masks.append(padded_mask.to(torch.bool))
100 |
101 | tensor = torch.stack(padded_imgs)
102 | mask = torch.stack(padded_masks)
103 |
104 | return NestedTensor(tensor, mask=mask)
105 |
106 |
107 | def is_dist_avail_and_initialized():
108 | if not dist.is_available():
109 | return False
110 | if not dist.is_initialized():
111 | return False
112 | return True
113 |
114 | def get_gt_binary_masks(gt_semseg):
115 | mask_ids = torch.unique(gt_semseg)
116 | gt_masks = []
117 | for id in mask_ids:
118 | if id != 255:
119 | gt_masks.append(gt_semseg == id)
120 | gt_masks = torch.stack(gt_masks).float()
121 | return gt_masks
--------------------------------------------------------------------------------
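A short, runnable illustration of `nested_tensor_from_tensor_list`: two images of different sizes are padded to a common shape, and the boolean mask marks padded pixels with True.

    import torch
    from scan.utils.misc import nested_tensor_from_tensor_list

    imgs = [torch.rand(3, 300, 400), torch.rand(3, 280, 460)]
    nested = nested_tensor_from_tensor_list(imgs)
    tensors, mask = nested.decompose()
    print(tensors.shape)                      # torch.Size([2, 3, 300, 460])
    print(mask.shape)                         # torch.Size([2, 300, 460])
    print(mask[0, :, 400:].all().item())      # True: columns 400.. of the first image are padding
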
/scan/utils/post_process_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import torch
5 | from torch.nn import functional as F
6 | import numpy as np
7 |
8 | try:
9 | import pydensecrf.densecrf as dcrf
10 | from pydensecrf.utils import (
11 | unary_from_softmax,
12 | unary_from_labels,
13 | create_pairwise_bilateral,
14 | create_pairwise_gaussian,
15 | )
16 | except ImportError:
17 | dcrf = None
18 |
19 |
20 | def dense_crf_post_process(
21 | logits,
22 | image,
23 | n_labels=None,
24 | max_iters=5,
25 | pos_xy_std=(3, 3),
26 | pos_w=3,
27 | bi_xy_std=(80, 80),
28 | bi_rgb_std=(13, 13, 13),
29 | bi_w=10,
30 | ):
31 | """
32 | logits : [C,H,W]
33 |     image : [H,W,3] uint8 RGB array (layout expected by pydensecrf)
34 | """
35 | if dcrf is None:
36 |         raise ImportError(
37 | "pydensecrf is required to perform dense crf inference."
38 | )
39 | if isinstance(logits, torch.Tensor):
40 | logits = F.softmax(logits, dim=0).detach().cpu().numpy()
41 | U = unary_from_softmax(logits)
42 | n_labels = logits.shape[0]
43 | elif logits.ndim == 3:
44 | U = unary_from_softmax(logits)
45 | n_labels = logits.shape[0]
46 | else:
47 | assert n_labels is not None
48 | U = unary_from_labels(logits, n_labels, zero_unsure=False)
49 |
50 | d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels)
51 |
52 | d.setUnaryEnergy(U)
53 |
54 | # This adds the color-independent term, features are the locations only.
55 | d.addPairwiseGaussian(
56 | sxy=pos_xy_std,
57 | compat=pos_w,
58 | kernel=dcrf.DIAG_KERNEL,
59 | normalization=dcrf.NORMALIZE_SYMMETRIC,
60 | )
61 |
62 | # This adds the color-dependent term, i.e. features are (x,y,r,g,b).
63 | d.addPairwiseBilateral(
64 | sxy=bi_xy_std,
65 | srgb=bi_rgb_std,
66 | rgbim=image,
67 | compat=bi_w,
68 | kernel=dcrf.DIAG_KERNEL,
69 | normalization=dcrf.NORMALIZE_SYMMETRIC,
70 | )
71 |     # Run `max_iters` inference steps (5 by default).
72 | logits = d.inference(max_iters)
73 | logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1]))
74 | return torch.from_numpy(logits)
75 |
--------------------------------------------------------------------------------
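A minimal sketch of `dense_crf_post_process` on random data, assuming pydensecrf is installed and the repository root is importable. Real usage would pass the model's per-class logits together with the corresponding RGB image.

    import numpy as np
    import torch
    from scan.utils.post_process_utils import dense_crf_post_process

    H, W, C = 64, 64, 4
    logits = torch.randn(C, H, W)                                   # per-class logits
    image = np.ascontiguousarray(
        np.random.randint(0, 256, (H, W, 3), dtype=np.uint8))       # uint8 RGB, (H, W, 3)

    refined = dense_crf_post_process(logits, image, max_iters=5)
    print(refined.shape)              # torch.Size([4, 64, 64]) CRF-refined class scores
    print(refined.argmax(0).shape)    # torch.Size([64, 64])    refined label map
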
/tools/convert-pretrained-clip-model-to-d2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import pickle as pkl
5 | import sys
6 |
7 | import torch
8 |
9 | """
10 | Usage:
11 | # run the conversion on a fine-tuned CLIP checkpoint
12 | # (the checkpoint and output file names below are illustrative)
13 | ./convert-pretrained-clip-model-to-d2.py finetuned_clip.pth finetuned_clip.pkl
14 | # Then, use finetuned_clip.pkl with the following changes in config:
15 | MODEL:
16 |   WEIGHTS: "/path/to/finetuned_clip.pkl"
17 | INPUT:
18 |   FORMAT: "RGB"
19 | 
20 | """
21 |
22 |
23 | def transform(path):
24 | model = torch.load(path, map_location="cpu")
25 | print(f"loading {path}......")
26 | state_dict = model["model"]
27 | state_dict = {
28 | k.replace("visual_model.", ""): v
29 | for k, v in state_dict.items()
30 | if k.startswith("visual_model")
31 | }
32 | source_keys = [k for k in state_dict.keys() if "relative_coords" in k]
33 | for k in source_keys:
34 | state_dict[
35 | k.replace("relative_coords", "relative_position_index")
36 | ] = state_dict[k]
37 | del state_dict[k]
38 |
39 | source_keys = [k for k in state_dict.keys() if "atten_mask_matrix" in k]
40 | for k in source_keys:
41 | state_dict[k.replace("atten_mask_matrix", "attn_mask")] = state_dict[k]
42 | del state_dict[k]
43 |
44 | source_keys = [k for k in state_dict.keys() if "rel_pos_embed_table" in k]
45 | for k in source_keys:
46 | state_dict[
47 | k.replace("rel_pos_embed_table", "relative_position_bias_table")
48 | ] = state_dict[k]
49 | del state_dict[k]
50 |
51 | source_keys = [k for k in state_dict.keys() if "channel_reduction" in k]
52 | for k in source_keys:
53 | state_dict[k.replace("channel_reduction", "reduction")] = state_dict[k]
54 | del state_dict[k]
55 | return {
56 | k if k.startswith("backbone.") else "backbone." + k: v
57 | for k, v in state_dict.items()
58 | }
59 |
60 |
61 | if __name__ == "__main__":
62 | input = sys.argv[1]
63 | res = {
64 | "model": transform(input),
65 | "__author__": "third_party",
66 | "matching_heuristics": True,
67 | }
68 | with open(sys.argv[2], "wb") as f:
69 | pkl.dump(res, f)
70 |
--------------------------------------------------------------------------------
/tools/convert-pretrained-swin-model-to-d2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import pickle as pkl
5 | import sys
6 |
7 | import torch
8 |
9 | """
10 | Usage:
11 | # download pretrained swin model:
12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
13 | # run the conversion
14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
16 | MODEL:
17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
18 | INPUT:
19 | FORMAT: "RGB"
20 | """
21 |
22 | if __name__ == "__main__":
23 | input = sys.argv[1]
24 |
25 | obj = torch.load(input, map_location="cpu")["model"]
26 |
27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
28 |
29 | with open(sys.argv[2], "wb") as f:
30 | pkl.dump(res, f)
31 |
--------------------------------------------------------------------------------
/tools/convert-torchvision-to-d2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import pickle as pkl
5 | import sys
6 |
7 | import torch
8 |
9 | """
10 | Usage:
11 | # download one of the ResNet{18,34,50,101,152} models from torchvision:
12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
13 | # run the conversion
14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl
15 | # Then, use r50.pkl with the following changes in config:
16 | MODEL:
17 | WEIGHTS: "/path/to/r50.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | RESNETS:
21 | DEPTH: 50
22 | STRIDE_IN_1X1: False
23 | INPUT:
24 | FORMAT: "RGB"
25 | These models typically produce slightly worse results than the
26 | pre-trained ResNets we use in official configs, which are the
27 | original ResNet models released by MSRA.
28 | """
29 |
30 | if __name__ == "__main__":
31 | input = sys.argv[1]
32 |
33 | obj = torch.load(input, map_location="cpu")
34 |
35 | newmodel = {}
36 | for k in list(obj.keys()):
37 | old_k = k
38 | if "layer" not in k:
39 | k = "stem." + k
40 | for t in [1, 2, 3, 4]:
41 | k = k.replace("layer{}".format(t), "res{}".format(t + 1))
42 | for t in [1, 2, 3]:
43 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
44 | k = k.replace("downsample.0", "shortcut")
45 | k = k.replace("downsample.1", "shortcut.norm")
46 | print(old_k, "->", k)
47 | newmodel[k] = obj.pop(old_k).detach().numpy()
48 |
49 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
50 |
51 | with open(sys.argv[2], "wb") as f:
52 | pkl.dump(res, f)
53 | if obj:
54 | print("Unconverted keys:", obj.keys())
55 |
--------------------------------------------------------------------------------
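The renaming loop above is pure string manipulation; a few illustrative torchvision keys and the detectron2 names they map to, mirroring the exact replace order used in the script:

    def rename(k):
        if "layer" not in k:
            k = "stem." + k
        for t in [1, 2, 3, 4]:
            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
        for t in [1, 2, 3]:
            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
        k = k.replace("downsample.0", "shortcut")
        k = k.replace("downsample.1", "shortcut.norm")
        return k

    print(rename("conv1.weight"))                  # stem.conv1.weight
    print(rename("layer1.0.bn2.bias"))             # res2.0.conv2.norm.bias
    print(rename("layer4.0.downsample.1.weight"))  # res5.0.shortcut.norm.weight
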
/tools/replace_clip.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3 |
4 | import torch
5 | from collections import OrderedDict
6 |
7 |
8 | # Path to the fine-tuned CLIP checkpoint
9 | clip_ckpt = torch.load('CS_CLIP.pt')
10 |
11 | new_model = OrderedDict()
12 | state_dict = clip_ckpt['state_dict']
13 |
14 | for k, v in state_dict.items():
15 | if 'clip_model' in k:
16 | new_key = k.replace('module.clip_model.','')
17 | new_model[new_key] = v
18 |
19 | # Path to the trained MaskFormer segmentation checkpoint
20 | ovseg_model = torch.load('Seg_model.pth', 'cpu')
21 |
22 | for k, v in new_model.items():
23 | new_k = 'clip_adapter.clip_model.' + k
24 | if new_k in ovseg_model['model'].keys():
25 | ovseg_model['model'][new_k] = v
26 | else:
27 | print(f'{new_k} does not exist in ckpt')
28 | try:
29 | ovseg_model['model']['clip_adapter.clip_model.visual.mask_embedding'] = new_model['visual.mask_embedding']
30 | print('clip_ckpt has mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD True during evaluation')
31 | except KeyError:
32 | print('clip_ckpt does not have mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False during evaluation')
33 |
34 | torch.save(ovseg_model, 'SCAN.pth')
35 |
--------------------------------------------------------------------------------