├── .gitignore ├── INSTALL.md ├── LICENSE ├── README.md ├── configs ├── scan_vitB.yaml ├── scan_vitL.yaml └── scan_vitL_demo.yaml ├── datasets ├── DATASETS.md ├── prepare_ade20k_full_sem_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_coco_stuff_sem_seg.py ├── prepare_pascal_context.py └── prepare_voc_sem_seg.py ├── demo.py ├── imgs ├── cs.png ├── pipeline.png ├── results.png └── visual.png ├── open_clip_training ├── .github │ └── workflows │ │ ├── ci.yml │ │ ├── clear-cache.yml │ │ └── python-publish.yml ├── .gitignore ├── CITATION.cff ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs │ ├── Interacting_with_open_clip.ipynb │ ├── Interacting_with_open_coca.ipynb │ ├── LOW_ACC.md │ ├── PRETRAINED.md │ ├── clip_conceptual_captions.md │ ├── clipa.md │ ├── datacomp_models.md │ ├── openclip_results.csv │ └── script_examples │ │ ├── clipa │ │ ├── vit_b16 │ │ │ ├── i50_t16_finetune.sh │ │ │ └── i50_t16_pretrain.sh │ │ └── vit_l16 │ │ │ ├── i17_t16_finetune.sh │ │ │ ├── i17_t16_pretrain.sh │ │ │ ├── i37_t8_finetune.sh │ │ │ └── i37_t8_pretrain.sh │ │ ├── clipav2 │ │ └── vit_h14 │ │ │ ├── i257_t32_finetunex4.sh │ │ │ ├── i50_t8_pretrain.sh │ │ │ └── i577_t32_finetunex1.sh │ │ └── stability_example.sh ├── pytest.ini ├── scripts │ ├── clipav1_vit_l16_i37_t8.sh │ ├── clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh │ ├── h14_224_32_finetune.sh │ └── h14_84_8_pretrain.sh ├── setup.py ├── src │ ├── clip_adapter │ │ └── clip_adapter.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── big_vision.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── coca_model.py │ │ ├── constants.py │ │ ├── factory.py │ │ ├── generation_utils.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-g-14-plus.json │ │ │ ├── EVA01-g-14.json │ │ │ ├── EVA02-B-16.json │ │ │ ├── EVA02-E-14-plus.json │ │ │ ├── EVA02-E-14.json │ │ │ ├── EVA02-L-14-336.json │ │ │ ├── EVA02-L-14.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── RN50x64.json │ │ │ ├── ViT-B-16-SigLIP-256.json │ │ │ ├── ViT-B-16-SigLIP-384.json │ │ │ ├── ViT-B-16-SigLIP-512.json │ │ │ ├── ViT-B-16-SigLIP-i18n-256.json │ │ │ ├── ViT-B-16-SigLIP.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16-quickgelu.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-256.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14-378-quickgelu.json │ │ │ ├── ViT-H-14-CLIPA-336.json │ │ │ ├── ViT-H-14-CLIPA.json │ │ │ ├── ViT-H-14-quickgelu.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14-CLIPA-336.json │ │ │ ├── ViT-L-14-CLIPA.json │ │ │ ├── ViT-L-14-quickgelu.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-L-16-SigLIP-256.json │ │ │ ├── ViT-L-16-SigLIP-384.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-M-16-alt.json │ │ │ ├── ViT-M-16.json │ │ │ ├── ViT-M-32-alt.json │ │ │ ├── ViT-M-32.json │ │ │ ├── ViT-S-16-alt.json │ │ │ ├── ViT-S-16.json │ │ │ ├── ViT-S-32-alt.json │ │ │ ├── ViT-S-32.json │ │ │ ├── ViT-SO400M-14-SigLIP-384.json │ │ │ ├── ViT-SO400M-14-SigLIP.json │ │ │ ├── ViT-bigG-14-CLIPA-336.json │ │ │ ├── ViT-bigG-14-CLIPA.json │ │ │ ├── ViT-bigG-14.json │ │ │ ├── ViT-e-14.json │ │ │ ├── ViT-g-14.json │ │ │ ├── coca_ViT-B-32.json │ │ │ ├── coca_ViT-L-14.json │ │ │ ├── coca_base.json │ │ │ ├── 
coca_roberta-ViT-B-32.json │ │ │ ├── convnext_base.json │ │ │ ├── convnext_base_w.json │ │ │ ├── convnext_base_w_320.json │ │ │ ├── convnext_large.json │ │ │ ├── convnext_large_d.json │ │ │ ├── convnext_large_d_320.json │ │ │ ├── convnext_small.json │ │ │ ├── convnext_tiny.json │ │ │ ├── convnext_xlarge.json │ │ │ ├── convnext_xxlarge.json │ │ │ ├── convnext_xxlarge_320.json │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ ├── nllb-clip-base-siglip.json │ │ │ ├── nllb-clip-base.json │ │ │ ├── nllb-clip-large-siglip.json │ │ │ ├── nllb-clip-large.json │ │ │ ├── roberta-ViT-B-32.json │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pos_embed.py │ │ ├── pretrained.py │ │ ├── push_to_hf_hub.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ ├── utils.py │ │ ├── version.py │ │ ├── zero_shot_classifier.py │ │ └── zero_shot_metadata.py │ ├── scripts │ │ ├── 1cap_finetune_VitL.sh │ │ └── finetune_VitL_with_mask.sh │ └── training │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── ade150_zeroshot_data.py │ │ ├── data.py │ │ ├── distributed.py │ │ ├── file_utils.py │ │ ├── main.py │ │ ├── params.py │ │ ├── precision.py │ │ ├── profiler.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── tests │ ├── test_download_pretrained.py │ ├── test_hf_model.py │ ├── test_inference.py │ ├── test_inference_simple.py │ ├── test_num_shards.py │ ├── test_training_simple.py │ ├── test_wds.py │ └── util_test.py └── tutorials │ └── int8_tutorial.ipynb ├── requirements.txt ├── scan ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── augmentations.py │ ├── build.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── csv_data.py │ │ ├── register_ade20k_full.py │ │ ├── register_cc3m.py │ │ ├── register_coco_stuff.py │ │ ├── register_pascal_context.py │ │ └── register_voc_seg.py ├── evaluation │ ├── __init__.py │ └── generalized_sem_seg_evaluation.py ├── frequency.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── clip_resnet.py │ │ └── swin.py │ ├── clip_adapter │ │ ├── __init__.py │ │ ├── adapter.py │ │ ├── text_template.py │ │ └── utils.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── mask_former_head.py │ │ └── per_pixel_baseline.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── maskformer_transformer_decoder.py │ │ ├── open_vocab_mask2former_predictor.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── ovseg_model.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── events.py │ ├── misc.py │ ├── post_process_utils.py │ └── predictor.py ├── 
tools ├── convert-pretrained-clip-model-to-d2.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py └── replace_clip.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | outputs 4 | instant_test_output 5 | inference_test_output 6 | 7 | 8 | 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | **/.ipynb_checkpoints/ 34 | 35 | # Editor temporaries 36 | *.swn 37 | *.swo 38 | *.swp 39 | *~ 40 | 41 | # editor settings 42 | .idea 43 | .vscode 44 | _darcs 45 | 46 | # project dirs 47 | /detectron2/model_zoo/configs 48 | /datasets/* 49 | !/datasets/*.* 50 | /projects/*/datasets 51 | /models 52 | /snippet 53 | 54 | # vs code 55 | .history 56 | 57 | amlt 58 | thirdparty 59 | wandb 60 | weights 61 | 62 | 63 | *.zip 64 | *.tar 65 | /output 66 | *.pth 67 | *.pt 68 | 69 | *.png 70 | !imgs/*.png 71 | *.txt 72 | !requirements.txt 73 | 74 | results/ 75 | 76 | openclip_data/ 77 | logs/ 78 | 79 | data 80 | !scan/data 81 | 82 | *log* 83 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux with Python ≥ 3.8 5 | - PyTorch ≥ 1.10 and torchvision that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | 10 | ### Usage 11 | 12 | Install required packages. 13 | 14 | ```bash 15 | conda create -n scan python=3.8 16 | conda activate scan 17 | conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge -y 18 | pip install -r requirements.txt 19 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html 20 | ``` 21 | 22 | 23 | 24 | Install other packages. 25 | 26 | ```bash 27 | cd scan/modeling/pixel_decoder/ops 28 | sh make.sh 29 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open-Vocabulary Segmentation with Semantic-Assisted Calibration [CVPR 2024] 2 | Yong Liu*, Sule Bai*, Guanbin Li, Yitong Wang, Yansong Tang 3 | (*equal contribution) 4 | 5 | The repository contains the official implementation of "Open-Vocabulary Segmentation with Semantic-Assisted Calibration" 6 | 7 | [Paper](https://arxiv.org/abs/2312.04089) 8 | 9 | 10 | 11 | 12 | 13 | --- 14 | ## 📖 Pipeline & Results 15 |

16 | 17 | 18 | 19 |

20 | 21 | 22 | 23 | 24 | 25 | 26 | ### Tab of Content 27 | - [Installation](#1) 28 | - [Data Preparation](#2) 29 | - [Usage](#3) 30 | - [Training](#5) 31 | - [Evaluation](#4) 32 | - [Cite](#6) 33 | 34 | 35 | 36 | 37 | If you find any bugs caused by carelessness on our part in organizing the code, feel free to contact us and point them out! 38 | 39 | ### Installation 40 | Please see the [installation guide](./INSTALL.md). 41 | 42 | 43 | 44 | 45 | ### Data Preparation 46 | Please follow the instructions of [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the training and test data. The data should be organized as follows: 47 | ``` 48 | $DETECTRON2_DATASETS/ 49 | coco/ # COCOStuff-171 50 | ADEChallengeData2016/ # ADE20K-150 51 | ADE20K_2021_17_01/ # ADE20K-847 52 | VOCdevkit/ 53 | VOC2012/ # PASCALVOC-20 54 | VOC2010/ # PASCALContext-59, PASCALContext-459 55 | ``` 56 | 57 | 58 | 59 | 60 | ### Usage 61 | 62 | - #### Pretrained Weights 63 | We provide the pretrained SCAN-VitL weights and the finetuned Contextual-shifted CLIP weights. Please download them from [here](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link). 64 | 65 | 66 | 67 | #### Evaluation 68 | 69 | 70 | ``` 71 | python train_net.py --eval-only --config-file <CONFIG_FILE> --num-gpus <NUM_GPUS> OUTPUT_DIR <OUTPUT_DIR> MODEL.WEIGHTS <TRAINED_MODEL_PATH> 72 | ``` 73 | - Here is an example: 74 | ``` 75 | python train_net.py --num-gpus 8 --eval-only --config-file configs/scan_vitL.yaml MODEL.WEIGHTS ./SCAN.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\) MODEL.CLIP_ADAPTER.REPLACE_RATIO 0.05 MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.75 MODEL.CLIP_ADAPTER.MASK_THR 0.55 76 | ``` 77 | 78 | 79 | #### Training 80 | 1. Train the segmentation model: 81 | ``` 82 | python train_net.py --config-file <CONFIG_FILE> --num-gpus <NUM_GPUS> 83 | ``` 84 | 85 | - Here is an example: 86 | 87 | ``` 88 | python train_net.py --num-gpus 8 --config-file configs/scan_vitL.yaml 89 | ``` 90 | 91 | 2. Fuse the segmentation model with the finetuned CLIP. 92 | 93 | We provide the [finetuned CLIP weights](https://drive.google.com/drive/folders/1obgHGQngtQms0u5YUJRnwd4y1IzME-c8?usp=drive_link). You can directly fuse these pretrained weights with the segmentation model to get the final model. The fuse command is: 94 | ``` 95 | cd tools 96 | python replace_clip.py 97 | ``` 98 | You need to set "clip_ckpt" and "ovseg_model" in the script to your CLIP checkpoint path and segmentation model path. 99 | 100 | 101 | (Optional) If you want to finetune the CLIP model from scratch, please follow [ov-seg](https://github.com/facebookresearch/ov-seg) to prepare the corresponding data. The finetuning command is: 102 | 103 | ``` 104 | cd open_clip_training 105 | cd src 106 | bash scripts/finetune_VitL_with_mask.sh 107 | ``` 108 | 109 | 110 | 111 | 112 | ### Cite 113 | 114 | If you find our work helpful, we'd appreciate it if you could cite our paper in your work. 
115 | ``` 116 | @article{liu2023open, 117 | title={Open-Vocabulary Segmentation with Semantic-Assisted Calibration}, 118 | author={Liu, Yong and Bai, Sule and Li, Guanbin and Wang, Yitong and Tang, Yansong}, 119 | journal={arXiv preprint arXiv:2312.04089}, 120 | year={2023} 121 | } 122 | ``` 123 | -------------------------------------------------------------------------------- /configs/scan_vitB.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCAN" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [6, 9, 12] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | CLIP_VISION_DIM: 768 21 | SCAN_DIM: 512 22 | PATCH_SIZE: 14 23 | SEM_SEG_HEAD: 24 | NAME: "OpenVocaMask2FormerHead" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | IGNORE_VALUE: 255 27 | NUM_CLASSES: 171 # number of categories in training set 28 | EMBEDDING_DIM: 512 29 | EMBED_LAYERS: 2 30 | COMMON_STRIDE: 4 # not used, hard-coded 31 | LOSS_WEIGHT: 1.0 32 | CONVS_DIM: 256 33 | MASK_DIM: 256 34 | NORM: "GN" 35 | # pixel decoder 36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 37 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 39 | COMMON_STRIDE: 4 40 | TRANSFORMER_ENC_LAYERS: 6 41 | MASK_FORMER: 42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 44 | DEEP_SUPERVISION: True 45 | NO_OBJECT_WEIGHT: 0.1 46 | CLASS_WEIGHT: 2.0 47 | MASK_WEIGHT: 5.0 48 | DICE_WEIGHT: 5.0 49 | HIDDEN_DIM: 256 50 | NUM_OBJECT_QUERIES: 100 51 | NHEADS: 8 52 | DROPOUT: 0.0 53 | DIM_FEEDFORWARD: 2048 54 | ENC_LAYERS: 0 55 | PRE_NORM: False 56 | ENFORCE_INPUT_PROJ: False 57 | SIZE_DIVISIBILITY: 32 58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 59 | TRAIN_NUM_POINTS: 12544 60 | OVERSAMPLE_RATIO: 3.0 61 | IMPORTANCE_SAMPLE_RATIO: 0.75 62 | CLIP_ADAPTER: 63 | TEXT_TEMPLATES: "vild" 64 | CLIP_MODEL_NAME: "ViT-B-16" 65 | MASK_FILL: "mean" 66 | MASK_EXPAND_RATIO: 1.0 67 | MASK_MATTING: False # use soft background, default not used 68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings: 71 | # a847: [0.25, 0.75] a150: [0.4, 0.7] pc459: [0.25, 0.7] pc59: [0.25, 0.35] voc20: [0.2, 0.45] 72 | MASK_THR: 0.4 73 | CLIP_ENSEMBLE_WEIGHT: 0.7 74 | # For the REPLACE_RATIO, we have the following settings: 75 | # a847: 0.05 a150: 0.05 pc459: 0.05 pc59: 0.05 voc20: 0.1 76 | REPLACE_RATIO: 0.15 77 | REPLACE_LAYER: [1, 3, 5] 78 | DATASETS: 79 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 80 | TEST: ("ade20k_sem_seg_val",) 81 | SOLVER: 82 | IMS_PER_BATCH: 32 83 | BASE_LR: 0.00006 84 | MAX_ITER: 120000 85 | WARMUP_FACTOR: 1e-6 86 | WARMUP_ITERS: 1500 87 | LR_SCHEDULER_NAME: "WarmupPolyLR" 88 | WEIGHT_DECAY: 0.01 89 | WEIGHT_DECAY_NORM: 0.0 90 | WEIGHT_DECAY_EMBED: 0.0 91 | BACKBONE_MULTIPLIER: 1.0 92 | TEST_IMS_PER_BATCH: 1 93 | CLIP_GRADIENTS: 94 | ENABLED: True 95 | CLIP_TYPE: "full_model" 96 | CLIP_VALUE: 0.01 97 | 
NORM_TYPE: 2.0 98 | INPUT: 99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 100 | MIN_SIZE_TRAIN_SAMPLING: "choice" 101 | MIN_SIZE_TEST: 640 102 | MAX_SIZE_TRAIN: 2560 103 | MAX_SIZE_TEST: 2560 104 | CROP: 105 | ENABLED: True 106 | TYPE: "absolute" 107 | SIZE: (640, 640) 108 | SINGLE_CATEGORY_MAX_AREA: 1.0 109 | COLOR_AUG_SSD: True 110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 111 | FORMAT: "RGB" 112 | TEST: 113 | EVAL_PERIOD: 5000 114 | # SEMANTIC_ON: True 115 | # INSTANCE_ON: False 116 | # PANOPTIC_ON: False 117 | AUG: 118 | ENABLED: False 119 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 120 | MAX_SIZE: 3584 121 | FLIP: True 122 | DATALOADER: 123 | FILTER_EMPTY_ANNOTATIONS: True 124 | NUM_WORKERS: 16 125 | VERSION: 2 126 | METRIC: 'Vanilla' # Vanilla or SG-IoU 127 | OUTPUT_DIR: output/SCAN-VitB -------------------------------------------------------------------------------- /configs/scan_vitL.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCAN" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [12, 18, 24] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | CLIP_VISION_DIM: 1024 21 | SCAN_DIM: 768 22 | PATCH_SIZE: 16 23 | SEM_SEG_HEAD: 24 | NAME: "OpenVocaMask2FormerHead" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | IGNORE_VALUE: 255 27 | NUM_CLASSES: 171 # number of categories in training set 28 | EMBEDDING_DIM: 768 29 | EMBED_LAYERS: 2 30 | COMMON_STRIDE: 4 # not used, hard-coded 31 | LOSS_WEIGHT: 1.0 32 | CONVS_DIM: 256 33 | MASK_DIM: 256 34 | NORM: "GN" 35 | # pixel decoder 36 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 37 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 38 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 39 | COMMON_STRIDE: 4 40 | TRANSFORMER_ENC_LAYERS: 6 41 | MASK_FORMER: 42 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 43 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 44 | DEEP_SUPERVISION: True 45 | NO_OBJECT_WEIGHT: 0.1 46 | CLASS_WEIGHT: 2.0 47 | MASK_WEIGHT: 5.0 48 | DICE_WEIGHT: 5.0 49 | HIDDEN_DIM: 256 50 | NUM_OBJECT_QUERIES: 100 51 | NHEADS: 8 52 | DROPOUT: 0.0 53 | DIM_FEEDFORWARD: 2048 54 | ENC_LAYERS: 0 55 | PRE_NORM: False 56 | ENFORCE_INPUT_PROJ: False 57 | SIZE_DIVISIBILITY: 32 58 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 59 | TRAIN_NUM_POINTS: 12544 60 | OVERSAMPLE_RATIO: 3.0 61 | IMPORTANCE_SAMPLE_RATIO: 0.75 62 | CLIP_ADAPTER: 63 | TEXT_TEMPLATES: "vild" 64 | CLIP_MODEL_NAME: "ViT-L-14" 65 | MASK_FILL: "mean" 66 | MASK_EXPAND_RATIO: 1.0 67 | MASK_MATTING: False # use soft background, default not used 68 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 69 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 70 | # For the [MASK_THR, CLIP_ENSEMBLE_WEIGHT], we have the following settings: 71 | # a847: [0.3, 0.75] a150: [0.55, 0.75] pc459: [0.25, 0.65] pc59: [0.5, 0.5] voc20: [0.2, 0.65] 72 | MASK_THR: 0.4 73 | CLIP_ENSEMBLE_WEIGHT: 0.7 74 | # For the REPLACE_RATIO, we have the following settings: 75 | # a847: 0.15 a150: 0.05 pc459: 0.05 pc59: 0.05 
voc20: 0.1 76 | REPLACE_RATIO: 0.15 77 | REPLACE_LAYER: [1, 3, 5, 7, 9] 78 | DATASETS: 79 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 80 | TEST: ("ade20k_sem_seg_val",) 81 | SOLVER: 82 | IMS_PER_BATCH: 32 83 | BASE_LR: 0.00006 84 | MAX_ITER: 120000 85 | WARMUP_FACTOR: 1e-6 86 | WARMUP_ITERS: 1500 87 | LR_SCHEDULER_NAME: "WarmupPolyLR" 88 | WEIGHT_DECAY: 0.01 89 | WEIGHT_DECAY_NORM: 0.0 90 | WEIGHT_DECAY_EMBED: 0.0 91 | BACKBONE_MULTIPLIER: 1.0 92 | TEST_IMS_PER_BATCH: 1 93 | CLIP_GRADIENTS: 94 | ENABLED: True 95 | CLIP_TYPE: "full_model" 96 | CLIP_VALUE: 0.01 97 | NORM_TYPE: 2.0 98 | INPUT: 99 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 100 | MIN_SIZE_TRAIN_SAMPLING: "choice" 101 | MIN_SIZE_TEST: 640 102 | MAX_SIZE_TRAIN: 2560 103 | MAX_SIZE_TEST: 2560 104 | CROP: 105 | ENABLED: True 106 | TYPE: "absolute" 107 | SIZE: (640, 640) 108 | SINGLE_CATEGORY_MAX_AREA: 1.0 109 | COLOR_AUG_SSD: True 110 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 111 | FORMAT: "RGB" 112 | TEST: 113 | EVAL_PERIOD: 5000 114 | # SEMANTIC_ON: True 115 | # INSTANCE_ON: False 116 | # PANOPTIC_ON: False 117 | AUG: 118 | ENABLED: False 119 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 120 | MAX_SIZE: 3584 121 | FLIP: True 122 | DATALOADER: 123 | FILTER_EMPTY_ANNOTATIONS: True 124 | NUM_WORKERS: 16 125 | VERSION: 2 126 | METRIC: 'Vanilla' # Vanilla or SG-IoU 127 | OUTPUT_DIR: output/SCAN-VitL -------------------------------------------------------------------------------- /configs/scan_vitL_demo.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SCANDEMO" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | SELECT_ORI_CLIP_ID: [12, 18, 24] 19 | FREQUENCY_SIGMA: [9, 7, 3] 20 | SEM_SEG_HEAD: 21 | NAME: "OpenVocaMask2FormerHead" 22 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 23 | IGNORE_VALUE: 255 24 | NUM_CLASSES: 171 # number of categories in training set 25 | EMBEDDING_DIM: 768 26 | EMBED_LAYERS: 2 27 | COMMON_STRIDE: 4 # not used, hard-coded 28 | LOSS_WEIGHT: 1.0 29 | CONVS_DIM: 256 30 | MASK_DIM: 256 31 | NORM: "GN" 32 | # pixel decoder 33 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 34 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 35 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 36 | COMMON_STRIDE: 4 37 | TRANSFORMER_ENC_LAYERS: 6 38 | MASK_FORMER: 39 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 40 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 41 | DEEP_SUPERVISION: True 42 | NO_OBJECT_WEIGHT: 0.1 43 | CLASS_WEIGHT: 2.0 44 | MASK_WEIGHT: 5.0 45 | DICE_WEIGHT: 5.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 100 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | CLIP_ADAPTER: 60 | TEXT_TEMPLATES: "vild" 61 | CLIP_MODEL_NAME: "ViT-L/14" 62 | MASK_FILL: "mean" 63 | MASK_EXPAND_RATIO: 1.0 64 | MASK_THR: 0.4 # choose the 
foreground objects 65 | MASK_MATTING: False # use soft background, default not used 66 | REGION_RESIZED: True # resize to the input of clip, e.g., 224 67 | CLIP_ENSEMBLE: True # use ensemble of two classification branches 68 | CLIP_ENSEMBLE_WEIGHT: 0.7 69 | REPLACE_RATIO: 0.15 70 | REPLACE_LAYER: [1, 3, 5, 7, 9] 71 | DATASETS: 72 | TRAIN: ("coco_2017_train_stuff_sem_seg",) 73 | TEST: ("ade20k_sem_seg_val",) 74 | SOLVER: 75 | IMS_PER_BATCH: 32 76 | BASE_LR: 0.00006 77 | MAX_ITER: 120000 78 | WARMUP_FACTOR: 1e-6 79 | WARMUP_ITERS: 1500 80 | WEIGHT_DECAY: 0.01 81 | WEIGHT_DECAY_NORM: 0.0 82 | WEIGHT_DECAY_EMBED: 0.0 83 | BACKBONE_MULTIPLIER: 1.0 84 | TEST_IMS_PER_BATCH: 1 85 | CLIP_GRADIENTS: 86 | ENABLED: True 87 | CLIP_TYPE: "full_model" 88 | CLIP_VALUE: 0.01 89 | NORM_TYPE: 2.0 90 | INPUT: 91 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 92 | MIN_SIZE_TRAIN_SAMPLING: "choice" 93 | MIN_SIZE_TEST: 640 94 | MAX_SIZE_TRAIN: 2560 95 | MAX_SIZE_TEST: 2560 96 | CROP: 97 | ENABLED: True 98 | TYPE: "absolute" 99 | SIZE: (640, 640) 100 | SINGLE_CATEGORY_MAX_AREA: 1.0 101 | COLOR_AUG_SSD: True 102 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 103 | FORMAT: "RGB" 104 | TEST: 105 | EVAL_PERIOD: 5000 106 | AUG: 107 | ENABLED: False 108 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 109 | MAX_SIZE: 3584 110 | FLIP: True 111 | DATALOADER: 112 | FILTER_EMPTY_ANNOTATIONS: True 113 | NUM_WORKERS: 16 114 | VERSION: 2 -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output, index=None): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | if index is not None: 17 | mapping = {i: k for k, i in enumerate(index)} 18 | img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)( 19 | img.astype(np.float) 20 | ).astype(np.uint8) 21 | Image.fromarray(img).save(output) 22 | 23 | 24 | if __name__ == "__main__": 25 | dataset_dir = ( 26 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 27 | ) 28 | print('Caution: we only generate the validation set!') 29 | for name in ["validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import tqdm 5 | import os 6 | import os.path as osp 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from PIL import Image 11 | import scipy.io 12 | 13 | def convert_pc59(mask_path, new_mask_path, pc59_dict): 14 | mat = scipy.io.loadmat(mask_path) 15 | mask = mat['LabelMap'] 16 | 17 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255 18 | for trID, clsID in pc59_dict.items(): 19 | mask_copy[mask == clsID] = trID 20 | 21 | min_value = np.amin(mask_copy) 22 | assert min_value >= 0, print(min_value) 23 | Image.fromarray(mask_copy).save(new_mask_path, "PNG") 24 | 25 | def convert_pc459(mask_path, new_mask_path): 26 | mat = scipy.io.loadmat(mask_path) 27 | mask = mat['LabelMap'] 28 | mask = mask - 1 29 | min_value = np.amin(mask) 30 | assert min_value >= 0, print(min_value) 31 | Image.fromarray(mask).save(new_mask_path, "TIFF") 32 | 33 | 34 | if __name__ == "__main__": 35 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) 36 | print('Caution: we only generate the validation set!') 37 | pc_path = dataset_dir / "VOCdevkit/VOC2010" 38 | 39 | val_list = open(pc_path / "pascalcontext_val.txt", "r") 40 | pc459_labels = open(pc_path / "labels.txt", "r") 41 | pc59_labels = open(pc_path / "59_labels.txt", "r") 42 | 43 | pc459_dict = {} 44 | for line in pc459_labels.readlines(): 45 | if ':' in line: 46 | idx, name = line.split(':') 47 | idx = int(idx.strip()) 48 | name = name.strip() 49 | pc459_dict[name] = idx 50 | 51 | pc59_dict = {} 52 | for i, line in enumerate(pc59_labels.readlines()): 53 | name = line.split(':')[-1].strip() 54 | if name is not '': 55 | pc59_dict[i] = pc459_dict[name] 56 | 57 | pc459_dir = pc_path / "annotations_detectron2" / "pc459_val" 58 | pc459_dir.mkdir(parents=True, exist_ok=True) 59 | pc59_dir = pc_path / "annotations_detectron2" / "pc59_val" 60 | pc59_dir.mkdir(parents=True, exist_ok=True) 61 | 62 | for line in tqdm.tqdm(val_list.readlines()): 63 | fileid = line.strip() 64 | ori_mask = f'{pc_path}/trainval/{fileid}.mat' 65 | pc459_dst = f'{pc459_dir}/{fileid}.tif' 66 | pc59_dst = f'{pc59_dir}/{fileid}.png' 67 | if osp.exists(ori_mask): 68 | convert_pc459(ori_mask, pc459_dst) 69 | convert_pc59(ori_mask, pc59_dst, pc59_dict) 70 | -------------------------------------------------------------------------------- /datasets/prepare_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | # Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_voc_sem_seg.py 4 | 5 | import os 6 | import os.path as osp 7 | from pathlib import Path 8 | import tqdm 9 | 10 | import numpy as np 11 | from PIL import Image 12 | 13 | 14 | clsID_to_trID = { 15 | 0: 255, 16 | 1: 0, 17 | 2: 1, 18 | 3: 2, 19 | 4: 3, 20 | 5: 4, 21 | 6: 5, 22 | 7: 6, 23 | 8: 7, 24 | 9: 8, 25 | 10: 9, 26 | 11: 10, 27 | 12: 11, 28 | 13: 12, 29 | 14: 13, 30 | 15: 14, 31 | 16: 15, 32 | 17: 16, 33 | 18: 17, 34 | 19: 18, 35 | 20: 19, 36 | 255: 255, 37 | } 38 | 39 | def convert_to_trainID( 40 | maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix="" 41 | ): 42 | mask = np.array(Image.open(maskpath)) 43 | mask_copy = np.ones_like(mask, dtype=np.uint8) * 255 44 | for clsID, trID in clsID_to_trID.items(): 45 | mask_copy[mask == clsID] = trID 46 | seg_filename = ( 47 | osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath)) 48 | if is_train 49 | else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath)) 50 | ) 51 | if len(np.unique(mask_copy)) == 1 and np.unique(mask_copy)[0] == 255: 52 | return 53 | Image.fromarray(mask_copy).save(seg_filename, "PNG") 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) 59 | print('Caution: we only generate the validation set!') 60 | voc_path = dataset_dir / "VOCdevkit" / "VOC2012" 61 | out_mask_dir = voc_path / "annotations_detectron2" 62 | out_image_dir = voc_path / "images_detectron2" 63 | for name in ["val"]: 64 | os.makedirs((out_mask_dir / name), exist_ok=True) 65 | os.makedirs((out_image_dir / name), exist_ok=True) 66 | val_list = [ 67 | osp.join(voc_path, "SegmentationClassAug", f + ".png") 68 | for f in np.loadtxt(osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=np.str).tolist() 69 | ] 70 | for file in tqdm.tqdm(val_list): 71 | convert_to_trainID(file, out_mask_dir, is_train=False) 72 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import argparse 5 | import glob 6 | import multiprocessing as mp 7 | import os 8 | import time 9 | import cv2 10 | import tqdm 11 | 12 | from detectron2.config import get_cfg 13 | 14 | from detectron2.projects.deeplab import add_deeplab_config 15 | from detectron2.data.detection_utils import read_image 16 | from detectron2.utils.logger import setup_logger 17 | from scan import add_ovseg_config 18 | 19 | from scan.utils import VisualizationDemo 20 | 21 | # constants 22 | WINDOW_NAME = "Open vocabulary segmentation" 23 | 24 | 25 | def setup_cfg(args): 26 | # load config from file and command-line arguments 27 | cfg = get_cfg() 28 | # for poly lr schedule 29 | add_deeplab_config(cfg) 30 | add_ovseg_config(cfg) 31 | cfg.merge_from_file(args.config_file) 32 | cfg.merge_from_list(args.opts) 33 | cfg.freeze() 34 | return cfg 35 | 36 | 37 | def get_parser(): 38 | parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation") 39 | parser.add_argument( 40 | "--config-file", 41 | default="configs/ovseg_swinB_vitL_mask2former_demo.yaml", 42 | metavar="FILE", 43 | help="path to config file", 44 | ) 45 | parser.add_argument( 46 | "--input", 47 | nargs="+", 48 | help="A list of space separated input images; " 49 | "or a single glob pattern such as 'directory/*.jpg'", 50 | default='./data/ADEChallengeData2016/images/validation/*.jpg' 51 | ) 52 | parser.add_argument( 53 | "--class-names", 54 | nargs="+", 55 | default="building", 56 | help="A list of user-defined class_names" 57 | ) 58 | parser.add_argument( 59 | "--output", 60 | default='./pred', 61 | help="A file or directory to save output visualizations. " 62 | "If not given, will show output in an OpenCV window.", 63 | ) 64 | parser.add_argument( 65 | "--opts", 66 | help="Modify config options using the command-line 'KEY VALUE' pairs", 67 | default=['MODEL.WEIGHTS', '/opt/tiger/ljyaronld/OVSeg/ckpt/SwinB-Mask2Former-openclip_datacomp-frequency_121824-aux-split_query_only_crossattn-final.pth'], 68 | nargs=argparse.REMAINDER, 69 | ) 70 | return parser 71 | 72 | 73 | if __name__ == "__main__": 74 | mp.set_start_method("spawn", force=True) 75 | args = get_parser().parse_args() 76 | setup_logger(name="fvcore") 77 | logger = setup_logger() 78 | logger.info("Arguments: " + str(args)) 79 | 80 | cfg = setup_cfg(args) 81 | 82 | demo = VisualizationDemo(cfg) 83 | classes = [] 84 | with open('/opt/tiger/ljyaronld/OVSeg/a_150.txt', 'r') as file: 85 | for line in file: 86 | classes.append(line.strip()) 87 | class_names = classes 88 | class_names = args.class_names 89 | if args.input: 90 | if len(args.input) == 1: 91 | args.input = glob.glob(os.path.expanduser(args.input[0])) 92 | assert args.input, "The input path(s) was not found" 93 | for path in tqdm.tqdm(args.input, disable=not args.output): 94 | # use PIL, to be consistent with evaluation 95 | img = read_image(path, format="BGR") 96 | start_time = time.time() 97 | predictions, visualized_output = demo.run_on_image(img, class_names) 98 | logger.info( 99 | "{}: {} in {:.2f}s".format( 100 | path, 101 | "detected {} instances".format(len(predictions["instances"])) 102 | if "instances" in predictions 103 | else "finished", 104 | time.time() - start_time, 105 | ) 106 | ) 107 | 108 | if args.output: 109 | if os.path.isdir(args.output): 110 | assert os.path.isdir(args.output), args.output 111 | out_filename = os.path.join(args.output, os.path.basename(path)) 112 | else: 113 | assert len(args.input) == 1, "Please specify a directory with args.output" 114 | 
out_filename = args.output 115 | visualized_output.save(out_filename) 116 | else: 117 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 118 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 119 | if cv2.waitKey(0) == 27: 120 | break # esc to quit 121 | else: 122 | raise NotImplementedError -------------------------------------------------------------------------------- /imgs/cs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/cs.png -------------------------------------------------------------------------------- /imgs/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/pipeline.png -------------------------------------------------------------------------------- /imgs/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/results.png -------------------------------------------------------------------------------- /imgs/visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/imgs/visual.png -------------------------------------------------------------------------------- /open_clip_training/.github/workflows/clear-cache.yml: -------------------------------------------------------------------------------- 1 | name: Clear cache 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | actions: write 8 | 9 | jobs: 10 | clear-cache: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clear cache 14 | uses: actions/github-script@v6 15 | with: 16 | script: | 17 | const caches = await github.rest.actions.getActionsCacheList({ 18 | owner: context.repo.owner, 19 | repo: context.repo.repo, 20 | }) 21 | for (const cache of caches.data.actions_caches) { 22 | console.log(cache) 23 | await github.rest.actions.deleteActionsCacheById({ 24 | owner: context.repo.owner, 25 | repo: context.repo.repo, 26 | cache_id: cache.id, 27 | }) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /open_clip_training/.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions-ecosystem/action-regex-match@v2 13 | id: regex-match 14 | with: 15 | text: ${{ github.event.head_commit.message }} 16 | regex: '^Release ([^ ]+)' 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Release 26 | if: ${{ steps.regex-match.outputs.match != '' }} 27 | uses: softprops/action-gh-release@v1 28 | with: 29 | tag_name: v${{ steps.regex-match.outputs.group1 }} 30 | - name: Build and publish 31 | if: ${{ steps.regex-match.outputs.match != '' }} 32 | env: 33 | TWINE_USERNAME: __token__ 34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | twine upload dist/* 38 | 
-------------------------------------------------------------------------------- /open_clip_training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | wandb/ 3 | models/ 4 | features/ 5 | results/ 6 | 7 | tests/data/ 8 | *.pt 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | sync.sh 140 | gpu1sync.sh 141 | .idea 142 | *.pdf 143 | **/._* 144 | **/*DS_* 145 | **.jsonl 146 | src/sbatch 147 | src/misc 148 | .vscode 149 | src/debug 150 | core.* 151 | 152 | # Allow 153 | !src/evaluation/misc/results_dbs/* -------------------------------------------------------------------------------- /open_clip_training/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: If you use this software, please cite it as below. 
3 | authors: 4 | - family-names: Ilharco 5 | given-names: Gabriel 6 | - family-names: Wortsman 7 | given-names: Mitchell 8 | - family-names: Wightman 9 | given-names: Ross 10 | - family-names: Gordon 11 | given-names: Cade 12 | - family-names: Carlini 13 | given-names: Nicholas 14 | - family-names: Taori 15 | given-names: Rohan 16 | - family-names: Dave 17 | given-names: Achal 18 | - family-names: Shankar 19 | given-names: Vaishaal 20 | - family-names: Namkoong 21 | given-names: Hongseok 22 | - family-names: Miller 23 | given-names: John 24 | - family-names: Hajishirzi 25 | given-names: Hannaneh 26 | - family-names: Farhadi 27 | given-names: Ali 28 | - family-names: Schmidt 29 | given-names: Ludwig 30 | title: OpenCLIP 31 | version: v0.1 32 | doi: 10.5281/zenodo.5143773 33 | date-released: 2021-07-28 34 | -------------------------------------------------------------------------------- /open_clip_training/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman, 2 | Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, 3 | John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, 4 | Ludwig Schmidt 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /open_clip_training/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/open_clip/bpe_simple_vocab_16e6.txt.gz 2 | include src/open_clip/model_configs/*.json 3 | 4 | -------------------------------------------------------------------------------- /open_clip_training/Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 
4 | 5 | install-training: 6 | python -m pip install -r requirements-training.txt 7 | 8 | install-test: ## [Local development] Install test requirements 9 | python -m pip install -r requirements-test.txt 10 | 11 | test: ## [Local development] Run unit tests 12 | python -m pytest -x -s -v tests 13 | -------------------------------------------------------------------------------- /open_clip_training/docs/LOW_ACC.md: -------------------------------------------------------------------------------- 1 | As we describe in more detail below, CLIP models in a medium accuracy regime already allow us to draw conclusions about the robustness of larger CLIP models since the models follow reliable scaling laws. 2 | 3 | [Cherti et al., 2022](https://arxiv.org/abs/2212.07143) and [Gadre et al., 2023](https://arxiv.org/abs/2304.14108) show additional discussions about the scaling behavior of CLIP models. 4 | 5 | ## Scaling trends 6 | 7 | The plot below shows how zero-shot performance of CLIP models varies as we scale the number of samples used for training. Zero-shot performance increases steadily for both ImageNet and [ImageNetV2](https://arxiv.org/abs/1902.10811), and is far from saturated at ~15M samples. 8 | 9 | 10 | 11 | ## Why are low-accuracy CLIP models interesting? 12 | 13 | **TL;DR:** CLIP models have high effective robustness, even at small scales. 14 | 15 | CLIP models are particularly intriguing because they are more robust to natural distribution shifts (see Section 3.3 in the [CLIP paper](https://arxiv.org/abs/2103.00020)). 16 | This phenomena is illustrated by the figure below, with ImageNet accuracy on the x-axis 17 | and [ImageNetV2](https://arxiv.org/abs/1902.10811) (a reproduction of the ImageNet validation set with distribution shift) accuracy on the y-axis. 18 | Standard training denotes training on the ImageNet train set and the CLIP zero-shot models 19 | are shown as stars. 20 | 21 | ![CLIP scatter plot](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/effective_robustness.png) 22 | 23 | As observed by [Taori et al., 2020](https://arxiv.org/abs/2007.00644) and [Miller et al., 2021](https://arxiv.org/abs/2107.04649), the in-distribution 24 | and out-of-distribution accuracies of models trained on ImageNet follow a predictable linear trend (the red line in the above plot). *Effective robustness* 25 | quantifies robustness as accuracy beyond this baseline, i.e., how far a model lies above the red line. Ideally a model would not suffer from distribution shift and fall on the y = x line ([trained human labelers are within a percentage point of the y = x line](http://proceedings.mlr.press/v119/shankar20c.html)). 26 | 27 | Even though the CLIP models trained with 28 | this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same 29 | trend of improved effective robustness (the purple line). Therefore, we can study what makes 30 | CLIP robust without requiring industrial-scale compute. 31 | 32 | For more information on effective robustness, please see: 33 | 34 | - [Recht et al., 2019](https://arxiv.org/abs/1902.10811). 35 | - [Taori et al., 2020](https://arxiv.org/abs/2007.00644). 36 | - [Miller et al., 2021](https://arxiv.org/abs/2107.04649). 37 | 38 | To know more about the factors that contribute to CLIP's robustness refer to [Fang et al., 2022](https://arxiv.org/abs/2205.01397). 
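To make the notion concrete: effective robustness is the gap between a model's out-of-distribution accuracy and the accuracy predicted by the baseline trend. The sketch below is a minimal illustration, assuming the trend has already been fit as a line on logit-transformed accuracies (one common convention in this line of work); the slope, intercept, and accuracy values shown are hypothetical placeholders, not numbers from this repository.

```python
import numpy as np

def logit(p):
    # accuracies are mapped to logit scale before fitting the linear trend
    return np.log(p / (1.0 - p))

def effective_robustness(in_dist_acc, out_dist_acc, slope, intercept):
    """Accuracy above the baseline ImageNet -> ImageNetV2 trend (trend fit in logit space)."""
    predicted_logit = slope * logit(in_dist_acc) + intercept
    predicted_acc = 1.0 / (1.0 + np.exp(-predicted_logit))  # map back to accuracy scale
    return out_dist_acc - predicted_acc

# hypothetical example: a CLIP model at 62% ImageNet / 55% ImageNetV2 accuracy,
# measured against a hypothetical baseline fit (slope=0.9, intercept=-0.4)
print(effective_robustness(0.62, 0.55, slope=0.9, intercept=-0.4))
```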
-------------------------------------------------------------------------------- /open_clip_training/docs/clip_conceptual_captions.md: -------------------------------------------------------------------------------- 1 | ## Additional training curves for CLIP on Conceptual Captions 2 | 3 | # Zero shot accuracy 4 | ![](/docs/clip_zeroshot.png) 5 | 6 | # Training loss curve 7 | ![](/docs/clip_loss.png) 8 | 9 | # Validation loss curve 10 | ![](/docs/clip_val_loss.png) 11 | 12 | # Validation recall 13 | ![](/docs/clip_recall.png) -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.56e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3072 \ 11 | --wd 0.2 \ 12 | --batch-size 1024 \ 13 | --aug-cfg scale='(0.4, 1.0)' \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-B-16-CL16 \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 256 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' 28 | -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_b16/i50_t16_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.048e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 782 \ 11 | --wd 0.2 \ 12 | --batch-size 8192 \ 13 | --aug-cfg scale='(0.4, 1.0)' \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-B-16-CL16 \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 112 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.24e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3571 \ 11 | --wd 0.2 \ 12 | --batch-size 896 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-L-16-CL16-GAP \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 293 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' 
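As a reference, a checkpoint written by a finetuning run like the one above (which already tracks ImageNet zero-shot accuracy via `--imagenet-val`) can also be sanity-checked outside the training loop. The sketch below assumes the vendored open_clip package exposes the standard `create_model_and_transforms` / `get_tokenizer` factory API; the model name, checkpoint path, and image file are placeholders — substitute the `--model` value and checkpoint path from your own run.

```python
import torch
from PIL import Image
import open_clip

# placeholders: match these to the --model flag and checkpoint of your run
model_name = "ViT-B-16"
checkpoint = "/path/to/logs/checkpoints/epoch_1.pt"

model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=checkpoint)
tokenizer = open_clip.get_tokenizer(model_name)
model.eval()

image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder image
text = tokenizer(["a photo of a dog", "a photo of a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # normalize embeddings before computing cosine-similarity logits
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(probs)  # per-image probabilities over the candidate captions
```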
-------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i17_t16_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "1.024e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 1563 \ 11 | --wd 0.2 \ 12 | --batch-size 4096 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-L-16-CL16-GAP \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 64 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 64 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_finetune.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "2.24e-5" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 3571 \ 11 | --wd 0.2 \ 12 | --batch-size 896 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 1 \ 15 | --train-num-samples 131072000 \ 16 | --workers 6 \ 17 | --model ViT-L-16-CL32-GAP \ 18 | --pretrained '/path/to/ckpt' \ 19 | --precision 'amp_bf16' \ 20 | --ddp-static-graph \ 21 | --local-loss \ 22 | --gather-with-grad \ 23 | --grad-checkpointing \ 24 | --log-every-n-steps 293 \ 25 | --seed 0 \ 26 | --logs ./logs/ \ 27 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipa/vit_l16/i37_t8_pretrain.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 -m training.main \ 2 | --save-frequency 1 \ 3 | --save-most-recent \ 4 | --zeroshot-frequency 1 \ 5 | --train-data '/path/to/laion-400m' \ 6 | --dataset-type webdataset \ 7 | --lr "1.024e-3" \ 8 | --beta1 0.9 \ 9 | --beta2 0.95 \ 10 | --warmup 1563 \ 11 | --wd 0.2 \ 12 | --batch-size 4096 \ 13 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 14 | --epochs 6 \ 15 | --workers 6 \ 16 | --model ViT-L-16-CL8-Syntax-GAP \ 17 | --precision 'amp_bf16' \ 18 | --ddp-static-graph \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 96 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 64 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i257_t32_finetunex4.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. 
Tune batchsize, warmup, and lr accordingly if you have different machine setups. 4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 131072000 \ 10 | --dataset-type webdataset \ 11 | --lr "5.12e-5" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 800 \ 15 | --wd 0.2 \ 16 | --batch-size 4096 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 4 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL32-GAP \ 21 | --pretrained '/path/to/pretrain84_ckpt' \ 22 | --precision 'amp_bf16' \ 23 | --ddp-static-graph \ 24 | --local-loss \ 25 | --gather-with-grad \ 26 | --force-image-size 224 \ 27 | --force-patch-dropout 0.3 \ 28 | --grad-checkpointing \ 29 | --log-every-n-steps 64 \ 30 | --seed 0 \ 31 | --logs ./logs/ \ 32 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i50_t8_pretrain.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups. 4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 4e8 \ 10 | --dataset-type webdataset \ 11 | --lr "2.048e-3" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 3200 \ 15 | --wd 0.2 \ 16 | --batch-size 8192 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 32 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL8-Syntax-GAP \ 21 | --precision 'amp_bf16' \ 22 | --ddp-static-graph \ 23 | --local-loss \ 24 | --gather-with-grad \ 25 | --force-image-size 84 \ 26 | --grad-checkpointing \ 27 | --log-every-n-steps 32 \ 28 | --seed 0 \ 29 | --logs ./logs/ \ 30 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/clipav2/vit_h14/i577_t32_finetunex1.sh: -------------------------------------------------------------------------------- 1 | # have not been tested. use it at your own discretion 2 | # the original experiment was run on tpu v3-256. 3 | # this example script assumes 8 gpus, each with huge memory. Tune batchsize, warmup, and lr accordingly if you have different machine setups. 
4 | torchrun --nproc_per_node 8 -m training.main \ 5 | --save-frequency 1 \ 6 | --save-most-recent \ 7 | --zeroshot-frequency 1 \ 8 | --train-data '/path/to/laion2b_or_datacomp1b' \ 9 | --train-num-samples 131072000 \ 10 | --dataset-type webdataset \ 11 | --lr "6.4e-6" \ 12 | --beta1 0.9 \ 13 | --beta2 0.95 \ 14 | --warmup 1600 \ 15 | --wd 0.2 \ 16 | --batch-size 2048 \ 17 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 18 | --epochs 1 \ 19 | --workers 6 \ 20 | --model ViT-H-14-CL32-GAP \ 21 | --pretrained '/path/to/finetune224_ckpt' \ 22 | --precision 'amp_bf16' \ 23 | --ddp-static-graph \ 24 | --local-loss \ 25 | --gather-with-grad \ 26 | --force-image-size 336 \ 27 | --force-patch-dropout 0.4 \ 28 | --grad-checkpointing \ 29 | --log-every-n-steps 64 \ 30 | --seed 0 \ 31 | --logs ./logs/ \ 32 | --imagenet-val '/path/to/imagenet/val' -------------------------------------------------------------------------------- /open_clip_training/docs/script_examples/stability_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=g40423 3 | #SBATCH --job-name=testopenclip 4 | #SBATCH --nodes 30 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --output=%x_%j.out 8 | #SBATCH --comment=laion 9 | #SBATCH --open-mode=append 10 | #SBATCH --exclusive 11 | 12 | module load openmpi 13 | module load cuda/11.7 14 | 15 | export MASTER_ADDR=`hostname` 16 | export MASTER_PORT=12802 17 | export NCCL_PROTO=simple 18 | export FI_EFA_FORK_SAFE=1 19 | export FI_LOG_LEVEL=1 20 | export FI_EFA_USE_DEVICE_RDMA=1 21 | export NCCL_DEBUG=info 22 | 23 | export PYTHONFAULTHANDLER=1 24 | 25 | export CUDA_LAUNCH_BLOCKING=0 26 | export OMPI_MCA_mtl_base_verbose=1 27 | export FI_EFA_ENABLE_SHM_TRANSFER=0 28 | export FI_PROVIDER=efa 29 | export FI_EFA_TX_MIN_CREDITS=64 30 | export NCCL_TREE_THRESHOLD=0 31 | 32 | cd /admin/home-mitchellw/open_clip/src 33 | export PYTHONPATH="$PYTHONPATH:/admin/home-mitchellw/open_clip/src" 34 | 35 | EXP_NAME="test-B-32-laion5b-lr1e-3-bs90k" 36 | 37 | srun --comment laion --cpu_bind=v --accel-bind=gn python -m training.main \ 38 | --save-frequency 1 \ 39 | --train-data="pipe:aws s3 cp s3://s-datasets/laion5b/{laion2B-data/{000000..231349}.tar,laion2B-multi-data/{000000..226687}.tar,laion1B-nolang-data/{000000..127231}.tar} -" \ 40 | --train-num-samples 135646078 \ 41 | --dataset-type webdataset \ 42 | --dataset-resampled \ 43 | --warmup 2000 \ 44 | --batch-size=375 \ 45 | --epochs=97 \ 46 | --lr 1e-3 \ 47 | --workers=8 \ 48 | --report-to wandb \ 49 | --name ${EXP_NAME} \ 50 | --logs /scratch/logs/ \ 51 | --model ViT-B-32 \ 52 | --seed 0 \ 53 | --ddp-static-graph \ 54 | --local-loss \ 55 | --gather-with-grad \ 56 | --grad-checkpointing \ 57 | --precision amp_bfloat16 \ 58 | --wandb-project-name open_clip6 \ 59 | --resume "latest" \ 60 | --remote-sync s3://s-laion/mitchellw/logs 61 | -------------------------------------------------------------------------------- /open_clip_training/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | regression_test 4 | -------------------------------------------------------------------------------- /open_clip_training/scripts/clipav1_vit_l16_i37_t8.sh: -------------------------------------------------------------------------------- 1 | # eval on a single gpu 2 | CUDA_VISIBLE_DEVICES=2 TORCH_CUDNN_V8_API_ENABLED=1 
TFDS_PREFETCH_SIZE=8192 python3 -m training.main \ 3 | --model ViT-L-16-CL32-GAP \ 4 | --pretrained "/path/to/clipa_vit_l16_i37_t8.pt" \ 5 | --seed 0 \ 6 | --imagenet-val '/path/to/ImageNet/val' -------------------------------------------------------------------------------- /open_clip_training/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=1 python3 -m training.main \ 2 | --model ViT-H-14-CL32-GAP-BigVision \ 3 | --pretrained "/path/to/vit_h14_i84_224_336_cl32_gap_datacomp1b.pt" \ 4 | --force-image-size 336 \ 5 | --square-resize-only \ 6 | --interpolation 'bilinear' \ 7 | --image-mean 0.485 0.456 0.406 \ 8 | --image-std 0.229 0.224 0.225 \ 9 | --seed 0 \ 10 | --imagenet-val '/path/to/ImageNet/val' 11 | -------------------------------------------------------------------------------- /open_clip_training/scripts/h14_224_32_finetune.sh: -------------------------------------------------------------------------------- 1 | # 64k batchsize for 2.048e-3 lr 2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \ 3 | --save-frequency 1 \ 4 | --save-most-recent \ 5 | --zeroshot-frequency 1 \ 6 | --train-data '/path/to/laion' \ 7 | --dataset-type webdataset \ 8 | --lr "2.048e-3" \ 9 | --beta1 0.9 \ 10 | --beta2 0.95 \ 11 | --warmup 782 \ 12 | --wd 0.2 \ 13 | --batch-size 4096 \ 14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 15 | --epochs=7 \ 16 | --workers=6 \ 17 | --model ViT-H-14-CL32-GAP \ 18 | --precision 'amp_bf16' \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 224 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/ImageNet/val' \ 27 | --name 'name' \ 28 | --report-to "wandb" \ 29 | --wandb-project-name "project_name" 30 | 31 | 32 | -------------------------------------------------------------------------------- /open_clip_training/scripts/h14_84_8_pretrain.sh: -------------------------------------------------------------------------------- 1 | # 64k batchsize for 2.048e-3 lr 2 | TORCH_CUDNN_V8_API_ENABLED=1 torchrun --nproc_per_node 8 -m training.main \ 3 | --save-frequency 1 \ 4 | --save-most-recent \ 5 | --zeroshot-frequency 1 \ 6 | --train-data '/path/to/laion' \ 7 | --dataset-type webdataset \ 8 | --lr "2.048e-3" \ 9 | --beta1 0.9 \ 10 | --beta2 0.95 \ 11 | --warmup 782 \ 12 | --wd 0.2 \ 13 | --batch-size 4096 \ 14 | --aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \ 15 | --epochs=7 \ 16 | --workers=6 \ 17 | --model ViT-H-14-CL8-SyntaxMask-GAP \ 18 | --precision 'amp_bf16' \ 19 | --local-loss \ 20 | --gather-with-grad \ 21 | --force-image-size 84 \ 22 | --grad-checkpointing \ 23 | --log-every-n-steps 32 \ 24 | --seed 0 \ 25 | --logs ./logs/ \ 26 | --imagenet-val '/path/to/ImageNet/val' \ 27 | --name 'name' \ 28 | --report-to "wandb" \ 29 | --wandb-project-name "project_name" 30 | 31 | 32 | -------------------------------------------------------------------------------- /open_clip_training/setup.py: -------------------------------------------------------------------------------- 1 | """ Setup 2 | """ 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with 
open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | def _read_reqs(relpath): 14 | fullpath = path.join(path.dirname(__file__), relpath) 15 | with open(fullpath) as f: 16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))] 17 | 18 | REQUIREMENTS = _read_reqs("requirements.txt") 19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt") 20 | 21 | exec(open('src/open_clip/version.py').read()) 22 | setup( 23 | name='open_clip_torch', 24 | version=__version__, 25 | description='OpenCLIP', 26 | long_description=long_description, 27 | long_description_content_type='text/markdown', 28 | url='https://github.com/mlfoundations/open_clip', 29 | author='', 30 | author_email='', 31 | classifiers=[ 32 | # How mature is this project? Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Education', 38 | 'Intended Audience :: Science/Research', 39 | 'License :: OSI Approved :: Apache Software License', 40 | 'Programming Language :: Python :: 3.7', 41 | 'Programming Language :: Python :: 3.8', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Topic :: Scientific/Engineering', 45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 46 | 'Topic :: Software Development', 47 | 'Topic :: Software Development :: Libraries', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | 51 | # Note that this is a string of words separated by whitespace, not a list. 52 | keywords='CLIP pretrained', 53 | package_dir={'': 'src'}, 54 | packages=find_packages(where='src'), 55 | include_package_data=True, 56 | install_requires=REQUIREMENTS, 57 | extras_require={ 58 | "training": TRAINING_REQUIREMENTS, 59 | }, 60 | python_requires='>=3.7', 61 | ) 62 | -------------------------------------------------------------------------------- /open_clip_training/src/clip_adapter/clip_adapter.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union, Callable, Optional, List 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | from open_clip.factory import create_model_and_transforms 9 | import copy 10 | 11 | class ClipAdapter(nn.Module): 12 | def __init__(self, args, device): 13 | super().__init__() 14 | model, preprocess_train, preprocess_val, preprocess_val_entire = create_model_and_transforms( 15 | args.model, 16 | args.pretrained, 17 | precision=args.precision, 18 | device=device, 19 | jit=args.torchscript, 20 | force_quick_gelu=args.force_quick_gelu, 21 | force_custom_text=args.force_custom_text, 22 | force_patch_dropout=args.force_patch_dropout, 23 | force_image_size=args.force_image_size, 24 | image_mean=args.image_mean, 25 | image_std=args.image_std, 26 | image_interpolation=args.image_interpolation, 27 | image_resize_mode=args.image_resize_mode, # only effective for inference 28 | aug_cfg=args.aug_cfg, 29 | pretrained_image=args.pretrained_image, 30 | output_dict=True, 31 | with_mask=args.with_mask, 32 | mask_emb_depth=args.mask_emb_depth 33 | ) 34 | 35 | self.clip_model = model 36 | self.preprocess_train = preprocess_train 37 | self.preprocess_val = preprocess_val 38 | self.preprocess_val_entire = preprocess_val_entire 39 | 40 | self.original_clip_visual = copy.deepcopy(model.visual) 41 | for _, param in 
self.original_clip_visual.named_parameters(): 42 | param.requires_grad = False 43 | 44 | def forward(self, original_image, image, text, mask=None): 45 | if image is None: 46 | return self.clip_model.encode_text(text) # text-only path: delegate to the wrapped CLIP text encoder 47 | elif text is None: 48 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 49 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768] 50 | 51 | image_features = F.normalize(image_features, dim=-1) # [32, 768] 52 | return {'image_features': image_features} 53 | 54 | if mask is None: 55 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 56 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features) # [32, 768] 57 | else: 58 | ori_image_features = self.original_clip_visual(original_image, get_embedding=True) 59 | image_features = self.clip_model.encode_image(image, ori_image_features=ori_image_features, mask=mask) # [32, 768] 60 | 61 | image_features = F.normalize(image_features, dim=-1) # [32, 768] 62 | 63 | text_features = self.clip_model.encode_text(text) 64 | text_features = F.normalize(text_features, dim=-1) # [32, 768] 65 | 66 | # return image_features, text_features, self.clip_model.logit_scale.exp() 67 | out_dict = { 68 | "image_features": image_features, 69 | "text_features": text_features, 70 | "logit_scale": self.clip_model.logit_scale.exp() 71 | } 72 | if self.clip_model.logit_bias is not None: 73 | out_dict['logit_bias'] = self.clip_model.logit_bias 74 | return out_dict -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \ 8 | get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg 9 | from .openai import load_openai_model, list_openai_models 10 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 11 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 12 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 13 | from .tokenizer import SimpleTokenizer, tokenize, decode 14 | from .transform import image_transform, AugmentationCfg 15 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy 16 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES 17 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/bpe_simple_vocab_16e6.txt.gz
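The ClipAdapter above is a thin wrapper around this fork's create_model_and_transforms, which (unlike upstream open_clip) also returns a preprocess_val_entire transform and accepts with_mask/mask_emb_depth; the wrapper keeps a frozen copy of the original visual tower and feeds its embeddings, plus an optional mask, into encode_image. For orientation, here is a minimal sketch of calling the factory API exported by the __init__.py above directly; the model tag, checkpoint path, and prompts are placeholders, not values taken from this repo:

import torch
import torch.nn.functional as F
from open_clip import create_model_and_transforms, get_tokenizer

# This fork's factory returns four values (see ClipAdapter.__init__ above);
# upstream open_clip returns only three.
model, preprocess_train, preprocess_val, preprocess_val_entire = create_model_and_transforms(
    'ViT-L-14',                   # placeholder model tag from model_configs/
    pretrained='/path/to/ckpt',   # placeholder checkpoint path, as in the scripts above
    output_dict=True,
)
tokenizer = get_tokenizer('ViT-L-14')

with torch.no_grad():
    text = tokenizer(['a photo of a cat', 'a photo of a dog'])
    # Unit-normalized text embeddings, matching what ClipAdapter.forward returns.
    text_features = F.normalize(model.encode_text(text), dim=-1)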
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 4 | IMAGENET_STD = (0.229, 0.224, 0.225) 5 | INCEPTION_MEAN = (0.5, 0.5, 0.5) 6 | INCEPTION_STD = (0.5, 0.5, 0.5) 7 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | # https://huggingface.co/docs/transformers/model_doc/bert 46 | "bert": { 47 | "config_names": { 48 | "context_length": "max_position_embeddings", 49 | "vocab_size": "vocab_size", 50 | "width": "hidden_size", 51 | "heads": "num_attention_heads", 52 | "layers": "num_hidden_layers", 53 | }, 54 | "pooler": "cls_pooler", 55 | }, 56 | # https://huggingface.co/docs/transformers/model_doc/m2m_100 57 | "m2m_100": { 58 | "config_names": { 59 | "context_length": "max_position_embeddings", 60 | "vocab_size": "vocab_size", 61 | "width": "d_model", 62 | "heads": "encoder_attention_heads", 63 | "layers": "encoder_layers", 64 | }, 65 | "pooler": "cls_pooler", 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA01-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | 
"timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA01-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_base_patch16_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-E-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1280, 14 | "heads": 20, 15 | "layers": 32 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-E-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "timm_model_name": "eva02_large_patch14_clip_336", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/EVA02-L-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_large_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 
10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_base_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 512, 7 | "timm_model_name": "vit_base_patch16_siglip_512", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- 
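Each JSON under model_configs/ is registered under its file name, so a tag such as 'ViT-B-16-SigLIP-384' resolves to the file above, with its vision_cfg and text_cfg blocks populating the model's vision and text configuration. A small sketch for inspecting these configs programmatically, using only helpers exported by the package __init__.py shown earlier; the printed values assume the JSON files above are unchanged:

from open_clip import list_models, get_model_config

# Registered model names are the config file names without the .json suffix.
siglip_names = [name for name in list_models() if 'SigLIP' in name]
print(siglip_names)

cfg = get_model_config('ViT-B-16-SigLIP-384')
print(cfg['embed_dim'])                   # 768, per the JSON above
print(cfg['vision_cfg']['image_size'])    # 384
print(cfg['text_cfg']['context_length'])  # 64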
/open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 250000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_base_patch16_siglip_224", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | 
"layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-378-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 378, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": 
"last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": "last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 24, 7 | "width": 1024, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_large_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 
32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_large_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 
6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_so400m_patch14_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_so400m_patch14_siglip_224", 8 | 
"timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 16, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": "bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": "bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | 
"width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "hf_proj_type": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | 
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 
9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/mt5-base-ViT-B-32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "hf_pooler_type": "mean_pooler" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-base-siglip.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "custom_text": true, 4 | "init_logit_bias": -10, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_base_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "hf_model_name": "facebook/nllb-200-distilled-600M", 14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", 15 | "hf_proj_type": "linear", 16 | "hf_pooler_type": "cls_pooler" 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "facebook/nllb-200-distilled-600M", 11 | "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", 12 | "hf_proj_type": "linear", 13 | "hf_pooler_type": "cls_pooler" 14 | } 15 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-large-siglip.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "custom_text": true, 4 | "init_logit_bias": -10, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_so400m_patch14_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "hf_model_name": "facebook/nllb-200-distilled-1.3B", 14 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", 15 | "hf_proj_type": "linear", 16 | "hf_pooler_type": "cls_pooler" 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/nllb-clip-large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "facebook/nllb-200-distilled-1.3B", 12 | "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", 13 | "hf_proj_type": "linear", 14 | "hf_pooler_type": 
"cls_pooler" 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "hf_pooler_type": "mean_pooler" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "hf_pooler_type": "mean_pooler" 14 | } 15 | } 16 | 
-------------------------------------------------------------------------------- /open_clip_training/src/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 13 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 14 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 15 | 16 | __all__ = ["list_openai_models", "load_openai_model"] 17 | 18 | 19 | def list_openai_models() -> List[str]: 20 | """Returns the names of available CLIP models""" 21 | return list_pretrained_models_by_tag('openai') 22 | 23 | 24 | def load_openai_model( 25 | name: str, 26 | precision: Optional[str] = None, 27 | device: Optional[Union[str, torch.device]] = None, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | cache_dir : Optional[str] 41 | The directory to cache the downloaded model weights 42 | 43 | Returns 44 | ------- 45 | model : torch.nn.Module 46 | The CLIP model 47 | preprocess : Callable[[PIL.Image], torch.Tensor] 48 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 49 | """ 50 | if device is None: 51 | device = "cuda" if torch.cuda.is_available() else "cpu" 52 | if precision is None: 53 | precision = 'fp32' if device == 'cpu' else 'fp16' 54 | 55 | if get_pretrained_url(name, 'openai'): 56 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir) 57 | elif os.path.isfile(name): 58 | model_path = name 59 | else: 60 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 61 | 62 | try: 63 | # loading JIT archive 64 | model = torch.jit.load(model_path, map_location="cpu").eval() 65 | state_dict = None 66 | except RuntimeError: 67 | # loading saved state dict 68 | state_dict = torch.load(model_path, map_location="cpu") 69 | 70 | # Build a non-jit model from the OpenAI jitted model state dict 71 | cast_dtype = get_cast_dtype(precision) 72 | try: 73 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 74 | except KeyError: 75 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 76 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 77 | 78 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 79 | model = model.to(device) 80 | # FIXME support pure fp16/bf16 precision modes 81 | if precision != 'fp16': 82 | model.float() 83 | if precision == 'bf16': 84 | # for bf16, convert back to low-precision 85 | convert_weights_to_lp(model, dtype=torch.bfloat16) 86 | 87 | # add mean / std attributes for consistency with OpenCLIP models 88 | model.visual.image_mean = OPENAI_DATASET_MEAN 89 | model.visual.image_std = 
OPENAI_DATASET_STD 90 | return model 91 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/pos_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # Position embedding utils 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | 12 | import torch 13 | 14 | # -------------------------------------------------------- 15 | # 2D sine-cosine position embedding 16 | # References: 17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 18 | # MoCo v3: https://github.com/facebookresearch/moco-v3 19 | # -------------------------------------------------------- 20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 21 | """ 22 | grid_size: int of the grid height and width 23 | return: 24 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 25 | """ 26 | grid_h = np.arange(grid_size, dtype=np.float32) 27 | grid_w = np.arange(grid_size, dtype=np.float32) 28 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 29 | grid = np.stack(grid, axis=0) 30 | 31 | grid = grid.reshape([2, 1, grid_size, grid_size]) 32 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 33 | if cls_token: 34 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 35 | return pos_embed 36 | 37 | 38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 39 | assert embed_dim % 2 == 0 40 | 41 | # use half of dimensions to encode grid_h 42 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 43 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 44 | 45 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 46 | return emb 47 | 48 | 49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 50 | """ 51 | embed_dim: output dimension for each position 52 | pos: a list of positions to be encoded: size (M,) 53 | out: (M, D) 54 | """ 55 | assert embed_dim % 2 == 0 56 | omega = np.arange(embed_dim // 2, dtype=float) 57 | omega /= embed_dim / 2. 58 | omega = 1. 
/ 10000**omega # (D/2,) 59 | 60 | pos = pos.reshape(-1) # (M,) 61 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 62 | 63 | emb_sin = np.sin(out) # (M, D/2) 64 | emb_cos = np.cos(out) # (M, D/2) 65 | 66 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 67 | return emb 68 | 69 | 70 | # -------------------------------------------------------- 71 | # Interpolate position embeddings for high-resolution 72 | # References: 73 | # DeiT: https://github.com/facebookresearch/deit 74 | # -------------------------------------------------------- 75 | def interpolate_pos_embed(model, checkpoint_model): 76 | if 'pos_embed' in checkpoint_model: 77 | pos_embed_checkpoint = checkpoint_model['pos_embed'] 78 | embedding_size = pos_embed_checkpoint.shape[-1] 79 | num_patches = model.patch_embed.num_patches 80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 81 | # height (== width) for the checkpoint position embedding 82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 83 | # height (== width) for the new position embedding 84 | new_size = int(num_patches ** 0.5) 85 | # class_token and dist_token are kept unchanged 86 | if orig_size != new_size: 87 | print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) 88 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 89 | # only the position tokens are interpolated 90 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 91 | pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) 92 | pos_tokens = torch.nn.functional.interpolate( 93 | pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) 94 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 95 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 96 | checkpoint_model['pos_embed'] = new_pos_embed 97 | -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | import torch 5 | from torch import nn as nn 6 | from torchvision.ops.misc import FrozenBatchNorm2d 7 | 8 | 9 | def freeze_batch_norm_2d(module, module_match={}, name=''): 10 | """ 11 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 12 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 13 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 14 | 15 | Args: 16 | module (torch.nn.Module): Any PyTorch module. 
17 | module_match (dict): Dictionary of full module names to freeze (all if empty) 18 | name (str): Full module name (prefix) 19 | 20 | Returns: 21 | torch.nn.Module: Resulting module 22 | 23 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 24 | """ 25 | res = module 26 | is_match = True 27 | if module_match: 28 | is_match = name in module_match 29 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 30 | res = FrozenBatchNorm2d(module.num_features) 31 | res.num_features = module.num_features 32 | res.affine = module.affine 33 | if module.affine: 34 | res.weight.data = module.weight.data.clone().detach() 35 | res.bias.data = module.bias.data.clone().detach() 36 | res.running_mean.data = module.running_mean.data 37 | res.running_var.data = module.running_var.data 38 | res.eps = module.eps 39 | else: 40 | for child_name, child in module.named_children(): 41 | full_child_name = '.'.join([name, child_name]) if name else child_name 42 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 43 | if new_child is not child: 44 | res.add_module(child_name, new_child) 45 | return res 46 | 47 | 48 | # From PyTorch internals 49 | def _ntuple(n): 50 | def parse(x): 51 | if isinstance(x, collections.abc.Iterable): 52 | return x 53 | return tuple(repeat(x, n)) 54 | return parse 55 | 56 | 57 | to_1tuple = _ntuple(1) 58 | to_2tuple = _ntuple(2) 59 | to_3tuple = _ntuple(3) 60 | to_4tuple = _ntuple(4) 61 | to_ntuple = lambda n, x: _ntuple(n)(x) 62 | 63 | # Replaces all linear layers with linear_replacement 64 | # TODO: add int8 support for other linear layers including attn and convnets 65 | def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True): 66 | for name, module in model.named_children(): 67 | if len(list(module.children())) > 0: 68 | replace_linear(module, linear_replacement, include_modules, copy_weights) 69 | 70 | if isinstance(module, torch.nn.Linear) and name in include_modules: 71 | old_module = model._modules[name] 72 | model._modules[name] = linear_replacement( 73 | module.in_features, 74 | module.out_features, 75 | module.bias is not None, 76 | ) 77 | if copy_weights: 78 | model._modules[name].weight.data.copy_(old_module.weight.data) 79 | if model._modules[name].bias is not None: 80 | model._modules[name].bias.data.copy_(old_module.bias) 81 | 82 | return model 83 | 84 | def convert_int8_model_to_inference_mode(model): 85 | for m in model.modules(): 86 | if hasattr(m, 'prepare_for_eval'): 87 | int8_original_dtype = m.weight.dtype 88 | m.prepare_for_eval() 89 | m.int8_original_dtype = int8_original_dtype -------------------------------------------------------------------------------- /open_clip_training/src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.22.0' 2 | -------------------------------------------------------------------------------- /open_clip_training/src/scripts/1cap_finetune_VitL.sh: -------------------------------------------------------------------------------- 1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \ 2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \ 3 | --train-num-samples 442117 \ 4 | --lr 0.000005 \ 5 | --warmup 100 \ 6 | --force-quick-gelu \ 7 | --dataset-type csv \ 8 | --batch-size 32 \ 9 | --precision amp \ 10 | 
--workers 8 \ 11 | --model ViT-L-14 \ 12 | --lock-text \ 13 | --zeroshot-frequency 1 \ 14 | --save-frequency 1 \ 15 | --epochs 10 \ 16 | --pretrained datacomp_xl_s13b_b90k \ 17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val -------------------------------------------------------------------------------- /open_clip_training/src/scripts/finetune_VitL_with_mask.sh: -------------------------------------------------------------------------------- 1 | torchrun --master_port 12345 --nproc_per_node 8 -m training.main \ 2 | --train-data /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/coco_proposal_1cap.csv \ 3 | --train-num-samples 442117 \ 4 | --lr 0.000005 \ 5 | --warmup 100 \ 6 | --force-quick-gelu \ 7 | --dataset-type csv \ 8 | --batch-size 32 \ 9 | --precision amp \ 10 | --workers 8 \ 11 | --model ViT-L-14 \ 12 | --lock-text \ 13 | --zeroshot-frequency 1 \ 14 | --save-frequency 1 \ 15 | --epochs 10 \ 16 | --pretrained datacomp_xl_s13b_b90k \ 17 | --ade-val /opt/tiger/ljyaronld/method7/OVSeg/open_clip_training/openclip_data/ade_gt_150cls_val \ 18 | --with-mask -------------------------------------------------------------------------------- /open_clip_training/src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /open_clip_training/src/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongliu20/SCAN/097fbbdaf7789ac6d2da44533e95521e73fd7c4f/open_clip_training/src/training/__init__.py -------------------------------------------------------------------------------- /open_clip_training/src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 
27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /open_clip_training/src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /open_clip_training/src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - 
start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /open_clip_training/src/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from open_clip import get_input_dtype, get_tokenizer, build_zero_shot_classifier, \ 7 | IMAGENET_CLASSNAMES, OPENAI_IMAGENET_TEMPLATES 8 | from .precision import get_autocast 9 | from .ade150_zeroshot_data import ade150_classnames 10 | from torchmetrics import Accuracy 11 | 12 | def accuracy(output, target, topk=(1,)): 13 | pred = output.topk(max(topk), 1, True, True)[1].t() 14 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 15 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk], pred[0] 16 | 17 | 18 | def run(model, classifier, dataloader, args): 19 | autocast = get_autocast(args.precision) 20 | input_dtype = get_input_dtype(args.precision) 21 | 22 | with torch.no_grad(): 23 | top1, top5, n = 0., 0., 0. 24 | preds = [] 25 | targets = [] 26 | macc = Accuracy('multiclass', num_classes=150, average='macro').cuda() 27 | for images, target, entire_images in tqdm(dataloader, unit_scale=args.batch_size): 28 | if args.with_mask: 29 | images, masks = images 30 | masks = masks.to(device=args.device, dtype=input_dtype) 31 | else: 32 | images = images 33 | masks=None 34 | images = images.to(device=args.device, dtype=input_dtype) 35 | target = target.to(args.device) 36 | entire_images = entire_images.to(device=args.device, dtype=input_dtype) 37 | 38 | with autocast(): 39 | # predict 40 | output = model(original_image=entire_images, image=images, mask=masks, text=None) 41 | image_features = output['image_features'] if isinstance(output, dict) else output[0] 42 | logits = 100. 
* image_features @ classifier 43 | 44 | # measure accuracy 45 | (acc1, acc5), pred = accuracy(logits, target, topk=(1, 5)) 46 | preds.append(pred) 47 | targets.append(target) 48 | top1 += acc1 49 | top5 += acc5 50 | n += images.size(0) 51 | preds = torch.cat(preds) 52 | targets = torch.cat(targets) 53 | top1 = (top1 / n) 54 | top5 = (top5 / n) 55 | return top1, top5, macc(preds, targets).item() 56 | 57 | 58 | def zero_shot_eval(model, data, epoch, args, tokenizer=None): 59 | if 'imagenet-val' not in data and 'imagenet-v2' not in data and 'ade-val' not in data: 60 | return {} 61 | if args.zeroshot_frequency == 0: 62 | return {} 63 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 64 | return {} 65 | if args.distributed and not args.horovod: 66 | model = model.module 67 | 68 | logging.info('Starting zero-shot evaluation.') 69 | if tokenizer is None: 70 | tokenizer = get_tokenizer(args.model) 71 | 72 | logging.info('Building zero-shot classifier') 73 | autocast = get_autocast(args.precision) 74 | with autocast(): 75 | if 'ade-val' in data: 76 | classifier = build_zero_shot_classifier( 77 | model, 78 | tokenizer=tokenizer, 79 | classnames=ade150_classnames, 80 | templates=OPENAI_IMAGENET_TEMPLATES, 81 | num_classes_per_batch=10, 82 | device=args.device, 83 | use_tqdm=True, 84 | ) 85 | else: 86 | classifier = build_zero_shot_classifier( 87 | model, 88 | tokenizer=tokenizer, 89 | classnames=IMAGENET_CLASSNAMES, 90 | templates=OPENAI_IMAGENET_TEMPLATES, 91 | num_classes_per_batch=10, 92 | device=args.device, 93 | use_tqdm=True, 94 | ) 95 | 96 | logging.info('Using classifier') 97 | results = {} 98 | if 'imagenet-val' in data: 99 | top1, top5, _ = run(model, classifier, data['imagenet-val'].dataloader, args) 100 | results['imagenet-zeroshot-val-top1'] = top1 101 | results['imagenet-zeroshot-val-top5'] = top5 102 | if 'imagenet-v2' in data: 103 | top1, top5, _ = run(model, classifier, data['imagenet-v2'].dataloader, args) 104 | results['imagenetv2-zeroshot-val-top1'] = top1 105 | results['imagenetv2-zeroshot-val-top5'] = top5 106 | if 'ade-val' in data: 107 | top1, top5, macc = run(model, classifier, data['ade-val'].dataloader, args) 108 | results['ade150-zeroshot-val-top1'] = top1 109 | results['ade150-zeroshot-val-top5'] = top5 110 | 111 | logging.info('Finished zero-shot evaluation.') 112 | 113 | return results 114 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_hf_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from open_clip.hf_model import _POOLERS, HFTextEncoder 5 | from transformers import AutoConfig 6 | from transformers.modeling_outputs import BaseModelOutput 7 | # test poolers 8 | def test_poolers(): 9 | bs, sl, d = 2, 10, 5 10 | h = torch.arange(sl).repeat(bs).reshape(bs, sl)[..., None] * torch.linspace(0.2, 1., d) 11 | mask = torch.ones(bs, sl, dtype=torch.bool) 12 | mask[:2, 6:] = False 13 | x = BaseModelOutput(h) 14 | for name, cls in _POOLERS.items(): 15 | pooler = cls() 16 | res = pooler(x, mask) 17 | assert res.shape == (bs, d), f"{name} returned wrong shape" 18 | 19 | # test HFTextEncoder 20 | @pytest.mark.parametrize("model_id", ["arampacha/roberta-tiny", "roberta-base", "xlm-roberta-base", "google/mt5-base"]) 21 | def test_pretrained_text_encoder(model_id): 22 | bs, sl, d = 2, 10, 64 23 | cfg = AutoConfig.from_pretrained(model_id) 24 | model = HFTextEncoder(model_id, d, proj_type='linear') 25 | x = torch.randint(0,
cfg.vocab_size, (bs, sl)) 26 | with torch.no_grad(): 27 | emb = model(x) 28 | 29 | assert emb.shape == (bs, d) 30 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_inference_simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from open_clip.factory import get_tokenizer 4 | import pytest 5 | import open_clip 6 | import os 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 8 | 9 | if hasattr(torch._C, '_jit_set_profiling_executor'): 10 | # legacy executor is too slow to compile large models for unit tests 11 | # no need for the fusion performance here 12 | torch._C._jit_set_profiling_executor(True) 13 | torch._C._jit_set_profiling_mode(False) 14 | 15 | 16 | test_simple_models = [ 17 | # model, pretrained, jit, force_custom_text 18 | ("ViT-B-32", "laion2b_s34b_b79k", False, False), 19 | ("ViT-B-32", "laion2b_s34b_b79k", True, False), 20 | ("ViT-B-32", "laion2b_s34b_b79k", True, True), 21 | ("roberta-ViT-B-32", "laion2b_s12b_b32k", False, False), 22 | ] 23 | 24 | 25 | @pytest.mark.parametrize("model_type,pretrained,jit,force_custom_text", test_simple_models) 26 | def test_inference_simple( 27 | model_type, 28 | pretrained, 29 | jit, 30 | force_custom_text, 31 | ): 32 | model, _, preprocess = open_clip.create_model_and_transforms( 33 | model_type, 34 | pretrained=pretrained, 35 | jit=jit, 36 | force_custom_text=force_custom_text, 37 | ) 38 | tokenizer = get_tokenizer(model_type) 39 | 40 | current_dir = os.path.dirname(os.path.realpath(__file__)) 41 | 42 | image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0) 43 | text = tokenizer(["a diagram", "a dog", "a cat"]) 44 | 45 | with torch.no_grad(): 46 | image_features = model.encode_image(image) 47 | text_features = model.encode_text(text) 48 | 49 | text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1) 50 | 51 | assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0] 52 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_num_shards.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from training.data import get_dataset_size 4 | 5 | @pytest.mark.parametrize( 6 | "shards,expected_size", 7 | [ 8 | ('/path/to/shard.tar', 1), 9 | ('/path/to/shard_{000..000}.tar', 1), 10 | ('/path/to/shard_{000..009}.tar', 10), 11 | ('/path/to/shard_{000..009}_{000..009}.tar', 100), 12 | ('/path/to/shard.tar::/path/to/other_shard_{000..009}.tar', 11), 13 | ('/path/to/shard_{000..009}.tar::/path/to/other_shard_{000..009}.tar', 20), 14 | (['/path/to/shard.tar'], 1), 15 | (['/path/to/shard.tar', '/path/to/other_shard.tar'], 2), 16 | ] 17 | ) 18 | def test_num_shards(shards, expected_size): 19 | _, size = get_dataset_size(shards) 20 | assert size == expected_size, f'Expected {expected_size} for {shards} but found {size} instead.' 
21 | -------------------------------------------------------------------------------- /open_clip_training/tests/test_training_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import pytest 5 | from PIL import Image 6 | import torch 7 | from training.main import main 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 10 | 11 | if hasattr(torch._C, '_jit_set_profiling_executor'): 12 | # legacy executor is too slow to compile large models for unit tests 13 | # no need for the fusion performance here 14 | torch._C._jit_set_profiling_executor(True) 15 | torch._C._jit_set_profiling_mode(False) 16 | 17 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 18 | def test_training(): 19 | main([ 20 | '--save-frequency', '1', 21 | '--zeroshot-frequency', '1', 22 | '--dataset-type', "synthetic", 23 | '--train-num-samples', '16', 24 | '--warmup', '1', 25 | '--batch-size', '4', 26 | '--lr', '1e-3', 27 | '--wd', '0.1', 28 | '--epochs', '1', 29 | '--workers', '2', 30 | '--model', 'RN50' 31 | ]) 32 | 33 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 34 | def test_training_coca(): 35 | main([ 36 | '--save-frequency', '1', 37 | '--zeroshot-frequency', '1', 38 | '--dataset-type', "synthetic", 39 | '--train-num-samples', '16', 40 | '--warmup', '1', 41 | '--batch-size', '4', 42 | '--lr', '1e-3', 43 | '--wd', '0.1', 44 | '--epochs', '1', 45 | '--workers', '2', 46 | '--model', 'coca_ViT-B-32' 47 | ]) 48 | 49 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 50 | def test_training_mt5(): 51 | main([ 52 | '--save-frequency', '1', 53 | '--zeroshot-frequency', '1', 54 | '--dataset-type', "synthetic", 55 | '--train-num-samples', '16', 56 | '--warmup', '1', 57 | '--batch-size', '4', 58 | '--lr', '1e-3', 59 | '--wd', '0.1', 60 | '--epochs', '1', 61 | '--workers', '2', 62 | '--model', 'mt5-base-ViT-B-32', 63 | '--lock-text', 64 | '--lock-text-unlocked-layers', '2' 65 | ]) 66 | 67 | 68 | 69 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 70 | def test_training_unfreezing_vit(): 71 | main([ 72 | '--save-frequency', '1', 73 | '--zeroshot-frequency', '1', 74 | '--dataset-type', "synthetic", 75 | '--train-num-samples', '16', 76 | '--warmup', '1', 77 | '--batch-size', '4', 78 | '--lr', '1e-3', 79 | '--wd', '0.1', 80 | '--epochs', '1', 81 | '--workers', '2', 82 | '--model', 'ViT-B-32', 83 | '--lock-image', 84 | '--lock-image-unlocked-groups', '5', 85 | '--accum-freq', '2' 86 | ]) 87 | 88 | 89 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 90 | def test_training_clip_with_jit(): 91 | main([ 92 | '--save-frequency', '1', 93 | '--zeroshot-frequency', '1', 94 | '--dataset-type', "synthetic", 95 | '--train-num-samples', '16', 96 | '--warmup', '1', 97 | '--batch-size', '4', 98 | '--lr', '1e-3', 99 | '--wd', '0.1', 100 | '--epochs', '1', 101 | '--workers', '2', 102 | '--model', 'ViT-B-32', 103 | '--torchscript' 104 | ]) 105 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | wandb 7 | fire 8 | opencv-python 9 | pandas 10 | braceexpand 11 | torch-ema 12 | torchmetrics==0.11.4 13 | setuptools==59.5.0 14 | webdataset>=0.2.5 15 | numpy==1.23.0 
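The pins in `requirements.txt` above (notably `torchmetrics==0.11.4`, `setuptools==59.5.0`, and `numpy==1.23.0`) are relied on by the training and test code. A small, purely illustrative sanity check — not part of the repository — for confirming that an environment matches these pins before running the tests or fine-tuning scripts:

```python
# Illustrative environment check against the pinned entries in requirements.txt above.
# The dictionary below mirrors those pins; adjust it if the requirements change.
from importlib.metadata import PackageNotFoundError, version

pinned = {
    "torchmetrics": "0.11.4",
    "setuptools": "59.5.0",
    "numpy": "1.23.0",
}

for package, expected in pinned.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"mismatch, expected {expected}"
    print(f"{package}: {installed} ({status})")
```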
-------------------------------------------------------------------------------- /scan/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | # from .config import add_maskformer2_config 7 | from .config import add_ovseg_config 8 | 9 | # dataset loading 10 | # from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 11 | # from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 12 | # from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 13 | # MaskFormerInstanceDatasetMapper, 14 | # ) 15 | # from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 16 | # MaskFormerPanopticDatasetMapper, 17 | # ) 18 | # from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | # MaskFormerSemanticDatasetMapper, 20 | # ) 21 | 22 | # models 23 | # from .maskformer_model import MaskFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | # from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | from .ovseg_model import SCAN, SCANDEMO -------------------------------------------------------------------------------- /scan/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .dataset_mappers import * 5 | from . import datasets 6 | from .build import ( 7 | build_detection_train_loader, 8 | build_detection_test_loader, 9 | ) 10 | -------------------------------------------------------------------------------- /scan/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 5 | -------------------------------------------------------------------------------- /scan/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import register_coco_stuff, register_voc_seg 3 | from . import register_cc3m 4 | from . import register_ade20k_full 5 | from . import register_pascal_context -------------------------------------------------------------------------------- /scan/data/datasets/register_voc_seg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | 7 | PASCALVOC20_NAMES = ( 8 | "aeroplane", 9 | "bicycle", 10 | "bird", 11 | "boat", 12 | "bottle", 13 | "bus", 14 | "car", 15 | "cat", 16 | "chair", 17 | "cow", 18 | "diningtable", 19 | "dog", 20 | "horse", 21 | "motorbike", 22 | "person", 23 | "pottedplant", 24 | "sheep", 25 | "sofa", 26 | "train", 27 | "tvmonitor", 28 | ) 29 | 30 | def _get_voc_meta(cat_list): 31 | ret = { 32 | "stuff_classes": cat_list, 33 | } 34 | return ret 35 | 36 | 37 | def register_pascalvoc(root): 38 | root = os.path.join(root, "VOCdevkit/VOC2012") 39 | meta = _get_voc_meta(PASCALVOC20_NAMES) 40 | 41 | for name, image_dirname, sem_seg_dirname in [ 42 | ("val", "JPEGImages", "annotations_detectron2/val"), 43 | ]: 44 | image_dir = os.path.join(root, image_dirname) 45 | gt_dir = os.path.join(root, sem_seg_dirname) 46 | all_name = f"pascalvoc20_sem_seg_{name}" 47 | DatasetCatalog.register( 48 | all_name, 49 | lambda x=image_dir, y=gt_dir: load_sem_seg( 50 | y, x, gt_ext="png", image_ext="jpg" 51 | ), 52 | ) 53 | MetadataCatalog.get(all_name).set( 54 | image_root=image_dir, 55 | sem_seg_root=gt_dir, 56 | evaluator_type="sem_seg", 57 | ignore_label=255, 58 | **meta, 59 | ) 60 | 61 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 62 | register_pascalvoc(_root) 63 | -------------------------------------------------------------------------------- /scan/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .generalized_sem_seg_evaluation import GeneralizedSemSegEvaluator, SGIoU_SemSegEvaluator 5 | -------------------------------------------------------------------------------- /scan/frequency.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class LFM(nn.Module): 6 | def __init__(self, num_channels): 7 | super(LFM, self).__init__() 8 | self.conv1 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 9 | self.conv2 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 10 | 11 | def make_gaussian(self, y_idx, x_idx, height, width, sigma=7): 12 | yv, xv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)]) 13 | 14 | yv = yv.unsqueeze(0).float().cuda() 15 | xv = xv.unsqueeze(0).float().cuda() 16 | 17 | 18 | g = torch.exp(- ((yv - y_idx) ** 2 + (xv - x_idx) ** 2) / (2 * sigma ** 2)) 19 | 20 | return g.unsqueeze(0) #1, 1, H, W 21 | 22 | 23 | def forward(self, x, sigma): 24 | b, c, h, w = x.shape 25 | x = x.float() 26 | y = torch.fft.fft2(x) 27 | 28 | 29 | h_idx, w_idx = h // 2, w // 2 30 | high_filter = self.make_gaussian(h_idx, w_idx, h, w, sigma=sigma) 31 | y = y * (1 - high_filter) 32 | 33 | y_imag = y.imag 34 | y_real = y.real 35 | y_f = torch.cat([y_real, y_imag], dim=1) 36 | y = F.relu(self.conv1(y_f)) 37 | 38 | y = self.conv2(y).float() 39 | y_real, y_imag = torch.chunk(y, 2, dim=1) 40 | y = torch.complex(y_real, y_imag) 41 | 42 | y = torch.fft.ifft2(y, s=(h, w)).float() 43 | return x + y 44 | 45 | class MLP(nn.Module): 46 | def __init__(self, input_dim, output_dim): 47 | super(MLP, self).__init__() 48 | self.fc1 = nn.Linear(input_dim, output_dim) 49 | self.fc2 = nn.Linear(output_dim, output_dim) 50 | 51 | def 
forward(self, x): 52 | x = self.fc2(self.fc1(x)) 53 | return x 54 | 55 | 56 | class CA(nn.Module): 57 | def __init__(self, input_dim, num): 58 | super(CA, self).__init__() 59 | self.num = num 60 | self.multiattn = nn.ModuleList() 61 | self.ln = nn.ModuleList() 62 | for i in range(num): 63 | self.multiattn.append(nn.MultiheadAttention(embed_dim=input_dim, num_heads=8, batch_first=True)) 64 | if i != num - 1: 65 | self.ln.append(nn.LayerNorm(input_dim)) 66 | 67 | def forward(self, tgt, memory): 68 | for i in range(self.num): 69 | tgt = tgt + self.multiattn[i](tgt, memory, memory)[0] 70 | if i != self.num - 1: 71 | tgt = self.ln[i](tgt) 72 | return tgt -------------------------------------------------------------------------------- /scan/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import OpenVocaMask2FormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /scan/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | -------------------------------------------------------------------------------- /scan/modeling/clip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | from .text_template import ( 5 | PredefinedPromptExtractor, 6 | ImageNetPromptExtractor, 7 | VILDPromptExtractor, 8 | ) 9 | from .adapter import ClipAdapter, MaskFormerClipAdapter 10 | 11 | 12 | def build_text_prompt(cfg): 13 | if cfg.TEXT_TEMPLATES == "predefined": 14 | text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES) 15 | elif cfg.TEXT_TEMPLATES == "imagenet": 16 | text_templates = ImageNetPromptExtractor() 17 | elif cfg.TEXT_TEMPLATES == "vild": 18 | text_templates = VILDPromptExtractor() 19 | else: 20 | raise NotImplementedError( 21 | "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES) 22 | ) 23 | return text_templates 24 | -------------------------------------------------------------------------------- /scan/modeling/clip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | from typing import Tuple 5 | import numpy as np 6 | import torch 7 | from detectron2.utils.comm import get_local_rank, synchronize 8 | 9 | 10 | def expand_box( 11 | x1: float, 12 | y1: float, 13 | x2: float, 14 | y2: float, 15 | expand_ratio: float = 1.0, 16 | max_h: int = None, 17 | max_w: int = None, 18 | ): 19 | cx = 0.5 * (x1 + x2) 20 | cy = 0.5 * (y1 + y2) 21 | w = x2 - x1 22 | h = y2 - y1 23 | w = w * expand_ratio 24 | h = h * expand_ratio 25 | box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] 26 | if max_h is not None: 27 | box[1] = max(0, box[1]) 28 | box[3] = min(max_h - 1, box[3]) 29 | if max_w is not None: 30 | box[0] = max(0, box[0]) 31 | box[2] = min(max_w - 1, box[2]) 32 | return [int(b) for b in box] 33 | 34 | 35 | def mask2box(mask: torch.Tensor): 36 | # use naive way 37 | row = torch.nonzero(mask.sum(dim=0))[:, 0] 38 | if len(row) == 0: 39 | return None 40 | x1 = row.min() 41 | x2 = row.max() 42 | col = np.nonzero(mask.sum(dim=1))[:, 0] 43 | y1 = col.min() 44 | y2 = col.max() 45 | return x1, y1, x2 + 1, y2 + 1 46 | 47 | 48 | def crop_with_mask( 49 | image: torch.Tensor, 50 | mask: torch.Tensor, 51 | bbox: torch.Tensor, 52 | fill: Tuple[float, float, float] = (0, 0, 0), 53 | expand_ratio: float = 1.0, 54 | ): 55 | l, t, r, b = expand_box(*bbox, expand_ratio) 56 | _, h, w = image.shape 57 | l = max(l, 0) 58 | t = max(t, 0) 59 | r = min(r, w) 60 | b = min(b, h) 61 | new_image = torch.cat( 62 | [image.new_full((1, b - t, r - l), fill_value=val) for val in fill] 63 | ) 64 | # return image[:, t:b, l:r], mask[None, t:b, l:r] 65 | return image[:, t:b, l:r] * mask[None, t:b, l:r] + (1 - mask[None, t:b, l:r]) * new_image, mask[None, t:b, l:r] -------------------------------------------------------------------------------- /scan/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = 
sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python3 setup.py build install 14 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | 18 | #include <ATen/ATen.h> 19 | #include <ATen/cuda/CUDAContext.h> 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implemented on the CPU"); 32 | } 33 | 34 | std::vector<at::Tensor> 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector<at::Tensor> 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector<at::Tensor> 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /scan/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, 
S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/open_vocab_mask2former_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 4 | 5 | from torch import nn 6 | from detectron2.config import configurable 7 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder, MLP 8 | 9 | 10 | class OpenVocabMask2FormerPredictor(MultiScaleMaskedTransformerDecoder): 11 | @configurable 12 | def __init__( 13 | self, 14 | in_channels, 15 | mask_classification=True, 16 | *, 17 | embedding_dim: int, 18 | embed_hidden_dim: int, 19 | embed_layers: int, 20 | hidden_dim: int, 21 | num_queries: int, 22 | nheads: int, 23 | # dropout: float, 24 | dim_feedforward: int, 25 | # enc_layers: int, 26 | dec_layers: int, 27 | pre_norm: bool, 28 | # deep_supervision: bool, 29 | mask_dim: int, 30 | enforce_input_project: bool, 31 | ): 32 | super().__init__( 33 | in_channels, 34 | False, 35 | num_classes=embedding_dim, 36 | hidden_dim=hidden_dim, 37 | num_queries=num_queries, 38 | nheads=nheads, 39 | # dropout=dropout, 40 | dim_feedforward=dim_feedforward, 41 | # enc_layers=enc_layers, 42 | dec_layers=dec_layers, 43 | pre_norm=pre_norm, 44 | # deep_supervision=deep_supervision, 45 | mask_dim=mask_dim, 46 | enforce_input_project=enforce_input_project, 47 | ) 48 | mask_classification = True 49 | self.mask_classification = mask_classification 50 | # output FFNs 51 | if self.mask_classification: 52 | self.class_embed = MLP( 53 | hidden_dim, embed_hidden_dim, embedding_dim, embed_layers 54 | ) 55 | 56 | def freeze_pretrained(self): 57 | for name, module in self.named_children(): 58 | if name not in ["class_embed"]: 59 | for param in module.parameters(): 60 | param.requires_grad = False 61 | 62 | @classmethod 63 | def from_config(cls, cfg, in_channels, mask_classification): 64 | ret = {} 65 | ret["in_channels"] = in_channels 66 | ret["mask_classification"] = mask_classification 67 | 68 | ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM 69 | ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM 70 | ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS 71 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 72 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 73 | # Transformer parameters: 74 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 75 | # ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 76 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 77 | # ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 78 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 79 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 80 | # ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 81 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 82 | 83 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 84 | 85 | return ret 86 | -------------------------------------------------------------------------------- /scan/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /scan/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /scan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .events import setup_wandb, WandbWriter 3 | from .predictor import VisualizationDemo -------------------------------------------------------------------------------- /scan/utils/events.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import os 5 | import wandb 6 | from detectron2.utils import comm 7 | from detectron2.utils.events import EventWriter, get_event_storage 8 | 9 | 10 | def setup_wandb(cfg, args): 11 | if comm.is_main_process(): 12 | init_args = { 13 | k.lower(): v 14 | for k, v in cfg.WANDB.items() 15 | if isinstance(k, str) and k not in ["config", "name"] 16 | } 17 | # only include most related part to avoid too big table 18 | # TODO: add configurable params to select which part of `cfg` should be saved in config 19 | if "config_exclude_keys" in init_args: 20 | init_args["config"] = cfg 21 | init_args["config"]["cfg_file"] = args.config_file 22 | else: 23 | init_args["config"] = { 24 | "model": cfg.MODEL, 25 | "solver": cfg.SOLVER, 26 | "cfg_file": args.config_file, 27 | } 28 | if ("name" not in init_args) or (init_args["name"] is None): 29 | init_args["name"] = os.path.basename(args.config_file) 30 | # wandb.init(**init_args) 31 | 32 | 33 | class BaseRule(object): 34 | def __call__(self, target): 35 | return target 36 | 37 | 38 | class IsIn(BaseRule): 39 | def __init__(self, keyword: str): 40 | self.keyword = keyword 41 | 42 | def __call__(self, target): 43 | return self.keyword in target 44 | 45 | 46 | class Prefix(BaseRule): 47 | def __init__(self, keyword: str): 48 | self.keyword = keyword 49 | 50 | def __call__(self, target): 51 | return "/".join([self.keyword, target]) 52 | 53 | 54 | class WandbWriter(EventWriter): 55 | """ 56 | Write all scalars to a tensorboard file. 57 | """ 58 | 59 | def __init__(self): 60 | """ 61 | Args: 62 | log_dir (str): the directory to save the output events 63 | kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` 64 | """ 65 | self._last_write = -1 66 | self._group_rules = [ 67 | (IsIn("/"), BaseRule()), 68 | (IsIn("loss"), Prefix("train")), 69 | ] 70 | 71 | def write(self): 72 | 73 | storage = get_event_storage() 74 | 75 | def _group_name(scalar_name): 76 | for (rule, op) in self._group_rules: 77 | if rule(scalar_name): 78 | return op(scalar_name) 79 | return scalar_name 80 | 81 | stats = { 82 | _group_name(name): scalars[0] 83 | for name, scalars in storage.latest().items() 84 | if scalars[1] > self._last_write 85 | } 86 | if len(stats) > 0: 87 | self._last_write = max([v[1] for k, v in storage.latest().items()]) 88 | 89 | # storage.put_{image,histogram} is only meant to be used by 90 | # tensorboard writer. So we access its internal fields directly from here. 91 | if len(storage._vis_data) >= 1: 92 | stats["image"] = [ 93 | wandb.Image(img, caption=img_name) 94 | for img_name, img, step_num in storage._vis_data 95 | ] 96 | # Storage stores all image data and rely on this writer to clear them. 97 | # As a result it assumes only one writer will use its image data. 98 | # An alternative design is to let storage store limited recent 99 | # data (e.g. only the most recent image) that all writers can access. 100 | # In that case a writer may not see all image data if its period is long. 
101 | storage.clear_images() 102 | 103 | if len(storage._histograms) >= 1: 104 | 105 | def create_bar(tag, bucket_limits, bucket_counts, **kwargs): 106 | data = [ 107 | [label, val] for (label, val) in zip(bucket_limits, bucket_counts) 108 | ] 109 | table = wandb.Table(data=data, columns=["label", "value"]) 110 | return wandb.plot.bar(table, "label", "value", title=tag) 111 | 112 | stats["hist"] = [create_bar(**params) for params in storage._histograms] 113 | 114 | storage.clear_histograms() 115 | 116 | if len(stats) == 0: 117 | return 118 | # wandb.log(stats, step=storage.iter) 119 | 120 | def close(self): 121 | wandb.finish() 122 | -------------------------------------------------------------------------------- /scan/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | 17 | def _max_by_axis(the_list): 18 | # type: (List[List[int]]) -> List[int] 19 | maxes = the_list[0] 20 | for sublist in the_list[1:]: 21 | for index, item in enumerate(sublist): 22 | maxes[index] = max(maxes[index], item) 23 | return maxes 24 | 25 | 26 | class NestedTensor(object): 27 | def __init__(self, tensors, mask: Optional[Tensor]): 28 | self.tensors = tensors 29 | self.mask = mask 30 | 31 | def to(self, device): 32 | # type: (Device) -> NestedTensor # noqa 33 | cast_tensor = self.tensors.to(device) 34 | mask = self.mask 35 | if mask is not None: 36 | assert mask is not None 37 | cast_mask = mask.to(device) 38 | else: 39 | cast_mask = None 40 | return NestedTensor(cast_tensor, cast_mask) 41 | 42 | def decompose(self): 43 | return self.tensors, self.mask 44 | 45 | def __repr__(self): 46 | return str(self.tensors) 47 | 48 | 49 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 50 | # TODO make this more general 51 | if tensor_list[0].ndim == 3: 52 | if torchvision._is_tracing(): 53 | # nested_tensor_from_tensor_list() does not export well to ONNX 54 | # call _onnx_nested_tensor_from_tensor_list() instead 55 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 56 | 57 | # TODO make it support different-sized images 58 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 59 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 60 | batch_shape = [len(tensor_list)] + max_size 61 | b, c, h, w = batch_shape 62 | dtype = tensor_list[0].dtype 63 | device = tensor_list[0].device 64 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 65 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 66 | for img, pad_img, m in zip(tensor_list, tensor, mask): 67 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 68 | m[: img.shape[1], : img.shape[2]] = False 69 | else: 70 | raise ValueError("not supported") 71 | return NestedTensor(tensor, mask) 72 | 73 | 74 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 75 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
76 | @torch.jit.unused 77 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 78 | max_size = [] 79 | for i in range(tensor_list[0].dim()): 80 | max_size_i = torch.max( 81 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 82 | ).to(torch.int64) 83 | max_size.append(max_size_i) 84 | max_size = tuple(max_size) 85 | 86 | # work around for 87 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 88 | # m[: img.shape[1], :img.shape[2]] = False 89 | # which is not yet supported in onnx 90 | padded_imgs = [] 91 | padded_masks = [] 92 | for img in tensor_list: 93 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 94 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 95 | padded_imgs.append(padded_img) 96 | 97 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 98 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 99 | padded_masks.append(padded_mask.to(torch.bool)) 100 | 101 | tensor = torch.stack(padded_imgs) 102 | mask = torch.stack(padded_masks) 103 | 104 | return NestedTensor(tensor, mask=mask) 105 | 106 | 107 | def is_dist_avail_and_initialized(): 108 | if not dist.is_available(): 109 | return False 110 | if not dist.is_initialized(): 111 | return False 112 | return True 113 | 114 | def get_gt_binary_masks(gt_semseg): 115 | mask_ids = torch.unique(gt_semseg) 116 | gt_masks = [] 117 | for id in mask_ids: 118 | if id != 255: 119 | gt_masks.append(gt_semseg == id) 120 | gt_masks = torch.stack(gt_masks).float() 121 | return gt_masks -------------------------------------------------------------------------------- /scan/utils/post_process_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import torch 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | try: 9 | import pydensecrf.densecrf as dcrf 10 | from pydensecrf.utils import ( 11 | unary_from_softmax, 12 | unary_from_labels, 13 | create_pairwise_bilateral, 14 | create_pairwise_gaussian, 15 | ) 16 | except: 17 | dcrf = None 18 | 19 | 20 | def dense_crf_post_process( 21 | logits, 22 | image, 23 | n_labels=None, 24 | max_iters=5, 25 | pos_xy_std=(3, 3), 26 | pos_w=3, 27 | bi_xy_std=(80, 80), 28 | bi_rgb_std=(13, 13, 13), 29 | bi_w=10, 30 | ): 31 | """ 32 | logits : [C,H,W] 33 | image : [3,H,W] 34 | """ 35 | if dcrf is None: 36 | raise FileNotFoundError( 37 | "pydensecrf is required to perform dense crf inference." 38 | ) 39 | if isinstance(logits, torch.Tensor): 40 | logits = F.softmax(logits, dim=0).detach().cpu().numpy() 41 | U = unary_from_softmax(logits) 42 | n_labels = logits.shape[0] 43 | elif logits.ndim == 3: 44 | U = unary_from_softmax(logits) 45 | n_labels = logits.shape[0] 46 | else: 47 | assert n_labels is not None 48 | U = unary_from_labels(logits, n_labels, zero_unsure=False) 49 | 50 | d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels) 51 | 52 | d.setUnaryEnergy(U) 53 | 54 | # This adds the color-independent term, features are the locations only. 55 | d.addPairwiseGaussian( 56 | sxy=pos_xy_std, 57 | compat=pos_w, 58 | kernel=dcrf.DIAG_KERNEL, 59 | normalization=dcrf.NORMALIZE_SYMMETRIC, 60 | ) 61 | 62 | # This adds the color-dependent term, i.e. features are (x,y,r,g,b). 
63 | d.addPairwiseBilateral( 64 | sxy=bi_xy_std, 65 | srgb=bi_rgb_std, 66 | rgbim=image, 67 | compat=bi_w, 68 | kernel=dcrf.DIAG_KERNEL, 69 | normalization=dcrf.NORMALIZE_SYMMETRIC, 70 | ) 71 | # Run five inference steps. 72 | logits = d.inference(max_iters) 73 | logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1])) 74 | return torch.from_numpy(logits) 75 | -------------------------------------------------------------------------------- /tools/convert-pretrained-clip-model-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | 23 | def transform(path): 24 | model = torch.load(path, map_location="cpu") 25 | print(f"loading {path}......") 26 | state_dict = model["model"] 27 | state_dict = { 28 | k.replace("visual_model.", ""): v 29 | for k, v in state_dict.items() 30 | if k.startswith("visual_model") 31 | } 32 | source_keys = [k for k in state_dict.keys() if "relative_coords" in k] 33 | for k in source_keys: 34 | state_dict[ 35 | k.replace("relative_coords", "relative_position_index") 36 | ] = state_dict[k] 37 | del state_dict[k] 38 | 39 | source_keys = [k for k in state_dict.keys() if "atten_mask_matrix" in k] 40 | for k in source_keys: 41 | state_dict[k.replace("atten_mask_matrix", "attn_mask")] = state_dict[k] 42 | del state_dict[k] 43 | 44 | source_keys = [k for k in state_dict.keys() if "rel_pos_embed_table" in k] 45 | for k in source_keys: 46 | state_dict[ 47 | k.replace("rel_pos_embed_table", "relative_position_bias_table") 48 | ] = state_dict[k] 49 | del state_dict[k] 50 | 51 | source_keys = [k for k in state_dict.keys() if "channel_reduction" in k] 52 | for k in source_keys: 53 | state_dict[k.replace("channel_reduction", "reduction")] = state_dict[k] 54 | del state_dict[k] 55 | return { 56 | k if k.startswith("backbone.") else "backbone." + k: v 57 | for k, v in state_dict.items() 58 | } 59 | 60 | 61 | if __name__ == "__main__": 62 | input = sys.argv[1] 63 | res = { 64 | "model": transform(input), 65 | "__author__": "third_party", 66 | "matching_heuristics": True, 67 | } 68 | with open(sys.argv[2], "wb") as f: 69 | pkl.dump(res, f) 70 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | These models typically produce slightly worse results than the 26 | pre-trained ResNets we use in official configs, which are the 27 | original ResNet models released by MSRA. 28 | """ 29 | 30 | if __name__ == "__main__": 31 | input = sys.argv[1] 32 | 33 | obj = torch.load(input, map_location="cpu") 34 | 35 | newmodel = {} 36 | for k in list(obj.keys()): 37 | old_k = k 38 | if "layer" not in k: 39 | k = "stem." + k 40 | for t in [1, 2, 3, 4]: 41 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 42 | for t in [1, 2, 3]: 43 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 44 | k = k.replace("downsample.0", "shortcut") 45 | k = k.replace("downsample.1", "shortcut.norm") 46 | print(old_k, "->", k) 47 | newmodel[k] = obj.pop(old_k).detach().numpy() 48 | 49 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 50 | 51 | with open(sys.argv[2], "wb") as f: 52 | pkl.dump(res, f) 53 | if obj: 54 | print("Unconverted keys:", obj.keys()) 55 | -------------------------------------------------------------------------------- /tools/replace_clip.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. 
All Rights Reserved 3 | 4 | import torch 5 | from collections import OrderedDict 6 | 7 | 8 | # Path to the finetuned CLIP model 9 | clip_ckpt = torch.load('CS_CLIP.pt') 10 | 11 | new_model = OrderedDict() 12 | state_dict = clip_ckpt['state_dict'] 13 | 14 | for k, v in state_dict.items(): 15 | if 'clip_model' in k: 16 | new_key = k.replace('module.clip_model.','') 17 | new_model[new_key] = v 18 | 19 | # Path to the trained MaskFormer model 20 | ovseg_model = torch.load('Seg_model.pth', map_location='cpu') 21 | 22 | for k, v in new_model.items(): 23 | new_k = 'clip_adapter.clip_model.' + k 24 | if new_k in ovseg_model['model'].keys(): 25 | ovseg_model['model'][new_k] = v 26 | else: 27 | print(f'{new_k} does not exist in ckpt') 28 | try: 29 | ovseg_model['model']['clip_adapter.clip_model.visual.mask_embedding'] = new_model['visual.mask_embedding'] 30 | print('clip_ckpt has mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD True during evaluation') 31 | except KeyError: 32 | print('clip_ckpt does not have mask_embedding, remember to set MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False during evaluation') 33 | 34 | torch.save(ovseg_model, 'SCAN.pth') 35 | --------------------------------------------------------------------------------
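A minimal sanity check for the checkpoint produced by tools/replace_clip.py is sketched below. It is not part of the repository; it simply reloads the merged SCAN.pth written above (the file names mirror the paths hard-coded in the script) and confirms that the transplanted CLIP weights, and optionally the mask_embedding, are present.

import torch

merged = torch.load('SCAN.pth', map_location='cpu')
# Keys written by replace_clip.py all live under the clip_adapter.clip_model prefix.
clip_keys = [k for k in merged['model'] if k.startswith('clip_adapter.clip_model.')]
print(f'{len(clip_keys)} CLIP tensors found in the merged checkpoint')
print('mask_embedding present:',
      'clip_adapter.clip_model.visual.mask_embedding' in merged['model'])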