├── datasets ├── __init__.py ├── __pycache__ │ ├── dtd.cpython-38.pyc │ ├── bamboo.cpython-38.pyc │ ├── eurosat.cpython-38.pyc │ ├── food101.cpython-38.pyc │ ├── sun397.cpython-38.pyc │ ├── ucf101.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ ├── caltech101.cpython-38.pyc │ ├── imagenet.cpython-38.pyc │ ├── imagenet_a.cpython-38.pyc │ ├── imagenet_r.cpython-38.pyc │ ├── imagenetv2.cpython-38.pyc │ ├── imagenet_21k.cpython-38.pyc │ ├── oxford_pets.cpython-38.pyc │ ├── fgvc_aircraft.cpython-38.pyc │ ├── imagenet_sketch.cpython-38.pyc │ ├── oxford_flowers.cpython-38.pyc │ └── stanford_cars.cpython-38.pyc ├── imagenet_sketch.py ├── imagenetv2.py ├── imagenet_r.py ├── imagenet_a.py ├── food101.py ├── caltech101.py ├── fgvc_aircraft.py ├── eurosat.py ├── stanford_cars.py ├── sun397.py ├── ucf101.py ├── oxford_flowers.py ├── dtd.py └── imagenet.py ├── trainers ├── __init__.py ├── vision_benchmark │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ └── prepare_submit.py │ ├── common │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ └── constants.cpython-38.pyc │ │ ├── constants.py │ │ ├── utils.py │ │ └── data_class_base.py │ ├── optim │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── utils.py │ ├── __pycache__ │ │ └── __init__.cpython-38.pyc │ ├── datasets │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── metrics.cpython-38.pyc │ │ │ ├── prompts.cpython-38.pyc │ │ │ ├── hfpt_tokenizer.cpython-38.pyc │ │ │ └── simple_tokenizer.cpython-38.pyc │ │ ├── languages │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── prompt_engineering.py │ │ │ └── hfpt_tokenizer.py │ │ ├── __init__.py │ │ └── hfpt_tokenizer.py │ ├── evaluation │ │ ├── __pycache__ │ │ │ ├── feature.cpython-38.pyc │ │ │ └── __init__.cpython-38.pyc │ │ ├── __init__.py │ │ └── dataset.py │ ├── resources │ │ ├── datasets │ │ │ ├── gtsrb.yaml │ │ │ ├── hateful-memes.yaml │ │ │ ├── mnist.yaml │ │ │ ├── dtd.yaml │ │ │ ├── cifar10.yaml │ │ │ ├── cifar100.yaml │ │ │ ├── country211.yaml │ │ │ ├── eurosat-clip.yaml │ │ │ ├── fer2013.yaml │ │ │ ├── resisc45-clip.yaml │ │ │ ├── rendered-sst2.yaml │ │ │ ├── caltech101.yaml │ │ │ ├── flower102.yaml │ │ │ ├── oxford-iiit-pets.yaml │ │ │ ├── patchcamelyon.yaml │ │ │ ├── stanfordcar.yaml │ │ │ ├── voc2007classification.yaml │ │ │ ├── kitti-distance.yaml │ │ │ ├── food101.yaml │ │ │ └── fgvc-aircraft-2013b.yaml │ │ ├── model │ │ │ ├── example.yaml │ │ │ ├── clip_example.yaml │ │ │ ├── deit_base_patch16_224.yaml │ │ │ ├── vit_base_patch16_224.yaml │ │ │ ├── vit_base_patch32_224.yaml │ │ │ ├── mae_vitb16.yaml │ │ │ ├── mocov3_vitb16.yaml │ │ │ ├── vitb16_CLIP.yaml │ │ │ ├── vitb32_CLIP.yaml │ │ │ ├── vitb32_SLIP.yaml │ │ │ ├── vitb32_DeCLIP.yaml │ │ │ ├── vitb32_DeCLIP_YFCC15M.yaml │ │ │ ├── vitb32_FILIP.yaml │ │ │ └── clip_swin_tiny.yaml │ │ └── knowledge │ │ │ ├── gpt3 │ │ │ ├── GPT3_rendered-sst2.tsv │ │ │ ├── GPT3_patch-camelyon.tsv │ │ │ ├── GPT3_hateful-memes.tsv │ │ │ ├── GPT3_mnist.tsv │ │ │ ├── GPT3_kitti-distance.tsv │ │ │ ├── GPT3_fer-2013.tsv │ │ │ └── GPT3_eurosat_clip.tsv │ │ │ └── external │ │ │ ├── rendered-sst2_knowledge.tsv │ │ │ ├── patch-camelyon_knowledge.tsv │ │ │ ├── kitti-distance_knowledge.tsv │ │ │ ├── hateful-memes_knowledge.tsv │ │ │ ├── fer-2013_knowledge.tsv │ │ │ ├── eurosat_clip_knowledge.tsv │ │ │ ├── mnist_knowledge.tsv │ │ │ └── cifar-10_knowledge.tsv │ └── config │ │ ├── __init__.py │ │ └── models.py ├── __pycache__ │ ├── 
coop.cpython-38.pyc │ ├── mvlpt.cpython-38.pyc │ ├── cocoop.cpython-38.pyc │ ├── zsclip.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ └── imagenet_templates.cpython-38.pyc ├── imagenet_templates.py └── zsclip.py ├── clip ├── __init__.py ├── bpe_simple_vocab_16e6.txt.gz └── __pycache__ │ ├── clip.cpython-38.pyc │ ├── model.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ └── simple_tokenizer.cpython-38.pyc ├── configs ├── datasets │ ├── sun397.yaml │ ├── ucf101.yaml │ ├── eurosat.yaml │ ├── food101.yaml │ ├── imagenet.yaml │ ├── oxford_pets.yaml │ ├── caltech101.yaml │ ├── dtd.yaml │ ├── imagenet_a.yaml │ ├── imagenet_r.yaml │ ├── imagenetv2.yaml │ ├── oxford_flowers.yaml │ ├── fgvc_aircraft.yaml │ ├── stanford_cars.yaml │ └── imagenet_sketch.yaml └── trainers │ ├── CoOp │ ├── rn50_val.yaml │ ├── rn50.yaml │ ├── rn101.yaml │ ├── rn101_ep50.yaml │ ├── rn50_ep50.yaml │ ├── vit_b16_ep50.yaml │ ├── vit_b32.yaml │ ├── vit_b32_ep50.yaml │ ├── vit_l14.yaml │ ├── rn50_ep100.yaml │ ├── vit_b16_ep100.yaml │ ├── vit_l14_336.yaml │ ├── rn50_ctxv1.yaml │ ├── rn50_ep50_ctxv1.yaml │ ├── vit_b16_ctxv1.yaml │ ├── vit_b16_ep50_ctxv1.yaml │ ├── vit_b16_ep100_ctxv1.yaml │ └── vit_b16.yaml │ ├── MVLPT │ ├── rn50_val.yaml │ ├── rn50.yaml │ ├── rn101.yaml │ ├── rn50_ep50.yaml │ ├── rn101_ep50.yaml │ ├── vit_b32.yaml │ ├── vit_l14.yaml │ ├── rn50_ep100.yaml │ ├── vit_b16_ep100.yaml │ ├── vit_b16_ep50.yaml │ ├── vit_b32_ep50.yaml │ ├── vit_l14_336.yaml │ ├── rn50_ctxv1.yaml │ ├── rn50_ep50_ctxv1.yaml │ ├── vit_b16_ctxv1.yaml │ ├── vit_b16_ep50_ctxv1.yaml │ ├── vit_b16_ep100_ctxv1.yaml │ └── vit_b16.yaml │ └── CoCoOp │ ├── vit_b16_c4_ep10_batch1.yaml │ ├── vit_b16_c8_ep10_batch1.yaml │ ├── vit_b16_c16_ep10_batch1.yaml │ └── vit_b16_c4_ep10_batch1_ctxv1.yaml ├── figs ├── fig2-cropped-1.png ├── ablate_flops_all-1.png └── MVLPT_figures-cropped-1.png ├── scripts ├── cocoop │ ├── README.md │ ├── xd_train.sh │ ├── xd_test.sh │ ├── base2new_train.sh │ └── base2new_test.sh ├── coop │ ├── README.md │ ├── zeroshot.sh │ ├── eval.sh │ └── main.sh ├── mvlpt │ ├── zeroshot.sh │ ├── env_mvlpt.yml │ ├── main_mt_coopdata_cut.sh │ ├── main_single_coopdata_cut.sh │ ├── main_mt_elevater_cut.sh │ └── main_single_elevater_cut.sh ├── interpret_prompt.py ├── avg_ckpt.py ├── data.sh └── read_record.py ├── lpclip ├── linear_probe.sh ├── README.md └── feat_extractor.sh ├── requirements.txt ├── LICENSE └── README.md /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /configs/datasets/sun397.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "SUN397" 3 | -------------------------------------------------------------------------------- /configs/datasets/ucf101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "UCF101" 3 | -------------------------------------------------------------------------------- /configs/datasets/eurosat.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "EuroSAT" 3 | -------------------------------------------------------------------------------- /configs/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "Food101" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNet" 3 | -------------------------------------------------------------------------------- /configs/datasets/oxford_pets.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "OxfordPets" -------------------------------------------------------------------------------- /configs/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "Caltech101" 3 | -------------------------------------------------------------------------------- /configs/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "DescribableTextures" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetA" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetR" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetV2" 3 | -------------------------------------------------------------------------------- /configs/datasets/oxford_flowers.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "OxfordFlowers" -------------------------------------------------------------------------------- /configs/datasets/fgvc_aircraft.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "FGVCAircraft" 3 | -------------------------------------------------------------------------------- /configs/datasets/stanford_cars.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "StanfordCars" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetSketch" 3 | 
-------------------------------------------------------------------------------- /figs/fig2-cropped-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/fig2-cropped-1.png -------------------------------------------------------------------------------- /scripts/cocoop/README.md: -------------------------------------------------------------------------------- 1 | These scripts are only for reproducing the results on the CVPR'22 paper. -------------------------------------------------------------------------------- /scripts/coop/README.md: -------------------------------------------------------------------------------- 1 | These scripts are only for reproducing the results on the IJCV'22 paper. -------------------------------------------------------------------------------- /figs/ablate_flops_all-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/ablate_flops_all-1.png -------------------------------------------------------------------------------- /clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /figs/MVLPT_figures-cropped-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/MVLPT_figures-cropped-1.png -------------------------------------------------------------------------------- /clip/__pycache__/clip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/clip.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_optimizer 2 | 3 | __all__ = ['build_optimizer'] 4 | -------------------------------------------------------------------------------- /clip/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /clip/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/dtd.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/dtd.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/coop.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/coop.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/mvlpt.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/mvlpt.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/bamboo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/bamboo.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/eurosat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/eurosat.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/food101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/food101.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/sun397.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/sun397.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/ucf101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/ucf101.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/cocoop.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/cocoop.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/zsclip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/zsclip.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/caltech101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/caltech101.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_a.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_a.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_r.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_r.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenetv2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenetv2.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /clip/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_21k.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_21k.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/oxford_pets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/oxford_pets.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/fgvc_aircraft.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/fgvc_aircraft.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_sketch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_sketch.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/oxford_flowers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/oxford_flowers.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/stanford_cars.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/stanford_cars.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .comm import comm 2 | from .utils import create_logger 3 | 4 | 
__all__ = ['comm', 'create_logger'] 5 | -------------------------------------------------------------------------------- /trainers/__pycache__/imagenet_templates.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/imagenet_templates.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/common/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__pycache__/constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/common/__pycache__/constants.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/metrics.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/prompts.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/prompts.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__pycache__/feature.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/evaluation/__pycache__/feature.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/gtsrb.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'gtsrb' 4 | ROOT: '../DATASET/gtsrb/' 5 | NUM_CLASSES: 43 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/hateful-memes.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'hateful-memes' 3 | ROOT: '../DATASET/hateful_memes/' 4 | NUM_CLASSES: 2 5 | TEST: 6 | METRIC: 'roc_auc' -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/mnist.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'mnist' 4 | ROOT: '../DATASET/mnist/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/hfpt_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/hfpt_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'dtd' 5 | ROOT: '../DATASET/dtd-v1/' 6 | NUM_CLASSES: 47 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/cifar10.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-10' 5 | ROOT: '../../DATASET/cifar10/' 6 | NUM_CLASSES: 10 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/cifar100.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-100' 5 | ROOT: '../DATASET/cifar100/' 6 | NUM_CLASSES: 100 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- 
/trainers/vision_benchmark/resources/datasets/country211.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'country211' 4 | ROOT: '../DATASET/country211/' 5 | NUM_CLASSES: 211 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/eurosat-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'eurosat_clip' 4 | ROOT: '../DATASET/eurosat_clip/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/fer2013.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'fer-2013' 5 | ROOT: '../DATASET/fer2013-v1/' 6 | NUM_CLASSES: 7 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/resisc45-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'resisc45_clip' 4 | ROOT: '../DATASET/resisc45_clip/' 5 | NUM_CLASSES: 45 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /trainers/vision_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .default import _C as config 2 | from .default import update_config 3 | from .models import MODEL_SPECS 4 | 5 | __all__ = ['config', 'update_config', 'MODEL_SPECS'] 6 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/rendered-sst2.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'rendered-sst2' 4 | ROOT: '../DATASET/rendered-sst2/' 5 | NUM_CLASSES: 2 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'caltech-101' 5 | ROOT: '../DATASET/caltech101-tf/' 6 | NUM_CLASSES: 102 7 | TEST: 8 | METRIC: 'mean-per-class' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/flower102.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-flower-102' 4 | ROOT: '../DATASET/flower102/' 5 | NUM_CLASSES: 102 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/oxford-iiit-pets.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-iiit-pets' 4 | ROOT: '../DATASET/pet37/' 5 | NUM_CLASSES: 37 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/patchcamelyon.yaml: 
-------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'patch-camelyon' 5 | ROOT: '../DATASET/patchcamelyon/' 6 | NUM_CLASSES: 2 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/stanfordcar.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'stanford-cars' 5 | ROOT: '../DATASET/stanfordcars/' 6 | NUM_CLASSES: 196 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/voc2007classification.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'voc-2007-classification' 4 | ROOT: '../DATASET/voc2007/' 5 | NUM_CLASSES: 20 6 | TEST: 7 | METRIC: '11point_mAP' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/kitti-distance.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'kitti-distance' 4 | CENTER_CROP: false 5 | ROOT: '../DATASET/kitti_distance_20210923/' 6 | NUM_CLASSES: 4 7 | TEST: 8 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /lpclip/linear_probe.sh: -------------------------------------------------------------------------------- 1 | feature_dir=clip_feat 2 | 3 | for DATASET in OxfordPets 4 | do 5 | python linear_probe.py \ 6 | --dataset ${DATASET} \ 7 | --feature_dir ${feature_dir} \ 8 | --num_step 8 \ 9 | --num_run 3 10 | done 11 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'food-101' 6 | ROOT: '../DATASET/food101/' 7 | NUM_CLASSES: 101 8 | TEST: 9 | METRIC: 'accuracy' 10 | DEBUG: 11 | DEBUG: false 12 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/fgvc-aircraft-2013b.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'fgvc-aircraft-2013b-variants102' 6 | ROOT: '../DATASET/fgvc-aircraft-2013b-variants102/' 7 | NUM_CLASSES: 100 8 | TEST: 9 | METRIC: 'mean-per-class' 10 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: cls_example 12 | NUM_PARAMS_IN_M: 11 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/constants.py: -------------------------------------------------------------------------------- 1 | from vision_datasets import DatasetHub 2 | import pathlib 3 | 4 | 5 | def get_dataset_hub(): 6 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 
'resources' / 'datasets' / 'vision_datasets.json').read_text() 7 | hub = DatasetHub(vision_dataset_json) 8 | 9 | return hub 10 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompts import class_map, template_map, class_map_metric 2 | from .simple_tokenizer import SimpleTokenizer 3 | from .hfpt_tokenizer import HFPTTokenizer 4 | from .metrics import get_metric 5 | 6 | __all__ = ['class_map', 'template_map', 'SimpleTokenizer', 'HFPTTokenizer', 'class_map_metric', 'get_metric'] 7 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from typing import Union, List 6 | 7 | from .simple_tokenizer import SimpleTokenizer 8 | from .hfpt_tokenizer import HFPTTokenizer 9 | 10 | from .build import build_tokenizer 11 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature import extract_features, extract_text_features, construct_dataloader, \ 2 | construct_multitask_dataset 3 | 4 | __all__ = ['extract_features', 'linear_classifier', 'lr_classifier', 'extract_text_features', 'clip_zeroshot_evaluator', 'construct_dataloader', 'full_model_finetune', 'linear_classifier_contrast', 5 | 'construct_multitask_dataset'] 6 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_rendered-sst2.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "gpt3": [" Not positive or neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral."]}, {"classname": "positive", "gpt3": [" Not negative or neutral.", " Not negative or neutral.", " Not negative nor neutral.", " Not negative nor neutral.", " Not negative or neutral."]}] -------------------------------------------------------------------------------- /scripts/coop/zeroshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=ZeroshotCLIP 6 | DATASET=$1 7 | CFG=$2 # rn50, rn101, vit_b32 or vit_b16 8 | 9 | python train.py \ 10 | --root ${DATA} \ 11 | --trainer ${TRAINER} \ 12 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 13 | --config-file configs/trainers/CoOp/${CFG}.yaml \ 14 | --output-dir output/${TRAINER}/${CFG}/${DATASET} \ 15 | --eval-only -------------------------------------------------------------------------------- /scripts/mvlpt/zeroshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=ZeroshotCLIP 6 | DATASET=$1 7 | CFG=$2 # rn50, rn101, vit_b32 or vit_b16 8 | 9 | python train.py \ 10 | --root ${DATA} \ 11 | --trainer ${TRAINER} \ 12 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 13 | --config-file configs/trainers/CoOp/${CFG}.yaml \ 14 | --output-dir 
output/${TRAINER}/${CFG}/${DATASET} \ 15 | --eval-only -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_val.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 200 4 | TEST: 5 | BATCH_SIZE: 200 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | MODEL: 16 | BACKBONE: 17 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_val.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 200 4 | TEST: 5 | BATCH_SIZE: 200 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | MODEL: 16 | BACKBONE: 17 | NAME: "RN50" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/clip_example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_example 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' 16 | # Following configuration is needed for CLIP model. 17 | SPEC: 18 | TEXT: 19 | TOKENIZER: clip 20 | STYLE: clip 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | regex 3 | tqdm 4 | pytorch_lightning==1.4.0 5 | torchmetrics==0.6.0 6 | transformers==4.8.1 7 | Pillow==8.3.1 8 | tqdm==4.53.0 9 | ipdb==0.13.7 10 | numpy==1.19.2 11 | einops==0.3.0 12 | pyarrow==2.0.0 13 | sacred==0.8.2 14 | pandas==1.1.5 15 | git+https://github.com/rwightman/pytorch-image-models.git 16 | scipy 17 | tensorboardX 18 | opencv-python 19 | datasets 20 | nltk 21 | git+https://github.com/KaiyangZhou/Dassl.pytorch.git 22 | vision_datasets 23 | gdown -------------------------------------------------------------------------------- /lpclip/README.md: -------------------------------------------------------------------------------- 1 | # Linear Probe CLIP 2 | 3 | To run linear probe baselines, make sure that your current working directory is `lpclip/`. 4 | 5 | Step 1: Extract Features using the CLIP Image Encoder 6 | ```bash 7 | sh feat_extractor.sh 8 | ``` 9 | 10 | Step 2: Train few-shot linear probe 11 | ```bash 12 | sh linear_probe.sh 13 | ``` 14 | 15 | We follow the instructions stated in the Appendix A3 (pp.38) of [the original CLIP paper](https://arxiv.org/pdf/2103.00020.pdf), with a careful hyperparameter sweep. 16 | 17 | Note: please pull the latest Dassl (version >= `606a2c6`). 
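
As a rough illustration of the hyperparameter sweep mentioned in this README, the snippet below binary-searches the L2 regularization strength of a logistic-regression probe on pre-extracted features, in the spirit of Appendix A.3 of the CLIP paper. It is only a sketch: the function name and the in-memory feature arrays are illustrative assumptions and do not reflect the actual interface of `linear_probe.py` in this folder (only the `num_step` default mirrors the `--num_step 8` flag used in `linear_probe.sh`).

```python
# Hedged sketch of a CLIP-style linear-probe hyperparameter sweep.
# The function name and array inputs are illustrative assumptions, not the
# actual linear_probe.py interface in this repository.
from sklearn.linear_model import LogisticRegression


def sweep_linear_probe(train_feat, train_label, val_feat, val_label, num_step=8):
    """Binary-search log10(C) for an L2-regularized logistic-regression probe."""
    low, high = -6.0, 6.0          # search C over [1e-6, 1e6] on a log scale
    best_acc, best_clf = -1.0, None
    for _ in range(num_step):
        mid = (low + high) / 2.0
        acc_at = {}
        for log_c in (low, mid, high):
            clf = LogisticRegression(C=10.0 ** log_c, max_iter=1000)
            clf.fit(train_feat, train_label)
            acc_at[log_c] = clf.score(val_feat, val_label)
            if acc_at[log_c] > best_acc:
                best_acc, best_clf = acc_at[log_c], clf
        # keep the half of the interval whose endpoint performs better
        if acc_at[low] >= acc_at[high]:
            high = mid
        else:
            low = mid
    return best_clf, best_acc
```

The features here would be the ones written out by `feat_extractor.sh`; repeating the sweep over a few random few-shot splits (as the `--num_run` flag in `linear_probe.sh` suggests) and averaging validation accuracy gives a more stable choice of C.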
18 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/build.py: -------------------------------------------------------------------------------- 1 | from .hfpt_tokenizer import HFPTTokenizer 2 | from .simple_tokenizer import SimpleTokenizer 3 | 4 | 5 | def build_tokenizer(tokenizer_name): 6 | tokenizer = None 7 | if tokenizer_name == 'clip': 8 | tokenizer = SimpleTokenizer() 9 | elif 'hf_' in tokenizer_name: 10 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 11 | elif 'hfc_' in tokenizer_name: 12 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 13 | else: 14 | raise ValueError('Unknown tokenizer') 15 | 16 | return tokenizer 17 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/rendered-sst2_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "def_wiki": "Not positive nor neutral.", "path_wn": ["negative", "denial", "speech_act", "act", "event", "psychological_feature", "abstraction", "entity"], "def_wn": "a reply of denial"}, {"classname": "positive", "def_wiki": "Not negative or neutral.", "path_wn": ["positive", "adjective", "modifier", "content_word", "word", "language_unit", "part", "relation", "abstraction", "entity"], "def_wn": "the primary form of an adjective or adverb; denotes a quality without qualification, comparison, or relation to increase or diminution"}] -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn101.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | 
PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn101_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn101.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | 
NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b32.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b32_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_l14.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn101_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 
224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b32.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_l14.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14" -------------------------------------------------------------------------------- /lpclip/feat_extractor.sh: -------------------------------------------------------------------------------- 1 | # sh feat_extractor.sh 2 | DATA=/path/to/datasets 3 | OUTPUT='./clip_feat/' 4 | SEED=1 5 | 6 | # oxford_pets oxford_flowers fgvc_aircraft dtd eurosat stanford_cars food101 sun397 caltech101 ucf101 imagenet 7 | for DATASET in oxford_pets 8 | do 9 | for SPLIT in train val test 10 | do 11 | python feat_extractor.py \ 12 | --split ${SPLIT} \ 13 | --root ${DATA} \ 14 | --seed ${SEED} \ 15 | --dataset-config-file ../configs/datasets/${DATASET}.yaml \ 16 | --config-file ../configs/trainers/CoOp/rn50_val.yaml \ 17 | --output-dir ${OUTPUT} \ 18 | --eval-only 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 
1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b32_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 
| BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_l14_336.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (336, 336) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14@336px" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_l14_336.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (336, 336) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14@336px" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/deit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/DEIT_BASE_PATCH16_224/' 3 | 4 | MODEL: 5 | NAME: deit_base_patch16_224 6 | NUM_PARAMS_IN_M: 86.5 7 | AUTHOR: 'timm' 8 | PRETRAINED_DATA: 'ImageNet1K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | 13 | TEST: 14 | BATCH_SIZE_PER_GPU: 128 15 | MODEL_FILE: '' 16 | 17 | TRAIN: 18 | BATCH_SIZE_PER_GPU: 64 19 | BEGIN_EPOCH: 0 20 | END_EPOCH: 10 21 | EXTRA_FINAL_TRAIN_EPOCH: 40 22 | OPTIMIZER: sgd 23 | WD: 0. 
24 | MOMENTUM: 0.9 25 | NESTEROV: false 26 | SHUFFLE: true 27 | LR_SCHEDULER: 28 | METHOD: 'WarmupCosine' 29 | WARMUP_EPOCH: 5 30 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | 
COOP: 33 | CTX_INIT: "a photo of a" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | 
-------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep100_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep100_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/patch-camelyon_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "def_wiki": "Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue filled with lymphocytes and macrophages that collect and destroy bacteria, viruses and foreign matter from lymph. When the body is fighting an infection, these lymphocytes multiply rapidly and produce a characteristic swelling of the lymph nodes.", "path_wn": "", "def_wn": ""}, {"classname": "lymph node containing metastatic tumor tissue", "def_wiki": "Thin, woven, gauze-like fabric.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/VIT_BASE_PATCH16_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch16_224 10 | NUM_PARAMS_IN_M: 86.5 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 
28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c4_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 4 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c8_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 8 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vit_base_patch32_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VIT_BASE_PATCH32_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch32_224 10 | NUM_PARAMS_IN_M: 88.2 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 
28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c16_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 16 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | DATASET: 32 | VAL_SET: "" 33 | TEST_SET: "val" 34 | TRAIN_SET: "train" 35 | CENTER_CROP: False 36 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | DATASET: 32 | VAL_SET: "" 33 | TEST_SET: "val" 34 | TRAIN_SET: "train" 35 | CENTER_CROP: False 36 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/kitti-distance_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "def_wiki": "The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car nearby.", "def_wiki": "adjacent, near, close by", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car in the distance.", "def_wiki": "far away; a long 
distance away", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with no car.", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c4_ep10_batch1_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 4 34 | CTX_INIT: "a photo of a" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/hateful-memes_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": ["meme", "acculturation", "content", "cognition", "psychological_feature", "abstraction", "entity"], "def_wn": "a cultural unit (an idea or value or pattern of behavior) that is passed from one person to another by non-genetic means (as by imitation)"}, {"classname": "hatespeech meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /scripts/cocoop/xd_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=imagenet 11 | SEED=$1 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 15 | SHOTS=16 16 | 17 | 18 | DIR=output/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/seed${SEED} 19 | if [ -d "$DIR" ]; then 20 | echo "Oops! 
The results exist at ${DIR} (so skip this job)" 21 | else 22 | python train.py \ 23 | --root ${DATA} \ 24 | --seed ${SEED} \ 25 | --trainer ${TRAINER} \ 26 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 27 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 28 | --output-dir ${DIR} \ 29 | DATASET.NUM_SHOTS ${SHOTS} 30 | fi -------------------------------------------------------------------------------- /scripts/coop/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=CoOp 6 | SHOTS=16 7 | NCTX=16 8 | CSC=False 9 | CTP=end 10 | 11 | DATASET=$1 12 | CFG=$2 13 | 14 | for SEED in 1 2 3 15 | do 16 | python train.py \ 17 | --root ${DATA} \ 18 | --seed ${SEED} \ 19 | --trainer ${TRAINER} \ 20 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 21 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 22 | --output-dir output/evaluation/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/${DATASET}/seed${SEED} \ 23 | --model-dir output/imagenet/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} \ 24 | --load-epoch 50 \ 25 | --eval-only \ 26 | TRAINER.COOP.N_CTX ${NCTX} \ 27 | TRAINER.COOP.CSC ${CSC} \ 28 | TRAINER.COOP.CLASS_TOKEN_POSITION ${CTP} 29 | done -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/mae_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mae_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/mocov3_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mocov3_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/moco-v3/vit-b-300ep/vit-b-300ep.pth.tar' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 
30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /scripts/cocoop/xd_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 15 | SHOTS=16 16 | 17 | 18 | DIR=output/evaluation/${TRAINER}/${CFG}_${SHOTS}shots/${DATASET}/seed${SEED} 19 | if [ -d "$DIR" ]; then 20 | echo "Oops! The results exist at ${DIR} (so skip this job)" 21 | else 22 | python train.py \ 23 | --root ${DATA} \ 24 | --seed ${SEED} \ 25 | --trainer ${TRAINER} \ 26 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 27 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 28 | --output-dir ${DIR} \ 29 | --model-dir output/imagenet/${TRAINER}/${CFG}_${SHOTS}shots/seed${SEED} \ 30 | --load-epoch 10 \ 31 | --eval-only 32 | fi -------------------------------------------------------------------------------- /scripts/cocoop/base2new_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ctxv1 # uncomment this when TRAINER=CoOp 15 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 16 | SHOTS=16 17 | 18 | 19 | DIR=output/base2new/train_base/${DATASET}/shots_${SHOTS}/${TRAINER}/${CFG}/seed${SEED} 20 | if [ -d "$DIR" ]; then 21 | echo "Oops! The results exist at ${DIR} (so skip this job)" 22 | else 23 | python train.py \ 24 | --root ${DATA} \ 25 | --seed ${SEED} \ 26 | --trainer ${TRAINER} \ 27 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 28 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 29 | --output-dir ${DIR} \ 30 | DATASET.NUM_SHOTS ${SHOTS} \ 31 | DATASET.SUBSAMPLE_CLASSES base 32 | fi -------------------------------------------------------------------------------- /scripts/cocoop/base2new_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ctxv1 # uncomment this when TRAINER=CoOp 15 | SHOTS=16 16 | LOADEP=10 17 | SUB=new 18 | 19 | 20 | COMMON_DIR=${DATASET}/shots_${SHOTS}/${TRAINER}/${CFG}/seed${SEED} 21 | MODEL_DIR=output/base2new/train_base/${COMMON_DIR} 22 | DIR=output/base2new/test_${SUB}/${COMMON_DIR} 23 | if [ -d "$DIR" ]; then 24 | echo "Oops! 
The results exist at ${DIR} (so skip this job)" 25 | else 26 | python train.py \ 27 | --root ${DATA} \ 28 | --seed ${SEED} \ 29 | --trainer ${TRAINER} \ 30 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 31 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 32 | --output-dir ${DIR} \ 33 | --model-dir ${MODEL_DIR} \ 34 | --load-epoch ${LOADEP} \ 35 | --eval-only \ 36 | DATASET.NUM_SHOTS ${SHOTS} \ 37 | DATASET.SUBSAMPLE_CLASSES ${SUB} 38 | fi -------------------------------------------------------------------------------- /scripts/coop/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=CoOp 6 | 7 | DATASET=$1 8 | CFG=$2 # config file 9 | CTP=$3 # class token position (end or middle) 10 | NCTX=$4 # number of context tokens 11 | SHOTS=$5 # number of shots (1, 2, 4, 8, 16) 12 | CSC=$6 # class-specific context (False or True) 13 | 14 | for SEED in 1 2 3 15 | do 16 | DIR=output/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 17 | if [ -d "$DIR" ]; then 18 | echo "Oops! The results exist at ${DIR} (so skip this job)" 19 | else 20 | python train.py \ 21 | --root ${DATA} \ 22 | --seed ${SEED} \ 23 | --trainer ${TRAINER} \ 24 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 25 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 26 | --output-dir ${DIR} \ 27 | TRAINER.COOP.N_CTX ${NCTX} \ 28 | TRAINER.COOP.CSC ${CSC} \ 29 | TRAINER.COOP.CLASS_TOKEN_POSITION ${CTP} \ 30 | DATASET.NUM_SHOTS ${SHOTS} 31 | fi 32 | done -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/16' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kaiyang Zhou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def log_arg_env_config(args, config, output_dir): 8 | logging.info("=> collecting env info (might take some time)") 9 | logging.info("\n" + get_pretty_env_info()) 10 | logging.info(pprint.pformat(args)) 11 | logging.info(config) 12 | logging.info(f'=> saving logging info into: {output_dir}') 13 | 14 | 15 | def submit_predictions(prediction_list, submit_by, config, track, task): 16 | from vision_benchmark.commands.submit_predictions import submit_predictions_to_leaderboard, submit_model_to_leaderboard 17 | 18 | submission = { 19 | 'dataset_name': config.DATASET.DATASET, 20 | 'model_name': config.MODEL.NAME, 21 | 'track': track, 22 | 'task': task, 23 | 'created_by': submit_by, 24 | 'predictions': [prediction_list] 25 | } 26 | 27 | logging.info('Submit model and predictions to leaderboard.') 28 | submit_predictions_to_leaderboard(submission) 29 | 30 | model_info = { 31 | "name": config.MODEL.NAME, 32 | "author": config.MODEL.AUTHOR, 33 | "num_params_in_millions": config.MODEL.NUM_PARAMS_IN_M, 34 | "pretrained_data": config.MODEL.PRETRAINED_DATA, 35 | "creation_time": config.MODEL.CREATION_TIME 36 | } 37 | 38 | submit_model_to_leaderboard(model_info) 39 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/fer-2013_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "def_wiki": "Displaying or feeling anger.", "path_wn": ["angry"], "def_wn": "feeling or showing anger"}, {"classname": "disgusted", "def_wiki": "Filled with disgust.", "path_wn": ["disgust", "dislike", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "fill with distaste"}, {"classname": "fearful", "def_wiki": "Frightening.", "path_wn": ["fearful"], "def_wn": "experiencing or showing fear"}, {"classname": "happy", "def_wiki": "Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", "path_wn": ["happy"], "def_wn": "enjoying or showing or marked by joy or pleasure"}, {"classname": "neutral", "def_wiki": "Not taking sides in a conflict such as war; nonaligned.", "path_wn": ["neutral", "person", "causal_agent", "physical_entity", "entity"], "def_wn": "one who does not side with any party in a war or dispute"}, {"classname": "sad", "def_wiki": "Emotionally negative.", "path_wn": ["sad"], "def_wn": "experiencing or showing sorrow or unhappiness; ; - Christina Rossetti"}, {"classname": "surprised", "def_wiki": "Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", "path_wn": ["surprise", "astonishment", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "cause to be surprised"}] -------------------------------------------------------------------------------- /datasets/imagenet_sketch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, 
DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | 9 | @DATASET_REGISTRY.register() 10 | class ImageNetSketch(DatasetBase): 11 | """ImageNet-Sketch. 12 | 13 | This dataset is used for testing only. 14 | """ 15 | 16 | dataset_dir = "imagenet-sketch" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | self.image_dir = os.path.join(self.dataset_dir, "images") 22 | 23 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 24 | classnames = ImageNet.read_classnames(text_file) 25 | 26 | data = self.read_data(classnames) 27 | 28 | super().__init__(train_x=data, test=data) 29 | 30 | def read_data(self, classnames): 31 | image_dir = self.image_dir 32 | folders = listdir_nohidden(image_dir, sort=True) 33 | items = [] 34 | 35 | for label, folder in enumerate(folders): 36 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 37 | classname = classnames[folder] 38 | for imname in imnames: 39 | impath = os.path.join(image_dir, folder, imname) 40 | item = Datum(impath=impath, label=label, classname=classname) 41 | items.append(item) 42 | 43 | return items 44 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pathlib import Path 6 | 7 | import os 8 | import logging 9 | import time 10 | 11 | from .comm import comm 12 | 13 | 14 | def setup_logger(final_output_dir, rank, phase): 15 | time_str = time.strftime('%Y-%m-%d-%H-%M') 16 | log_file = f'{phase}_{time_str}_rank{rank}.txt' 17 | final_log_file = os.path.join(final_output_dir, log_file) 18 | head = "%(asctime)-15s:[P:%(process)d]:" + comm.head + ' %(message)s' 19 | logging.basicConfig( 20 | filename=str(final_log_file), format=head 21 | ) 22 | logger = logging.getLogger() 23 | logger.setLevel(logging.INFO) 24 | console = logging.StreamHandler() 25 | console.setFormatter( 26 | logging.Formatter(head) 27 | ) 28 | logging.getLogger('').addHandler(console) 29 | 30 | 31 | def create_logger(cfg, phase='train'): 32 | root_output_dir = Path(cfg.OUTPUT_DIR) 33 | dataset = cfg.DATASET.DATASET 34 | cfg_name = cfg.NAME 35 | 36 | final_output_dir = root_output_dir / dataset / cfg_name 37 | 38 | print('=> creating {} ...'.format(root_output_dir)) 39 | root_output_dir.mkdir(parents=True, exist_ok=True) 40 | print('=> creating {} ...'.format(final_output_dir)) 41 | final_output_dir.mkdir(parents=True, exist_ok=True) 42 | 43 | print('=> setup logger ...') 44 | setup_logger(final_output_dir, cfg.RANK, phase) 45 | 46 | return str(final_output_dir) 47 | 48 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_patch-camelyon.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "gpt3": [" A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infection by trapping ger", " Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. 
They act as filters, with an internal honeycomb of connective tissue", " Thin, woven, gauze-like fabric.", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger"]}, {"classname": "lymph node containing metastatic tumor tissue", "gpt3": [" Thin, woven, gauze-like fabric.", " Thin, woven, gauze-like fabric.", " A small, oval, soft, elastic body of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of", " Thin, woven, gauze-like fabric.", " A small, oval, solid organ of the lymphatic system, distributed along the lymphatic vessels, that acts as a filter for bacteria, viruses, and foreign matter."]}] -------------------------------------------------------------------------------- /datasets/imagenetv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | 9 | @DATASET_REGISTRY.register() 10 | class ImageNetV2(DatasetBase): 11 | """ImageNetV2. 12 | 13 | This dataset is used for testing only. 14 | """ 15 | 16 | dataset_dir = "imagenetv2" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | image_dir = "imagenetv2-matched-frequency-format-val" 22 | self.image_dir = os.path.join(self.dataset_dir, image_dir) 23 | 24 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 25 | classnames = ImageNet.read_classnames(text_file) 26 | 27 | data = self.read_data(classnames) 28 | 29 | super().__init__(train_x=data, test=data) 30 | 31 | def read_data(self, classnames): 32 | image_dir = self.image_dir 33 | folders = list(classnames.keys()) 34 | items = [] 35 | 36 | for label in range(1000): 37 | class_dir = os.path.join(image_dir, str(label)) 38 | imnames = listdir_nohidden(class_dir) 39 | folder = folders[label] 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(class_dir, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /datasets/imagenet_r.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | TO_BE_IGNORED = ["README.txt"] 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNetR(DatasetBase): 13 | """ImageNet-R(endition). 14 | 15 | This dataset is used for testing only. 
16 | """ 17 | 18 | dataset_dir = "imagenet-rendition" 19 | 20 | def __init__(self, cfg): 21 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 22 | self.dataset_dir = os.path.join(root, self.dataset_dir) 23 | self.image_dir = os.path.join(self.dataset_dir, "imagenet-r") 24 | 25 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 26 | classnames = ImageNet.read_classnames(text_file) 27 | 28 | data = self.read_data(classnames) 29 | 30 | super().__init__(train_x=data, test=data) 31 | 32 | def read_data(self, classnames): 33 | image_dir = self.image_dir 34 | folders = listdir_nohidden(image_dir, sort=True) 35 | folders = [f for f in folders if f not in TO_BE_IGNORED] 36 | items = [] 37 | 38 | for label, folder in enumerate(folders): 39 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(image_dir, folder, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /datasets/imagenet_a.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | TO_BE_IGNORED = ["README.txt"] 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNetA(DatasetBase): 13 | """ImageNet-A(dversarial). 14 | 15 | This dataset is used for testing only. 16 | """ 17 | 18 | dataset_dir = "imagenet-adversarial" 19 | 20 | def __init__(self, cfg): 21 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 22 | self.dataset_dir = os.path.join(root, self.dataset_dir) 23 | self.image_dir = os.path.join(self.dataset_dir, "imagenet-a") 24 | 25 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 26 | classnames = ImageNet.read_classnames(text_file) 27 | 28 | data = self.read_data(classnames) 29 | 30 | super().__init__(train_x=data, test=data) 31 | 32 | def read_data(self, classnames): 33 | image_dir = self.image_dir 34 | folders = listdir_nohidden(image_dir, sort=True) 35 | folders = [f for f in folders if f not in TO_BE_IGNORED] 36 | items = [] 37 | 38 | for label, folder in enumerate(folders): 39 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(image_dir, folder, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_SLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'slip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'SLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for runing linear probe with Pytorch based linear model. 
15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | clip: 42 | use_allgather: False 43 | return_sim: True 44 | feature_dim: 768 45 | sim_dim: 256 46 | 47 | TEST: 48 | BATCH_SIZE_PER_GPU: 128 49 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/SLIP_YFCC15M_vitb32.pth.tar' 50 | 51 | TRAIN: 52 | BATCH_SIZE_PER_GPU: 64 53 | BEGIN_EPOCH: 0 54 | END_EPOCH: 10 55 | EXTRA_FINAL_TRAIN_EPOCH: 40 56 | OPTIMIZER: sgd 57 | WD: 0. 58 | MOMENTUM: 0.9 59 | NESTEROV: false 60 | SHUFFLE: true 61 | LR_SCHEDULER: 62 | METHOD: 'WarmupCosine' 63 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_DeCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 3072 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 3072 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 3072 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_DeCLIP_YFCC15M.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_yfcc_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 
15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 512 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_YFCC15M_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/config/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from yacs.config import CfgNode as CN 6 | 7 | # high_resolution_net related params for classification 8 | HIGH_RESOLUTION_NET = CN() 9 | HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] 10 | HIGH_RESOLUTION_NET.STEM_INPLANES = 64 11 | HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 12 | HIGH_RESOLUTION_NET.WITH_HEAD = True 13 | 14 | HIGH_RESOLUTION_NET.STAGE2 = CN() 15 | HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 16 | HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 17 | HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] 18 | HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] 19 | HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' 20 | HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'CAT' 21 | 22 | HIGH_RESOLUTION_NET.STAGE3 = CN() 23 | HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 24 | HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 25 | HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] 26 | HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] 27 | HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' 28 | HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'CAT' 29 | 30 | HIGH_RESOLUTION_NET.STAGE4 = CN() 31 | HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 32 | HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 33 | HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] 34 | HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] 35 | HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' 36 | HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'CAT' 37 | 38 | RESNEXT = CN() 39 | RESNEXT.NUM_LAYERS = 50 40 | RESNEXT.BASE_WIDTH = 4 41 | RESNEXT.CARDINALITY = 32 42 | RESNEXT.KERNEL_SIZE_STEM = 7 43 | 44 | RESNET = CN() 45 | RESNET.NUM_LAYERS = 50 46 | RESNET.KERNEL_SIZE_STEM = 7 47 | 48 | 49 | MODEL_SPECS = { 50 | 'cls_hrnet': HIGH_RESOLUTION_NET, 51 | } 52 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_FILIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | 
NAME: 'filip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'FILIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 768 17 | DENSE_EVAL: true 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 32 21 | WIDTH: 384 22 | LAYERS: 12 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | SKIP_TOKENIZE: true 32 | DECLIP: 33 | image_encode: 34 | embed_dim: 768 35 | text_encode: 36 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 37 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 38 | text_model_utils: 39 | random: False 40 | freeze: False 41 | embed_dim: 768 42 | clip: 43 | mask_rate: 0.5 44 | patch_number: 14 45 | use_allgather: False 46 | text_mask_type: MLM 47 | return_nn_bank: False 48 | return_dense: True 49 | feature_dim: 768 50 | select_topk: True 51 | 52 | TEST: 53 | BATCH_SIZE_PER_GPU: 128 54 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/FILIP_YFCC15M_vitb32.pth.tar' 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/eurosat_clip_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "forest", "def_wiki": "A dense uncultivated tract of trees and undergrowth, larger than woods.", "path_wn": ["forest", "vegetation", "collection", "group", "abstraction", "entity"], "def_wn": "the trees and other plants in a large densely wooded area"}, {"classname": "brushland or shrubland", "def_wiki": "Land that is covered mostly with shrubs.", "path_wn": "", "def_wn": ""}, {"classname": "highway or road", "def_wiki": "A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard: a country road is the same as a country lane.", "path_wn": "", "def_wn": ""}, {"classname": "industrial buildings or commercial buildings", "def_wiki": "The act or process by which something is built; construction.", "path_wn": "", "def_wn": ""}, {"classname": "pasture land", "def_wiki": "land used for grazing animals", "path_wn": "", "def_wn": ""}, {"classname": "permanent crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "residential buildings or homes or apartments", "def_wiki": "A complete domicile occupying only part of a building, especially one for rent; a flat.", "path_wn": "", "def_wn": ""}, {"classname": "river", "def_wiki": "A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", "path_wn": ["river", "stream", "body_of_water", "thing", "physical_entity", "entity"], "def_wn": "a large natural stream of water (larger than a creek)"}, {"classname": "lake or sea", "def_wiki": "A large body of salt water.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_hateful-memes.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}, {"classname": "hatespeech meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}] -------------------------------------------------------------------------------- /scripts/interpret_prompt.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | 6 | from clip.simple_tokenizer import SimpleTokenizer 7 | from clip import clip 8 | 9 | 10 | def load_clip_to_cpu(backbone_name="RN50"): 11 | url = clip._MODELS[backbone_name] 12 | model_path = clip._download(url) 13 | 14 | try: 15 | # loading JIT archive 16 | model = torch.jit.load(model_path, map_location="cpu").eval() 17 | state_dict = None 18 | 19 | except RuntimeError: 20 | state_dict = torch.load(model_path, map_location="cpu") 21 | 22 | model = clip.build_model(state_dict or model.state_dict()) 23 | 24 | return model 25 | 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("fpath", type=str, help="Path to the learned prompt") 29 | parser.add_argument("topk", type=int, help="Select top-k similar words") 30 | args = parser.parse_args() 31 | 32 | fpath = args.fpath 33 | topk = args.topk 34 | 35 | assert os.path.exists(fpath) 36 | 37 | print(f"Return the top-{topk} matched words") 38 | 39 | tokenizer = SimpleTokenizer() 40 | clip_model = load_clip_to_cpu() 41 | token_embedding = clip_model.token_embedding.weight 42 | print(f"Size of token embedding: {token_embedding.shape}") 43 | 44 | prompt_learner = torch.load(fpath, map_location="cpu")["state_dict"] 45 | ctx = prompt_learner["ctx"] 46 | ctx = ctx.float() 47 | print(f"Size of context: {ctx.shape}") 48 | 49 | if ctx.dim() == 2: 50 | # Generic context 51 | distance = torch.cdist(ctx, token_embedding) 52 | print(f"Size of distance matrix: {distance.shape}") 53 | sorted_idxs = torch.argsort(distance, dim=1) 54 | sorted_idxs = sorted_idxs[:, :topk] 55 | 56 | for m, idxs in enumerate(sorted_idxs): 57 | words = [tokenizer.decoder[idx.item()] for idx in idxs] 58 | dist = [f"{distance[m, idx].item():.4f}" for idx in idxs] 59 | print(f"{m+1}: {words} {dist}") 60 | 61 | elif ctx.dim() == 3: 62 | # Class-specific context 63 | raise NotImplementedError 64 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_mnist.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "gpt3": [" 0.", " The number zero (0).", " A particle used for marking the following verb as an infinitive.", " 0.", " To be in a state of confusion."]}, {"classname": "1", "gpt3": [" 1.", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The act of ingesting.", " The number one (1)."]}, {"classname": "2", "gpt3": [" A particle used for marking the following verb as an infinitive.", " The number two (2).", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The first person to visit the moon."]}, {"classname": "3", "gpt3": [" 0.", " The CIA.", " Because, as, since.", " A particle used for marking the following verb as an infinitive.", " Because, as, since."]}, {"classname": "4", "gpt3": [" The number four (4).", " The first of the four basic operations of arithmetic, that is, the operation of finding the remainder when one number is divided by another.", " Because, as, since.", " A type of small, flat, round cake.", " To be in a state of disrepair."]}, {"classname": "5", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " A kind of animal."]}, {"classname": "6", "gpt3": [" A particle used 
for marking the following verb as an infinitive.", " To be able to.", " The first person to be killed in the novel.", " The number six (6).", " Because, as, since."]}, {"classname": "7", "gpt3": [" To be in a state of disrepair.", " To be ingested.", " To be in a state of confusion.", " To be in a state of being.", " Because, as, since."]}, {"classname": "8", "gpt3": [" To ingest; to be ingested.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " Because, as, since."]}, {"classname": "9", "gpt3": [" The number nine (9).", " A particle used for marking the preceding noun as a subject.", " The number nine (9).", " To be in a state of being.", " The first person to do something."]}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/mnist_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "def_wiki": "0.", "path_wn": ["nothing", "relative_quantity", "measure", "abstraction", "entity"], "def_wn": "a mathematical element that when added to another number yields the same number"}, {"classname": "1", "def_wiki": "The number one (1).", "path_wn": ["one", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the smallest whole number or a numeral representing this number"}, {"classname": "2", "def_wiki": "A particle used for marking the following verb as an infinitive.", "path_wn": ["two", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one or a numeral representing this number"}, {"classname": "3", "def_wiki": null, "path_wn": ["three", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one and one"}, {"classname": "4", "def_wiki": "Because, as, since.", "path_wn": ["four", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of three and one"}, {"classname": "5", "def_wiki": null, "path_wn": ["five", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of four and one"}, {"classname": "6", "def_wiki": "MI6; the agency or a particular agent.", "path_wn": ["six", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of five and one"}, {"classname": "7", "def_wiki": null, "path_wn": ["seven", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of six and one"}, {"classname": "8", "def_wiki": "To ingest; to be ingested.", "path_wn": ["eight", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of seven and one"}, {"classname": "9", "def_wiki": null, "path_wn": ["nine", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of eight and one"}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_kitti-distance.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a 
photo i took of a car on my left or right side.", "gpt3": [" The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", " beside, next to, by", " a wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project."]}, {"classname": "a photo i took with a car nearby.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " near; close", " Nearby; close by; close at hand; close to hand; close by; close to; close on; close to at hand; close to hand; close by; close to; close on; close to at hand; close to hand;"]}, {"classname": "a photo i took with a car in the distance.", "gpt3": [" far away; a long distance away", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " far away; a long distance away", " distant, far away, far off", " far away; a long distance away"]}, {"classname": "a photo i took with no car.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car; no cars", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation."]}] -------------------------------------------------------------------------------- /datasets/food101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class Food101(DatasetBase): 13 | 14 | dataset_dir = "food-101" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "images") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_Food101.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | train, val, test = DTD.read_and_split_data(self.image_dir) 28 | OxfordPets.save_split(train, val, test, self.split_path, 
self.image_dir) 29 | 30 | num_shots = cfg.DATASET.NUM_SHOTS 31 | if num_shots >= 1: 32 | seed = cfg.SEED 33 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 34 | 35 | if os.path.exists(preprocessed): 36 | print(f"Loading preprocessed few-shot data from {preprocessed}") 37 | with open(preprocessed, "rb") as file: 38 | data = pickle.load(file) 39 | train, val = data["train"], data["val"] 40 | else: 41 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 42 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 43 | data = {"train": train, "val": val} 44 | print(f"Saving preprocessed few-shot data to {preprocessed}") 45 | with open(preprocessed, "wb") as file: 46 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 47 | 48 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 49 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 50 | 51 | super().__init__(train_x=train, val=val, test=test) 52 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/clip_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_swin 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K_YFCC15M' 15 | CREATION_TIME: '2021-10-27' 16 | # Following configuration is needed for CLIP model. 17 | PRETRAINED: '' 18 | PRETRAINED_LAYERS: ['*'] 19 | SPEC: 20 | EMBED_DIM: 512 21 | GATHER_TENSORS: True 22 | TEXT: 23 | TOKENIZER: clip 24 | CONTEXT_LENGTH: 77 25 | WIDTH: 512 26 | HEADS: 8 27 | LAYERS: 12 28 | VISION: 29 | PATCH_SIZE: 4 30 | IN_CHANS: 3 31 | EMBED_DIM: 96 32 | DEPTHS: [2, 2, 6, 2] 33 | NUM_HEADS: [3, 6, 12, 24] 34 | WINDOW_SIZE: 7 35 | MLP_RATIO: 4. 36 | QKV_BIAS: True 37 | APE: False 38 | PATCH_NORM: True 39 | DROP_RATE: 0.0 40 | DROP_PATH_RATE: 0.0 41 | 42 | KNOWLEDGE: 43 | WORDNET: 44 | USE_HIERARCHY: False # False 45 | USE_DEFINITION: False # True 46 | 47 | # DATASET: 48 | # DATASET: 'imagenet' 49 | # ROOT: ../../data/zeroshot/classification/imagenet 50 | OUTPUT_DIR: /home/chunyl/azure_mount/chunyleu_output/cvinwild/ic_benchmark/debug/swin_tiny/unicl_imagenet21k 51 | # ../../output/hcl_exp/hcl_yfcc15m_half_imagenet22k_half/wordnet_h_true_d_false 52 | TEST: 53 | MODEL_FILE: '/home/chunyl/azure_mount/chunyleu_output/ckpts/benchmark/swin_tiny/unicl_imagenet21k/model_state_dict.pt' 54 | BATCH_SIZE_PER_GPU: 128 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 
63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 69 | 70 | # hcl_imagenet_21k_wiki 71 | # hcl_imagenet21k 72 | # hcl_yfcc15m_half_imagenet21k_half_multitask 73 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_half_imagenet22k_half/model_state_dict.pt' 74 | 75 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k_multitask/model_state_dict.pt' 76 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k/model_state_dict.pt' 77 | 78 | # hcl_imagenet22k hcl_yfcc15m hcl_yfcc15m_half_imagenet21k_half hcl_yfcc15m_half_imagenet22k_half hcl_yfcc15m_imagenet21k hcl_yfcc15m_imagenet22k hcl_yfcc15m_imagenet22k_multitask 79 | # hcl_imagenet1k 80 | -------------------------------------------------------------------------------- /scripts/avg_ckpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from dassl.utils import ( 4 | MetricMeter, AverageMeter, tolist_if_not, count_num_param, load_checkpoint, 5 | save_checkpoint, mkdir_if_missing, resume_from_checkpoint, 6 | load_pretrained_weights 7 | ) 8 | from collections import OrderedDict 9 | import os.path as osp 10 | 11 | seeds = [1, 2, 3] 12 | 13 | ckpt_dir = "//tmp//Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101/" 14 | # ckpt_path = f"{ckpt_dir}/VPT/vit_b16_5shots/nctx16_csc_ctp/" 15 | ckpt_path = f"{ckpt_dir}/CoOp/vit_b16_5shots/nctx16_csc_ctp/" 16 | 17 | ckpt_dir = "prompt_learner/" 18 | ckpt_name = "model-best.pth.tar" 19 | import numpy as np 20 | 21 | def average_ckpt(state_dict, ignore=['optimizer', 'scheduler']): 22 | new_dict = dict() 23 | print(state_dict['val_result'], state_dict['epoch']) 24 | for key in state_dict: 25 | if key in ignore: 26 | continue 27 | if isinstance(state_dict[key][0], int): 28 | new_dict[key] = int(np.average(state_dict[key])) 29 | elif isinstance(state_dict[key][0], float): 30 | new_dict[key] = np.average(state_dict[key]) 31 | elif isinstance(state_dict[key][0], dict): 32 | avg_dict = dict() 33 | for ckpt_id in range(len(state_dict[key])): 34 | for param_key in state_dict[key][ckpt_id]: 35 | if param_key not in avg_dict: 36 | avg_dict[param_key] = [] 37 | avg_dict[param_key].append( state_dict[key][ckpt_id][param_key] ) 38 | for param_key in avg_dict: 39 | # print(avg_dict[param_key][0].shape) 40 | avg_dict[param_key] = torch.stack( avg_dict[param_key] ).mean(dim=0) 41 | # print(avg_dict[param_key].shape) 42 | new_dict[key] = dict(avg_dict) 43 | return new_dict 44 | 45 | state = {} 46 | for seed in seeds: 47 | model_path = f"{ckpt_path}/seed{seed}/{ckpt_dir}{ckpt_name}" 48 | checkpoint = load_checkpoint(model_path) 49 | for key in checkpoint: 50 | if key not in state: state[key] = [] 51 | state[key].append( checkpoint[key] ) 52 | 53 | avg_ckpt = average_ckpt(state) 54 | 55 | print(avg_ckpt.keys()) 56 | print(avg_ckpt['val_result']) 57 | print(osp.join(ckpt_path, ckpt_dir)) 58 | save_checkpoint( 59 | { 60 | "state_dict": avg_ckpt['state_dict'], 61 | "epoch": avg_ckpt['epoch'], 62 | "val_result": avg_ckpt['val_result'], 63 | }, 64 | osp.join(ckpt_path, ckpt_dir), 65 | is_best=True, 66 | ) -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_fer-2013.tsv: -------------------------------------------------------------------------------- 1 | 
[{"classname": "angry", "gpt3": [" Feeling or showing anger.", " Displaying or feeling anger.", " Feeling or showing anger; wrathful; irate.", " Displaying or feeling anger.", " A feeling of displeasure and antagonism aroused by a wrong; wrath; ire; rage."]}, {"classname": "disgusted", "gpt3": [" Having a feeling of loathing or revulsion; repulsed.", " Feeling or displaying disgust.", " Affected with a feeling of loathing or repugnance; having a feeling of extreme repugnance or aversion.", " Filled with disgust.", " Feeling or showing a strong dislike of something."]}, {"classname": "fearful", "gpt3": [" Causing fear.", " Causing fear.", " Frightening.", " Frightening.", " Causing fear or dread; frightening."]}, {"classname": "happy", "gpt3": [" Emotionally positive.", " Feeling or showing pleasure and satisfaction.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous."]}, {"classname": "neutral", "gpt3": [" Not showing any emotion.", " Having no particular feeling or interest; not caring one way or the other; indifferent.", " Not taking sides in a conflict such as war; nonaligned.", " Neither positive nor negative.", " Neither good nor bad."]}, {"classname": "sad", "gpt3": [" Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; gloomy; dismal; dreary; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; melancholy; dismal; gloomy; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; d"]}, {"classname": "surprised", "gpt3": [" Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Astonished; astounded; amazed; dumbfounded; flabbergasted; flummoxed; flabbergasted; thunderstruck; dumbstruck; thunderstruck; dumbstruck; thunderstruck; dumbstruck;", " Surprised.", " Surprised."]}] -------------------------------------------------------------------------------- /datasets/caltech101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | IGNORED = ["BACKGROUND_Google", "Faces_easy"] 11 | NEW_CNAMES = { 12 | "airplanes": "airplane", 13 | "Faces": "face", 14 | "Leopards": "leopard", 15 | "Motorbikes": "motorbike", 16 | } 17 | 18 | 19 | @DATASET_REGISTRY.register() 20 | class Caltech101(DatasetBase): 21 | 22 | dataset_dir = "caltech-101" 23 | 24 | def __init__(self, cfg): 25 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 26 | self.dataset_dir = os.path.join(root, self.dataset_dir) 27 | 
self.image_dir = os.path.join(self.dataset_dir, "101_ObjectCategories") 28 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_Caltech101.json") 29 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 30 | mkdir_if_missing(self.split_fewshot_dir) 31 | 32 | if os.path.exists(self.split_path): 33 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 34 | else: 35 | train, val, test = DTD.read_and_split_data(self.image_dir, ignored=IGNORED, new_cnames=NEW_CNAMES) 36 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 37 | 38 | num_shots = cfg.DATASET.NUM_SHOTS 39 | if num_shots >= 1: 40 | seed = cfg.SEED 41 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 42 | 43 | if os.path.exists(preprocessed): 44 | print(f"Loading preprocessed few-shot data from {preprocessed}") 45 | with open(preprocessed, "rb") as file: 46 | data = pickle.load(file) 47 | train, val = data["train"], data["val"] 48 | else: 49 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 50 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 51 | data = {"train": train, "val": val} 52 | print(f"Saving preprocessed few-shot data to {preprocessed}") 53 | with open(preprocessed, "wb") as file: 54 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 55 | 56 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 57 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 58 | 59 | super().__init__(train_x=train, val=val, test=test) 60 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/commands/prepare_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | submit predictions to leaderboard service 3 | """ 4 | import argparse 5 | from collections import defaultdict 6 | import json 7 | import logging 8 | import pathlib 9 | import zipfile 10 | import itertools 11 | import numpy as np 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Submit predictions to leaderboard service.') 15 | parser.add_argument('--combine_path', required=True, help='Prediction json file path.', type=pathlib.Path) 16 | parser.add_argument('--combine_name', default='all_predictions', required=False, help='Output file name.', type=str) 17 | args = parser.parse_args() 18 | 19 | return args 20 | 21 | 22 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
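# For example, json_prec_dump({'score': 0.123456789}) returns '{"score": 0.123457}':
# the payload is serialized, re-parsed with floats rounded to `prec` digits, then serialized again.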
23 | def json_prec_dump(data, prec=6): 24 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 25 | 26 | 27 | def main(): 28 | logging.basicConfig(level=logging.INFO) 29 | args = parse_args() 30 | 31 | all_predictions = defaultdict(list) 32 | for prediction_file in args.combine_path.iterdir(): 33 | if prediction_file.suffix != '.json': 34 | print(f'Ignoring file {prediction_file.name} by suffix.') 35 | continue 36 | prediction_data = json.loads(prediction_file.read_text()) 37 | all_predictions[prediction_data['dataset_name']].append(prediction_data) 38 | 39 | all_combine_predictions = [] 40 | 41 | KNOWN_AVERAGE_KEYS = ['num_trainable_params'] 42 | KNOWN_MERGE_KEYS = ['rnd_seeds', 'predictions'] 43 | KNOWN_DIFF_KEYS = KNOWN_AVERAGE_KEYS + KNOWN_MERGE_KEYS 44 | 45 | for ds, prediction_data in all_predictions.items(): 46 | prediction_keys = list(prediction_data[0]) 47 | combined_dict = dict() 48 | for key in prediction_keys: 49 | values = [x[key] for x in prediction_data] 50 | if key not in KNOWN_DIFF_KEYS: 51 | assert all(x == values[0] for x in values) 52 | values = values[0] 53 | else: 54 | if key in KNOWN_MERGE_KEYS: 55 | values = list(itertools.chain.from_iterable(values)) 56 | elif key in KNOWN_AVERAGE_KEYS: 57 | values = np.asarray(values).mean() 58 | else: 59 | assert False 60 | combined_dict[key] = values 61 | all_combine_predictions.append(combined_dict) 62 | 63 | all_predictions = {"data": all_combine_predictions} 64 | all_predictions = json_prec_dump(all_predictions) 65 | save_path = args.combine_path / f'{args.combine_name}.zip' 66 | zf = zipfile.ZipFile(save_path, "w", zipfile.ZIP_DEFLATED) 67 | zf.writestr('all_predictions.json', all_predictions) 68 | zf.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/data_class_base.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import dataclasses 3 | 4 | 5 | class DataClassBase: 6 | def __post_init__(self): 7 | self.validate() 8 | 9 | @classmethod 10 | def from_dict(cls, data_content): 11 | c = {} 12 | for field in dataclasses.fields(cls): 13 | d_type = DataClassBase._get_dataclass_type(field.type) 14 | if field.name in data_content: 15 | c[field.name] = d_type.from_dict(data_content[field.name]) if d_type else data_content[field.name] 16 | 17 | assert len(data_content) == len(c), f"{data_content.keys()} vs {c.keys()}" 18 | return cls(**c) 19 | 20 | def to_dict(self, skip_default=True): 21 | result = {} 22 | for f in dataclasses.fields(self): 23 | value = getattr(self, f.name) 24 | if dataclasses.is_dataclass(value): 25 | value = value.to_dict() 26 | elif isinstance(value, (list, tuple)): 27 | value = type(value)(v.to_dict() if dataclasses.is_dataclass(v) else v for v in value) 28 | if not skip_default or value != f.default: 29 | result[f.name] = value 30 | return result 31 | 32 | def validate(self): 33 | # Check the field types. 
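        # e.g. a field annotated as Optional[int] passes with an int or None; any other
        # value type raises the TypeError below.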
34 | for field in dataclasses.fields(self): 35 | if hasattr(field.type, '__origin__') and field.type.__origin__ in (tuple, collections.abc.Sequence): 36 | expected_types = field.type.__origin__ 37 | elif hasattr(field.type, '__args__'): 38 | # Optional[].__args__ is (, NoneType) 39 | expected_types = field.type.__args__ 40 | else: 41 | expected_types = field.type 42 | 43 | if not isinstance(self.__dict__[field.name], expected_types): 44 | raise TypeError(f"Unexpected field type for {field.name}: Expected: {expected_types}. Actual: {type(self.__dict__[field.name])}") 45 | 46 | def _raise_value_error(self, config_name, msg=None): 47 | error_msg = f"Invalid {config_name}: {getattr(self, config_name)}." 48 | if msg: 49 | error_msg += ' ' + msg 50 | 51 | raise ValueError(error_msg) 52 | 53 | def _check_value(self, value_name, checker): 54 | value = getattr(self, value_name) 55 | if not checker(value): 56 | raise ValueError(f"Invalid {value_name}: {value}.") 57 | 58 | def _get_dataclass_type(field_type): 59 | """Returns dataclass type if the given type is dataclass or Optional[dataclass].""" 60 | if dataclasses.is_dataclass(field_type): 61 | return field_type 62 | if hasattr(field_type, '__args__'): 63 | args = field_type.__args__ 64 | if len(args) == 2 and type(None) in args: 65 | return next((t for t in args if dataclasses.is_dataclass(t)), None) 66 | return None 67 | -------------------------------------------------------------------------------- /trainers/imagenet_templates.py: -------------------------------------------------------------------------------- 1 | # source: https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb 2 | 3 | IMAGENET_TEMPLATES = [ 4 | "a bad photo of a {}.", 5 | "a photo of many {}.", 6 | "a sculpture of a {}.", 7 | "a photo of the hard to see {}.", 8 | "a low resolution photo of the {}.", 9 | "a rendering of a {}.", 10 | "graffiti of a {}.", 11 | "a bad photo of the {}.", 12 | "a cropped photo of the {}.", 13 | "a tattoo of a {}.", 14 | "the embroidered {}.", 15 | "a photo of a hard to see {}.", 16 | "a bright photo of a {}.", 17 | "a photo of a clean {}.", 18 | "a photo of a dirty {}.", 19 | "a dark photo of the {}.", 20 | "a drawing of a {}.", 21 | "a photo of my {}.", 22 | "the plastic {}.", 23 | "a photo of the cool {}.", 24 | "a close-up photo of a {}.", 25 | "a black and white photo of the {}.", 26 | "a painting of the {}.", 27 | "a painting of a {}.", 28 | "a pixelated photo of the {}.", 29 | "a sculpture of the {}.", 30 | "a bright photo of the {}.", 31 | "a cropped photo of a {}.", 32 | "a plastic {}.", 33 | "a photo of the dirty {}.", 34 | "a jpeg corrupted photo of a {}.", 35 | "a blurry photo of the {}.", 36 | "a photo of the {}.", 37 | "a good photo of the {}.", 38 | "a rendering of the {}.", 39 | "a {} in a video game.", 40 | "a photo of one {}.", 41 | "a doodle of a {}.", 42 | "a close-up photo of the {}.", 43 | "a photo of a {}.", 44 | "the origami {}.", 45 | "the {} in a video game.", 46 | "a sketch of a {}.", 47 | "a doodle of the {}.", 48 | "a origami {}.", 49 | "a low resolution photo of a {}.", 50 | "the toy {}.", 51 | "a rendition of the {}.", 52 | "a photo of the clean {}.", 53 | "a photo of a large {}.", 54 | "a rendition of a {}.", 55 | "a photo of a nice {}.", 56 | "a photo of a weird {}.", 57 | "a blurry photo of a {}.", 58 | "a cartoon {}.", 59 | "art of a {}.", 60 | "a sketch of the {}.", 61 | "a embroidered {}.", 62 | "a pixelated photo of a {}.", 63 | "itap of the {}.", 64 | "a jpeg corrupted photo of 
the {}.", 65 | "a good photo of a {}.", 66 | "a plushie {}.", 67 | "a photo of the nice {}.", 68 | "a photo of the small {}.", 69 | "a photo of the weird {}.", 70 | "the cartoon {}.", 71 | "art of the {}.", 72 | "a drawing of the {}.", 73 | "a photo of the large {}.", 74 | "a black and white photo of a {}.", 75 | "the plushie {}.", 76 | "a dark photo of a {}.", 77 | "itap of a {}.", 78 | "graffiti of the {}.", 79 | "a toy {}.", 80 | "itap of my {}.", 81 | "a photo of a cool {}.", 82 | "a photo of a small {}.", 83 | "a tattoo of the {}.", 84 | ] 85 | 86 | IMAGENET_TEMPLATES_SELECT = [ 87 | "itap of a {}.", 88 | "a bad photo of the {}.", 89 | "a origami {}.", 90 | "a photo of the large {}.", 91 | "a {} in a video game.", 92 | "art of the {}.", 93 | "a photo of the small {}.", 94 | ] 95 | -------------------------------------------------------------------------------- /scripts/data.sh: -------------------------------------------------------------------------------- 1 | DATA=/shared/sheng/coop_data/ 2 | mkdir -p $DATA 3 | # DATA=/work/tianjun/few-shot-learning/prompt-moe/CoOp/data/ 4 | cd $DATA 5 | 6 | # pip install gdown 7 | 8 | mkdir -p caltech-101 9 | cd caltech-101 10 | # wget http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz 11 | wget https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip 12 | unzip caltech-101.zip 13 | mv caltech-101/101_ObjectCategories.tar.gz . 14 | gdown 1hyarUivQE36mY6jSomru6Fjd-JzwcCzN 15 | tar -xvf 101_ObjectCategories.tar.gz 16 | cd $DATA 17 | 18 | mkdir -p oxford_pets 19 | cd oxford_pets 20 | wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz 21 | wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz 22 | gdown 1501r8Ber4nNKvmlFVQZ8SeUHTcdTTEqs 23 | tar -xvf images.tar.gz 24 | tar -xvf annotations.tar.gz 25 | cd $DATA 26 | 27 | mkdir -p stanford_cars 28 | cd stanford_cars 29 | wget http://ai.stanford.edu/~jkrause/car196/cars_train.tgz 30 | wget http://ai.stanford.edu/~jkrause/car196/cars_test.tgz 31 | wget https://ai.stanford.edu/~jkrause/cars/car_devkit.tgz 32 | wget http://ai.stanford.edu/~jkrause/car196/cars_test_annos_withlabels.mat 33 | gdown 1ObCFbaAgVu0I-k_Au-gIUcefirdAuizT 34 | tar -xvf cars_train.tgz 35 | tar -xvf cars_test.tgz 36 | tar -xvf car_devkit.tgz 37 | cd $DATA 38 | 39 | mkdir -p oxford_flowers 40 | cd oxford_flowers 41 | wget https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz 42 | wget https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat 43 | gdown 1AkcxCXeK_RCGCEC_GvmWxjcjaNhu-at0 44 | gdown 1Pp0sRXzZFZq15zVOzKjKBu4A9i01nozT 45 | tar -xvf 102flowers.tgz 46 | cd $DATA 47 | 48 | 49 | wget http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz 50 | tar -xvf food-101.tar.gz 51 | cd food-101 52 | gdown 1QK0tGi096I0Ba6kggatX1ee6dJFIcEJl 53 | cd $DATA 54 | 55 | wget https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz 56 | tar -xvf fgvc-aircraft-2013b.tar.gz 57 | mv fgvc-aircraft-2013b/data fgvc_aircraft 58 | cd $DATA 59 | 60 | mkdir -p sun397 61 | cd sun397 62 | wget http://vision.princeton.edu/projects/2010/SUN/SUN397.tar.gz 63 | wget https://vision.princeton.edu/projects/2010/SUN/download/Partitions.zip 64 | gdown 1y2RD81BYuiyvebdN-JymPfyWYcd8_MUq 65 | tar -xvf SUN397.tar.gz 66 | unzip Partitions.zip 67 | cd $DATA 68 | 69 | 70 | wget https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz 71 | tar -xvf dtd-r1.0.1.tar.gz 72 | cd dtd 73 | gdown 1u3_QfB467jqHgNXC00UIzbLZRQCg2S7x 74 | cd $DATA 75 | 76 | 
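# The remaining blocks follow the same recipe: fetch the official archive, then pull one more
# file via gdown (these IDs appear to be the pre-computed split/metadata files the dataset
# loaders expect, e.g. split_zhou_EuroSAT.json for EuroSAT).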
mkdir -p eurosat 77 | cd eurosat 78 | wget http://madm.dfki.de/files/sentinel/EuroSAT.zip 79 | unzip EuroSAT.zip 80 | gdown 1Ip7yaCWFi0eaOFUGga0lUdVi_DDQth1o 81 | cd $DATA 82 | 83 | mkdir -p ucf101 84 | cd ucf101 85 | gdown 10Jqome3vtUA2keJkNanAiFpgbyC9Hc2O 86 | unzip UCF-101-midframes.zip 87 | gdown 1I0S0q91hJfsV9Gf4xDIjgDq4AqBNJb1y 88 | cd $DATA 89 | 90 | mkdir -p imagenetv2 91 | cd imagenetv2 92 | wget https://s3-us-west-2.amazonaws.com/imagenetv2public/imagenetv2-matched-frequency.tar.gz 93 | tar -xvf imagenetv2-matched-frequency.tar.gz 94 | gdown 1-61f_ol79pViBFDG_IDlUQSwoLcn2XXF -------------------------------------------------------------------------------- /datasets/fgvc_aircraft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | 9 | 10 | @DATASET_REGISTRY.register() 11 | class FGVCAircraft(DatasetBase): 12 | 13 | dataset_dir = "fgvc_aircraft" 14 | 15 | def __init__(self, cfg): 16 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 17 | self.dataset_dir = os.path.join(root, self.dataset_dir) 18 | self.image_dir = os.path.join(self.dataset_dir, "images") 19 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 20 | mkdir_if_missing(self.split_fewshot_dir) 21 | 22 | classnames = [] 23 | with open(os.path.join(self.dataset_dir, "variants.txt"), "r") as f: 24 | lines = f.readlines() 25 | for line in lines: 26 | classnames.append(line.strip()) 27 | cname2lab = {c: i for i, c in enumerate(classnames)} 28 | 29 | train = self.read_data(cname2lab, "images_variant_train.txt") 30 | val = self.read_data(cname2lab, "images_variant_val.txt") 31 | test = self.read_data(cname2lab, "images_variant_test.txt") 32 | 33 | num_shots = cfg.DATASET.NUM_SHOTS 34 | if num_shots >= 1: 35 | seed = cfg.SEED 36 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 37 | 38 | if os.path.exists(preprocessed): 39 | print(f"Loading preprocessed few-shot data from {preprocessed}") 40 | with open(preprocessed, "rb") as file: 41 | data = pickle.load(file) 42 | train, val = data["train"], data["val"] 43 | else: 44 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 45 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 46 | data = {"train": train, "val": val} 47 | print(f"Saving preprocessed few-shot data to {preprocessed}") 48 | with open(preprocessed, "wb") as file: 49 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 50 | 51 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 52 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 53 | 54 | super().__init__(train_x=train, val=val, test=test) 55 | 56 | def read_data(self, cname2lab, split_file): 57 | filepath = os.path.join(self.dataset_dir, split_file) 58 | items = [] 59 | 60 | with open(filepath, "r") as f: 61 | lines = f.readlines() 62 | for line in lines: 63 | line = line.strip().split(" ") 64 | imname = line[0] + ".jpg" 65 | classname = " ".join(line[1:]) 66 | impath = os.path.join(self.image_dir, imname) 67 | label = cname2lab[classname] 68 | item = Datum(impath=impath, label=label, classname=classname) 69 | items.append(item) 70 | 71 | return items 72 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/dataset.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.utils.data 4 | from PIL import Image 5 | from torchvision import transforms 6 | 7 | 8 | class Voc2007Classification(torch.utils.data.Dataset): 9 | def __init__(self, data_root, image_set="train", transform=None): 10 | """ 11 | Pascal voc2007 training/validation data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 12 | test data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 13 | """ 14 | self.data_root = self._update_path(data_root, image_set) 15 | self.transform = transform 16 | self.labels = self._read_annotation(image_set) 17 | self.images = list(self.labels.keys()) 18 | 19 | @staticmethod 20 | def _update_path(data_root, image_set): 21 | if image_set == "train" or image_set == "val": 22 | data_root += "train/VOCdevkit/VOC2007" 23 | elif image_set == "test": 24 | data_root += "test/VOCdevkit 2/VOC2007" 25 | else: 26 | raise Exception("Incorrect image set!") 27 | return data_root 28 | 29 | def __getitem__(self, index): 30 | img_path = os.path.join(self.data_root, 'JPEGImages/' + self.images[index] + '.jpg') 31 | image = Image.open(img_path).convert("RGB") 32 | if self.transform is not None: 33 | image = self.transform(image) 34 | else: 35 | image = transforms.ToTensor()(image) 36 | label = self.labels[self.images[index]] 37 | label = torch.LongTensor(label) 38 | return image, label 39 | 40 | def __len__(self): 41 | return len(self.images) 42 | 43 | def _read_annotation(self, image_set="train"): 44 | """ 45 | Annotation interpolation, refer to: 46 | http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00093000000000000000 47 | """ 48 | object_categories = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 49 | 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 50 | 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 51 | annotation_folder = os.path.join(self.data_root, "ImageSets/Main/") 52 | files = [file_name for file_name in os.listdir(annotation_folder) if file_name.endswith("_" + image_set + ".txt")] 53 | labels_all = dict() 54 | for file_name in files: 55 | label_str = file_name.split("_")[0] 56 | label_int = object_categories.index(label_str) 57 | with open(annotation_folder + "/" + file_name, "r") as fread: 58 | for line in fread.readlines(): 59 | index = line[:6] 60 | if index not in labels_all.keys(): 61 | labels_all[index] = [0] * len(object_categories) 62 | flag = 1 63 | if line[7:9] and int(line[7:9]) != 1: 64 | flag = -1 65 | if flag == 1: 66 | labels_all[index][label_int] = 1 67 | return labels_all 68 | 69 | -------------------------------------------------------------------------------- /datasets/eurosat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | NEW_CNAMES = { 11 | "AnnualCrop": "Annual Crop Land", 12 | "Forest": "Forest", 13 | "HerbaceousVegetation": "Herbaceous Vegetation Land", 14 | "Highway": "Highway or Road", 15 | "Industrial": "Industrial Buildings", 16 | "Pasture": "Pasture Land", 17 | "PermanentCrop": "Permanent Crop Land", 18 | "Residential": "Residential Buildings", 19 | "River": "River", 20 | "SeaLake": "Sea or Lake", 21 | } 22 | 
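# NEW_CNAMES maps the raw EuroSAT folder names to human-readable class names, e.g. images under
# "AnnualCrop" are exposed to the text prompts as "Annual Crop Land". (Note that update_classname()
# below refers to NEW_CLASSNAMES, which is undefined; it should use this NEW_CNAMES dict.)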
23 | 24 | @DATASET_REGISTRY.register() 25 | class EuroSAT(DatasetBase): 26 | 27 | dataset_dir = "eurosat" 28 | 29 | def __init__(self, cfg): 30 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 31 | self.dataset_dir = os.path.join(root, self.dataset_dir) 32 | self.image_dir = os.path.join(self.dataset_dir, "2750") 33 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_EuroSAT.json") 34 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 35 | mkdir_if_missing(self.split_fewshot_dir) 36 | 37 | if os.path.exists(self.split_path): 38 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 39 | else: 40 | train, val, test = DTD.read_and_split_data(self.image_dir, new_cnames=NEW_CNAMES) 41 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 42 | 43 | num_shots = cfg.DATASET.NUM_SHOTS 44 | if num_shots >= 1: 45 | seed = cfg.SEED 46 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 47 | 48 | if os.path.exists(preprocessed): 49 | print(f"Loading preprocessed few-shot data from {preprocessed}") 50 | with open(preprocessed, "rb") as file: 51 | data = pickle.load(file) 52 | train, val = data["train"], data["val"] 53 | else: 54 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 55 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 56 | data = {"train": train, "val": val} 57 | print(f"Saving preprocessed few-shot data to {preprocessed}") 58 | with open(preprocessed, "wb") as file: 59 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 60 | 61 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 62 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 63 | 64 | super().__init__(train_x=train, val=val, test=test) 65 | 66 | def update_classname(self, dataset_old): 67 | dataset_new = [] 68 | for item_old in dataset_old: 69 | cname_old = item_old.classname 70 | cname_new = NEW_CLASSNAMES[cname_old] 71 | item_new = Datum(impath=item_old.impath, label=item_old.label, classname=cname_new) 72 | dataset_new.append(item_new) 73 | return dataset_new 74 | -------------------------------------------------------------------------------- /scripts/mvlpt/env_mvlpt.yml: -------------------------------------------------------------------------------- 1 | name: mvlpt 2 | channels: 3 | - defaults 4 | dependencies: 5 | - conda-forge/linux-64::_libgcc_mutex==0.1=main 6 | - defaults/linux-64::ca-certificates==2022.07.19=h06a4308_0 7 | - defaults/linux-64::intel-openmp==2021.2.0=h06a4308_610 8 | - defaults/linux-64::libstdcxx-ng==9.3.0=hd4cf53a_17 9 | - pytorch/noarch::pytorch-mutex==1.0=cuda 10 | - defaults/linux-64::libgomp==9.3.0=h5101ec6_17 11 | - defaults/linux-64::mkl==2021.2.0=h06a4308_296 12 | - defaults/linux-64::_openmp_mutex==4.5=1_gnu 13 | - conda-forge/linux-64::blas==1.0=mkl 14 | - defaults/linux-64::libgcc-ng==9.3.0=h5101ec6_17 15 | - conda-forge/linux-64::bzip2==1.0.8=h7f98852_4 16 | - defaults/linux-64::cudatoolkit==11.3.1=h2bc3f7f_2 17 | - conda-forge/linux-64::gmp==6.2.1=h58526e2_0 18 | - defaults/linux-64::jpeg==9b=h024ee3a_2 19 | - conda-forge/linux-64::lame==3.100=h7f98852_1001 20 | - defaults/linux-64::libffi==3.2.1=hf484d3e_1007 21 | - conda-forge/linux-64::libiconv==1.16=h516909a_0 22 | - conda-forge/linux-64::libuv==1.41.0=h7f98852_0 23 | - conda-forge/linux-64::libwebp-base==1.2.0=h7f98852_2 24 | - conda-forge/linux-64::lz4-c==1.9.3=h9c3ff4c_0 25 | - defaults/linux-64::ncurses==6.2=he6710b0_1 26 | - 
conda-forge/linux-64::nettle==3.6=he412f7d_0 27 | - defaults/linux-64::ninja-base==1.10.2=hd09550d_5 28 | - defaults/linux-64::openssl==1.1.1q=h7f8727e_0 29 | - defaults/linux-64::xz==5.2.5=h7b6447c_0 30 | - defaults/linux-64::zlib==1.2.11=h7b6447c_3 31 | - conda-forge/linux-64::gnutls==3.6.13=h85f3911_1 32 | - defaults/linux-64::libedit==3.1.20210216=h27cfd23_1 33 | - conda-forge/linux-64::libpng==1.6.37=h21135ba_2 34 | - conda-forge/linux-64::openh264==2.1.1=h780b84a_0 35 | - defaults/linux-64::readline==7.0=h7b6447c_5 36 | - defaults/linux-64::tk==8.6.10=hbc83047_0 37 | - conda-forge/linux-64::zstd==1.4.9=ha95c52a_0 38 | - conda-forge/linux-64::freetype==2.10.4=h0708190_1 39 | - defaults/linux-64::libtiff==4.2.0=h85742a9_0 40 | - defaults/linux-64::sqlite==3.33.0=h62c20be_0 41 | - pytorch/linux-64::ffmpeg==4.3=hf484d3e_0 42 | - defaults/linux-64::lcms2==2.12=h3be6417_0 43 | - defaults/linux-64::python==3.8.0=h0371630_2 44 | - defaults/linux-64::certifi==2022.6.15=py38h06a4308_0 45 | - defaults/linux-64::ninja==1.10.2=h06a4308_5 46 | - conda-forge/noarch::olefile==0.46=pyh9f0ad1d_1 47 | - conda-forge/linux-64::python_abi==3.8=1_cp38 48 | - conda-forge/noarch::six==1.16.0=pyh6c4a22f_0 49 | - conda-forge/noarch::typing_extensions==4.3.0=pyha770c72_0 50 | - defaults/noarch::wheel==0.36.2=pyhd3eb1b0_0 51 | - conda-forge/linux-64::mkl-service==2.4.0=py38h497a2fe_0 52 | - defaults/linux-64::pillow==8.2.0=py38he98fc37_0 53 | - pytorch/linux-64::pytorch==1.10.0=py3.8_cuda11.3_cudnn8.2.0_0 54 | - defaults/linux-64::setuptools==52.0.0=py38h06a4308_0 55 | - defaults/linux-64::numpy-base==1.20.2=py38hfae3a4d_0 56 | - defaults/linux-64::pip==21.1.2=py38h06a4308_0 57 | - conda-forge/linux-64::mkl_random==1.2.2=py38h1abd341_0 58 | - defaults/linux-64::mkl_fft==1.3.0=py38h42c9631_2 59 | - defaults/linux-64::numpy==1.20.2=py38h2d18471_0 60 | - pytorch/linux-64::torchaudio==0.10.0=py38_cu113 61 | - pytorch/linux-64::torchvision==0.11.0=py38_cu113 62 | prefix: /home/sheng/anaconda3/envs/mvlpt 63 | 64 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_mt_coopdata_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=~/opensource/ckpt/ 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | root=//tmp/coop_data 14 | 15 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 16 | CFG=$2 # config file 17 | NCTX=$3 # number of context tokens 18 | SHOTS=$4 # number of shots (5, 20, 50) 19 | 20 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 21 | DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | # for SEED in 1 2 3 23 | # for SEED in 1 24 | for SEED in $5 25 | do 26 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 27 | # if [ -d "$DIR" ]; then 28 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 29 | # else 30 | if [ $TRAINER = "UPT" ]; then 31 | python3 train.py \ 32 | --root $root \ 33 | --seed ${SEED} \ 34 | --trainer MVLPT \ 35 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 36 | --output-dir ${DIR} \ 37 | --dataset ${DATASET} \ 38 | --shots ${SHOTS} \ 39 | --dataset-coop \ 40 | --multi-task \ 41 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 42 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 43 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 44 | TRAINER.MVLPT.COOP.CSC False \ 45 | TEST.NO_TEST False \ 46 | TEST.FINAL_MODEL "best_val" \ 47 | TRAINER.CUT_CONTEXTLEN True 48 | elif [ $TRAINER = "VPT" ]; then 49 | python3 train.py \ 50 | --root $root \ 51 | --seed ${SEED} \ 52 | --trainer MVLPT \ 53 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 54 | --output-dir ${DIR} \ 55 | --dataset ${DATASET} \ 56 | --shots ${SHOTS} \ 57 | --dataset-coop \ 58 | --multi-task \ 59 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 60 | TRAINER.MVLPT.COOP.N_CTX 0 \ 61 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 62 | TRAINER.MVLPT.COOP.CSC False \ 63 | TEST.NO_TEST False \ 64 | TEST.FINAL_MODEL "best_val" 65 | else 66 | python3 train.py \ 67 | --root $root \ 68 | --seed ${SEED} \ 69 | --trainer MVLPT \ 70 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 71 | --output-dir ${DIR} \ 72 | --dataset ${DATASET} \ 73 | --shots ${SHOTS} \ 74 | --dataset-coop \ 75 | --multi-task \ 76 | TRAINER.MVLPT.VPT.N_CTX 0 \ 77 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 78 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 79 | TRAINER.MVLPT.COOP.CSC False \ 80 | TEST.NO_TEST False \ 81 | TEST.FINAL_MODEL "best_val" \ 82 | TRAINER.CUT_CONTEXTLEN True 83 | fi 84 | done 85 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_single_coopdata_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=~/opensource/ckpt/ 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | root=//tmp/coop_data 14 | 15 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 16 | CFG=$2 # config file 17 | NCTX=$3 # number of context tokens 18 | SHOTS=$4 # number of shots (5, 20, 50) 19 | 20 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 21 | # DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | DATASET=$6 23 | MODEL_DIR="--model-dir ${output_dir}/${PRETRAIN_DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp/" 24 | # MODEL_DIR="" 25 | # for SEED in 1 2 3 26 | # for SEED in 1 27 | for SEED in $5 28 | do 29 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 30 | # if [ -d "$DIR" ]; then 31 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 32 | # else 33 | if [ $TRAINER = "UPT" ]; then 34 | python3 train.py \ 35 | --root $root \ 36 | --seed ${SEED} \ 37 | --trainer MVLPT \ 38 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 39 | --output-dir ${DIR} \ 40 | --dataset ${DATASET} \ 41 | --shots ${SHOTS} \ 42 | --dataset-coop \ 43 | ${MODEL_DIR} \ 44 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 45 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 46 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 47 | TRAINER.MVLPT.COOP.CSC False \ 48 | TEST.NO_TEST False \ 49 | TEST.FINAL_MODEL "best_val" \ 50 | TRAINER.CUT_CONTEXTLEN True 51 | elif [ $TRAINER = "VPT" ]; then 52 | python3 train.py \ 53 | --root $root \ 54 | --seed ${SEED} \ 55 | --trainer MVLPT \ 56 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 57 | --output-dir ${DIR} \ 58 | --dataset ${DATASET} \ 59 | --shots ${SHOTS} \ 60 | --dataset-coop \ 61 | ${MODEL_DIR} \ 62 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 63 | TRAINER.MVLPT.COOP.N_CTX 0 \ 64 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 65 | TRAINER.MVLPT.COOP.CSC False \ 66 | TEST.NO_TEST False \ 67 | TEST.FINAL_MODEL "best_val" 68 | else 69 | python3 train.py \ 70 | --root $root \ 71 | --seed ${SEED} \ 72 | --trainer MVLPT \ 73 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 74 | --output-dir ${DIR} \ 75 | --dataset ${DATASET} \ 76 | --shots ${SHOTS} \ 77 | --dataset-coop \ 78 | ${MODEL_DIR} \ 79 | TRAINER.MVLPT.VPT.N_CTX 0 \ 80 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 81 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 82 | TRAINER.MVLPT.COOP.CSC False \ 83 | TEST.NO_TEST False \ 84 | TEST.FINAL_MODEL "best_val" \ 85 | TRAINER.CUT_CONTEXTLEN True 86 | fi 87 | done 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multitask Prompt Learning for Vision-Language Models 2 | 3 | This repo contains the codebase of a series of research projects focused on adapting vision-language models like [CLIP](https://arxiv.org/abs/2103.00020) to downstream datasets via *multitask prompt learning*: 4 | 5 | * [Multitask Vision-Language Prompt Tuning](https://arxiv.org/pdf/2211.11720.pdf) 6 | 7 |
                                                (a) CoOp                         (b) VPT                         (c) UPT
14 |
15 | 16 | ## How to Install 17 | This code is built on top of the toolbox [Dassl.pytorch](https://github.com/KaiyangZhou/Dassl.pytorch) and [CoOp](https://github.com/KaiyangZhou/CoOp) so you need to install the [`dassl`](https://github.com/KaiyangZhou/Dassl.pytorch#installation) and [PyTorch](https://pytorch.org/) environment first. After that, run `pip install -r requirements.txt` under `MVLPT/` to install a few more packages required by [CLIP](https://github.com/openai/CLIP) (this should be done when `dassl` is activated). Then, you are ready to go. 18 | 19 | Follow [DATASETS.md](DATASETS.md) to install the datasets from [CoOp](https://github.com/KaiyangZhou/CoOp/tree/main/datasets) for multitask source prompt initialization or run the following script after install `gdown`. 20 | ```bash 21 | bash scripts/data.sh 22 | ``` 23 | 24 | Note that the dataset for target [ELEVATER](https://arxiv.org/pdf/2204.08790.pdf) benchmark will be downloaded automatically in `MVLPT/trainers/vision_benchmark/`. 25 | ## How to Run 26 | 27 | Click a paper below to see the detailed instructions on how to run the code to reproduce the results. 28 | 29 | * [Multitask Vision-Language Prompt Tuning](MVLPT.md) 30 | 31 | ## Models and Results 32 | 33 | - The pre-trained weights of MVLPT (MCoOp, MVPT, MUPT) on 11 tasks based on ViT-B/16 and ViT-B/32 can be downloaded altogether via this [link](https://drive.google.com/file/d/1YWVLsVcsTEP_z3ehIDgGpFTNalTG_1IE/view?usp=sharing). The weights can be used to reproduce the results in Table 1 of MVLPT's paper (i.e., the results on ImageNet and its four variants with domain shift). To load the weights and run the evaluation code, you will need to specify `--model-dir` and `--load-epoch` (see this [script](https://github.com/sIncerass/MVLPT/blob/main/scripts/mvlpt/main_single_elevater_cut.sh) for example). 34 | 35 |


38 | 39 | 40 | ## Citation 41 | If you use this code in your research, please kindly cite the following papers 42 | 43 | ```bash 44 | @article{shen2022mvlpt, 45 | title={Multitask Vision-Language Prompt Tuning}, 46 | author = {Shen, Sheng and Yang, Shijia and Zhang, Tianjun and Zhai, Bohan and Gonzalez, Joseph E. and Keutzer, Kurt and Darrell, Trevor}, 47 | journal={arXiv preprint arXiv:2211.11720}, 48 | year={2022} 49 | } 50 | ``` 51 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name=None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqend = [] 64 | 65 | max_length = context_length 66 | 67 | if (self.added_cls_token > 0): 68 | seqstart = self.get_sot_token_list() 69 | max_length = max_length - 1 70 | 71 | if (self.added_sep_token > 0): 72 | seqend = self.get_eot_token_list() 73 | max_length = max_length - 1 74 | 75 | tokens = self.tokenizer( 76 | texts, padding=padding, 77 | truncation=True, 78 | max_length=max_length 79 | )['input_ids'] 80 | 81 | for i in range(len(tokens)): 82 | tokens[i] = seqstart + tokens[i] + seqend 83 | 84 | if (self.gpt_special_case): 85 | for i in range(len(tokens)): 86 | tokens[i][-1] = self.get_eot_token() 87 | 88 | result = torch.Tensor(tokens).type(torch.LongTensor) 89 | 90 | return result 91 | 92 | def get_vocab_size(self): 93 | return self.tokenizer.vocab_size 94 | 95 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 96 | return self.tokenize(texts, context_length) 97 | -------------------------------------------------------------------------------- /datasets/stanford_cars.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from scipy.io import loadmat 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class StanfordCars(DatasetBase): 13 | 14 | dataset_dir = "stanford_cars" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_StanfordCars.json") 20 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 21 | mkdir_if_missing(self.split_fewshot_dir) 22 | 23 | if os.path.exists(self.split_path): 24 | train, val, test = OxfordPets.read_split(self.split_path, self.dataset_dir) 25 | else: 26 | trainval_file = os.path.join(self.dataset_dir, "devkit", "cars_train_annos.mat") 27 | test_file = os.path.join(self.dataset_dir, "cars_test_annos_withlabels.mat") 28 | meta_file = os.path.join(self.dataset_dir, "devkit", "cars_meta.mat") 29 | trainval = self.read_data("cars_train", trainval_file, meta_file) 30 | test = self.read_data("cars_test", test_file, meta_file) 31 | train, val = OxfordPets.split_trainval(trainval) 32 | OxfordPets.save_split(train, val, test, self.split_path, self.dataset_dir) 33 | 34 | num_shots = cfg.DATASET.NUM_SHOTS 35 | if num_shots >= 1: 36 | seed = cfg.SEED 37 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 38 | 39 | if os.path.exists(preprocessed): 40 | print(f"Loading preprocessed few-shot data from {preprocessed}") 41 | with open(preprocessed, "rb") as file: 42 | data = pickle.load(file) 43 | train, val = data["train"], data["val"] 44 | else: 45 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 46 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 47 | data = {"train": train, "val": val} 48 | print(f"Saving preprocessed few-shot data to {preprocessed}") 49 | with open(preprocessed, "wb") as file: 50 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 
51 | 52 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 53 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 54 | 55 | super().__init__(train_x=train, val=val, test=test) 56 | 57 | def read_data(self, image_dir, anno_file, meta_file): 58 | anno_file = loadmat(anno_file)["annotations"][0] 59 | meta_file = loadmat(meta_file)["class_names"][0] 60 | items = [] 61 | 62 | for i in range(len(anno_file)): 63 | imname = anno_file[i]["fname"][0] 64 | impath = os.path.join(self.dataset_dir, image_dir, imname) 65 | label = anno_file[i]["class"][0, 0] 66 | label = int(label) - 1 # convert to 0-based index 67 | classname = meta_file[label][0] 68 | names = classname.split(" ") 69 | year = names.pop(-1) 70 | names.insert(0, year) 71 | classname = " ".join(names) 72 | item = Datum(impath=impath, label=label, classname=classname) 73 | items.append(item) 74 | 75 | return items 76 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/prompt_engineering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def get_prompt_templates(): 6 | prompt_templates = [ 7 | '{}.', 8 | 'a photo of a {}.', 9 | 'a bad photo of a {}.', 10 | 'a photo of many {}.', 11 | 'a sculpture of a {}.', 12 | 'a photo of the hard to see {}.', 13 | 'a low resolution photo of the {}.', 14 | 'a rendering of a {}.', 15 | 'graffiti of a {}.', 16 | 'a bad photo of the {}.', 17 | 'a cropped photo of the {}.', 18 | 'a tattoo of a {}.', 19 | 'the embroidered {}.', 20 | 'a photo of a hard to see {}.', 21 | 'a bright photo of a {}.', 22 | 'a photo of a clean {}.', 23 | 'a photo of a dirty {}.', 24 | 'a dark photo of the {}.', 25 | 'a drawing of a {}.', 26 | 'a photo of my {}.', 27 | 'the plastic {}.', 28 | 'a photo of the cool {}.', 29 | 'a close-up photo of a {}.', 30 | 'a black and white photo of the {}.', 31 | 'a painting of the {}.', 32 | 'a painting of a {}.', 33 | 'a pixelated photo of the {}.', 34 | 'a sculpture of the {}.', 35 | 'a bright photo of the {}.', 36 | 'a cropped photo of a {}.', 37 | 'a plastic {}.', 38 | 'a photo of the dirty {}.', 39 | 'a jpeg corrupted photo of a {}.', 40 | 'a blurry photo of the {}.', 41 | 'a photo of the {}.', 42 | 'a good photo of the {}.', 43 | 'a rendering of the {}.', 44 | 'a {} in a video game.', 45 | 'a photo of one {}.', 46 | 'a doodle of a {}.', 47 | 'a close-up photo of the {}.', 48 | 'the origami {}.', 49 | 'the {} in a video game.', 50 | 'a sketch of a {}.', 51 | 'a doodle of the {}.', 52 | 'a origami {}.', 53 | 'a low resolution photo of a {}.', 54 | 'the toy {}.', 55 | 'a rendition of the {}.', 56 | 'a photo of the clean {}.', 57 | 'a photo of a large {}.', 58 | 'a rendition of a {}.', 59 | 'a photo of a nice {}.', 60 | 'a photo of a weird {}.', 61 | 'a blurry photo of a {}.', 62 | 'a cartoon {}.', 63 | 'art of a {}.', 64 | 'a sketch of the {}.', 65 | 'a embroidered {}.', 66 | 'a pixelated photo of a {}.', 67 | 'itap of the {}.', 68 | 'a jpeg corrupted photo of the {}.', 69 | 'a good photo of a {}.', 70 | 'a plushie {}.', 71 | 'a photo of the nice {}.', 72 | 'a photo of the small {}.', 73 | 'a photo of the weird {}.', 74 | 'the cartoon {}.', 75 | 'art of the {}.', 76 | 'a drawing of the {}.', 77 | 'a photo of the large {}.', 78 | 'a black and white photo of a {}.', 79 | 'the plushie {}.', 80 | 'a dark photo of a {}.', 81 | 'itap of a {}.', 82 | 'graffiti of the {}.', 83 | 'a toy {}.', 84 | 'itap of my 
{}.', 85 | 'a photo of a cool {}.', 86 | 'a photo of a small {}.', 87 | 'a tattoo of the {}.', 88 | ] 89 | return prompt_templates 90 | 91 | 92 | def prompt_engineering(classnames): 93 | prompt_templates = get_prompt_templates() 94 | temp_idx = np.random.randint(len(prompt_templates)) 95 | 96 | if isinstance(classnames, list): 97 | classname = random.choice(classnames) 98 | else: 99 | classname = classnames 100 | 101 | return prompt_templates[temp_idx].replace('{}', classname.replace(',', '').replace('+', ' ')) 102 | -------------------------------------------------------------------------------- /datasets/sun397.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | 9 | 10 | @DATASET_REGISTRY.register() 11 | class SUN397(DatasetBase): 12 | 13 | dataset_dir = "sun397" 14 | 15 | def __init__(self, cfg): 16 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 17 | self.dataset_dir = os.path.join(root, self.dataset_dir) 18 | self.image_dir = os.path.join(self.dataset_dir, "SUN397") 19 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_SUN397.json") 20 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 21 | mkdir_if_missing(self.split_fewshot_dir) 22 | 23 | if os.path.exists(self.split_path): 24 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 25 | else: 26 | classnames = [] 27 | with open(os.path.join(self.dataset_dir, "ClassName.txt"), "r") as f: 28 | lines = f.readlines() 29 | for line in lines: 30 | line = line.strip()[1:] # remove / 31 | classnames.append(line) 32 | cname2lab = {c: i for i, c in enumerate(classnames)} 33 | trainval = self.read_data(cname2lab, "Training_01.txt") 34 | test = self.read_data(cname2lab, "Testing_01.txt") 35 | train, val = OxfordPets.split_trainval(trainval) 36 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 37 | 38 | num_shots = cfg.DATASET.NUM_SHOTS 39 | if num_shots >= 1: 40 | seed = cfg.SEED 41 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 42 | 43 | if os.path.exists(preprocessed): 44 | print(f"Loading preprocessed few-shot data from {preprocessed}") 45 | with open(preprocessed, "rb") as file: 46 | data = pickle.load(file) 47 | train, val = data["train"], data["val"] 48 | else: 49 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 50 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 51 | data = {"train": train, "val": val} 52 | print(f"Saving preprocessed few-shot data to {preprocessed}") 53 | with open(preprocessed, "wb") as file: 54 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 55 | 56 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 57 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 58 | 59 | super().__init__(train_x=train, val=val, test=test) 60 | 61 | def read_data(self, cname2lab, text_file): 62 | text_file = os.path.join(self.dataset_dir, text_file) 63 | items = [] 64 | 65 | with open(text_file, "r") as f: 66 | lines = f.readlines() 67 | for line in lines: 68 | imname = line.strip()[1:] # remove / 69 | classname = os.path.dirname(imname) 70 | label = cname2lab[classname] 71 | impath = os.path.join(self.image_dir, imname) 72 | 73 | names = classname.split("/")[1:] # remove 1st letter 74 | names 
= names[::-1] # put words like indoor/outdoor at first 75 | classname = " ".join(names) 76 | 77 | item = Datum(impath=impath, label=label, classname=classname) 78 | items.append(item) 79 | 80 | return items 81 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name = None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqtok = [] 64 | seqend = [] 65 | 66 | max_length = context_length 67 | 68 | if (self.added_cls_token > 0): 69 | seqstart = self.get_sot_token_list() 70 | max_length = max_length - 1 71 | 72 | if (self.added_sep_token > 0): 73 | seqend = self.get_eot_token_list() 74 | max_length = max_length - 1 75 | 76 | tokens = self.tokenizer( 77 | texts, padding=padding, 78 | truncation=True, 79 | max_length=max_length 80 | )['input_ids'] 81 | 82 | for i in range(len(tokens)): 83 | tokens[i] = seqstart + tokens[i] + seqend 84 | 85 | if (self.gpt_special_case): 86 | for i in range(len(tokens)): 87 | tokens[i][-1] = self.get_eot_token() 88 | 89 | #print(str(tokens)) 90 | 91 | result = torch.Tensor(tokens).type(torch.LongTensor) 92 | 93 | return result 94 | 95 | def get_vocab_size(self): 96 | return self.tokenizer.vocab_size 97 | 98 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 99 | return self.tokenize(texts, context_length) 100 | -------------------------------------------------------------------------------- /datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import re 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class UCF101(DatasetBase): 13 | 14 | dataset_dir = "ucf101" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "UCF-101-midframes") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_UCF101.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | cname2lab = {} 28 | filepath = os.path.join(self.dataset_dir, "ucfTrainTestlist/classInd.txt") 29 | with open(filepath, "r") as f: 30 | lines = f.readlines() 31 | for line in lines: 32 | label, classname = line.strip().split(" ") 33 | label = int(label) - 1 # conver to 0-based index 34 | cname2lab[classname] = label 35 | 36 | trainval = self.read_data(cname2lab, "ucfTrainTestlist/trainlist01.txt") 37 | test = self.read_data(cname2lab, "ucfTrainTestlist/testlist01.txt") 38 | train, val = OxfordPets.split_trainval(trainval) 39 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 40 | 41 | num_shots = cfg.DATASET.NUM_SHOTS 42 | if num_shots >= 1: 43 | seed = cfg.SEED 44 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 45 | 46 | if os.path.exists(preprocessed): 47 | print(f"Loading preprocessed few-shot data from {preprocessed}") 48 | with open(preprocessed, "rb") as file: 49 | data = pickle.load(file) 50 | train, val = data["train"], data["val"] 51 | else: 52 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 53 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 54 | data = {"train": train, "val": 
val} 55 | print(f"Saving preprocessed few-shot data to {preprocessed}") 56 | with open(preprocessed, "wb") as file: 57 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 58 | 59 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 60 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 61 | 62 | super().__init__(train_x=train, val=val, test=test) 63 | 64 | def read_data(self, cname2lab, text_file): 65 | text_file = os.path.join(self.dataset_dir, text_file) 66 | items = [] 67 | 68 | with open(text_file, "r") as f: 69 | lines = f.readlines() 70 | for line in lines: 71 | line = line.strip().split(" ")[0] # trainlist: filename, label 72 | action, filename = line.split("/") 73 | label = cname2lab[action] 74 | 75 | elements = re.findall("[A-Z][^A-Z]*", action) 76 | renamed_action = "_".join(elements) 77 | 78 | filename = filename.replace(".avi", ".jpg") 79 | impath = os.path.join(self.image_dir, renamed_action, filename) 80 | 81 | item = Datum(impath=impath, label=label, classname=renamed_action) 82 | items.append(item) 83 | 84 | return items 85 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_mt_elevater_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=./CoCoOp_mt_20 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | # root=/tmp//coop_data 14 | root=/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/datasets 15 | 16 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 17 | CFG=$2 # config file 18 | NCTX=$3 # number of context tokens 19 | SHOTS=$4 # number of shots (5, 20, 50) 20 | 21 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | # DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 23 | DATASET="hateful-memes,cifar-10,mnist,oxford-flower-102,oxford-iiit-pets,resisc45_clip,country211,food-101,stanford-cars,caltech-101,dtd,voc-2007-classification,cifar-100,fgvc-aircraft-2013b-variants102,patch-camelyon,rendered-sst2,gtsrb,eurosat_clip,fer-2013,kitti-distance" 24 | for SEED in 1 2 3 25 | # for SEED in 1 26 | # for SEED in $5 27 | do 28 | DIR=$output_dir/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 29 | # if [ -d "$DIR" ]; then 30 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 31 | # else 32 | if [ $TRAINER = "UPT" ]; then 33 | python3 train.py \ 34 | --root $root \ 35 | --seed ${SEED} \ 36 | --trainer MVLPT \ 37 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 38 | --output-dir ${DIR} \ 39 | --dataset ${DATASET} \ 40 | --shots ${SHOTS} \ 41 | --multi-task \ 42 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 43 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 44 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 45 | TRAINER.MVLPT.COOP.CSC False \ 46 | TEST.NO_TEST False \ 47 | TEST.FINAL_MODEL "best_val" \ 48 | TRAINER.CUT_CONTEXTLEN True 49 | elif [ $TRAINER = "VPT" ]; then 50 | python3 train.py \ 51 | --root $root \ 52 | --seed ${SEED} \ 53 | --trainer MVLPT \ 54 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 55 | --output-dir ${DIR} \ 56 | --dataset ${DATASET} \ 57 | --shots ${SHOTS} \ 58 | --multi-task \ 59 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 60 | TRAINER.MVLPT.COOP.N_CTX 0 \ 61 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 62 | TRAINER.MVLPT.COOP.CSC False \ 63 | TEST.NO_TEST False \ 64 | TEST.FINAL_MODEL "best_val" 65 | 66 | elif [ $TRAINER = "COCOOP" ]; then 67 | python3 train.py \ 68 | --root $root \ 69 | --seed ${SEED} \ 70 | --trainer MVLPT \ 71 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 72 | --output-dir ${DIR} \ 73 | --dataset ${DATASET} \ 74 | --shots ${SHOTS} \ 75 | --multi-task \ 76 | TRAINER.MVLPT.COCOOP.N_CTX ${NCTX} \ 77 | TEST.NO_TEST False \ 78 | TEST.FINAL_MODEL "best_val" 79 | else 80 | python3 train.py \ 81 | --root $root \ 82 | --seed ${SEED} \ 83 | --trainer MVLPT \ 84 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 85 | --output-dir ${DIR} \ 86 | --dataset ${DATASET} \ 87 | --shots ${SHOTS} \ 88 | --multi-task \ 89 | TRAINER.MVLPT.VPT.N_CTX 0 \ 90 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 91 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 92 | TRAINER.MVLPT.COOP.CSC False \ 93 | TEST.NO_TEST False \ 94 | TEST.FINAL_MODEL "best_val" \ 95 | TRAINER.CUT_CONTEXTLEN True 96 | fi 97 | done 98 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "automobile", "def_wiki": "A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. 
A car or motorcar.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "deer", "def_wiki": "A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", "path_wn": ["deer", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "distinguished from Bovidae by the male's having solid deciduous antlers"}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "frog", "def_wiki": "A small tailless amphibian of the order Anura that typically hops.", "path_wn": ["frog", "amphibian", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "any of various tailless stout-bodied amphibians with long hind limbs for leaping; semiaquatic and terrestrial species"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "ship", "def_wiki": "A water-borne vessel generally larger than a boat.", "path_wn": ["ship", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vessel that carries passengers or freight"}, {"classname": "truck", "def_wiki": "A small wheel or roller, specifically the wheel of a gun carriage.", "path_wn": ["truck", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an automotive vehicle suitable for hauling"}] 
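Despite the `.tsv` extension, the knowledge file above is stored as a single JSON list with fields such as `classname`, `def_wiki`, `path_wn`, and `def_wn`. Below is a minimal loading sketch under that assumption; the relative path simply points at the file shown here.

```python
# Hedged sketch: assumes each *_knowledge.tsv holds one JSON list, as shown above.
import json

path = "trainers/vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv"
with open(path, "r", encoding="utf-8") as f:
    entries = json.load(f)

# Map each class name to its WordNet definition for quick lookup.
name2def = {entry["classname"]: entry["def_wn"] for entry in entries}
print(name2def["airplane"])
```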
-------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "gpt3": [" arable land", " arable land", " land used for growing crops that are harvested once a year", " Land that is used to grow crops for one year.", " arable land"]}, {"classname": "forest", "gpt3": [" a large area of land covered with trees", " land covered with trees", " a tract of land covered with trees and underbrush", " A large area of land covered with trees.", " A tract of land covered with trees and undergrowth, larger than woods."]}, {"classname": "brushland or shrubland", "gpt3": [" A land area covered with low-growing woody plants, such as bushes, small trees, and shrubs.", " Land that is covered mostly with shrubs.", " land covered with bushes, shrubs, and small trees.", " land covered with low, scrubby vegetation", " land covered with low, scrubby vegetation, especially thorny bushes."]}, {"classname": "highway or road", "gpt3": [" A way or course for the passage of vehicles, persons, and merchandise, usually including paved or graveled surface, curbs, and usually sidewalks.", " a way (usually public) for the transportation of people or goods", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard", " A way or path for the travel of people or vehicles."]}, {"classname": "industrial buildings or commercial buildings", "gpt3": [" The act or process by which something is built; construction.", " A building or group of buildings where goods are produced, processed or manufactured, or where services are provided.", " A building or group of buildings where goods are manufactured, processed or repaired, or goods and services are sold.", " The act or process by which something is built; construction.", " Buildings used for manufacturing or storing and selling goods."]}, {"classname": "pasture land", "gpt3": [" grassland used for grazing livestock", " Land used for grazing.", " land used for grazing animals", " Land used for grazing.", " Land used for grazing."]}, {"classname": "permanent crop land", "gpt3": [" land used for growing crops that are not harvested annually", " arable land", " Land that is used for growing crops that are not intended for sale or for feeding livestock.", " Land used for growing crops that are not intended for harvest in the current year.", " land used for growing crops that are not harvested annually, such as trees, shrubs, and vines."]}, {"classname": "residential buildings or homes or apartments", "gpt3": [" A complete domicile occupying only part of a building, especially one for rent; a flat.", " A building or group of buildings containing a number of homes.", " The act or process by which something is built; construction.", " buildings where people live", " A complete domicile occupying only part of a building, especially one for rent; a flat."]}, {"classname": "river", "gpt3": [" A large natural stream of water flowing in a channel to the sea, a lake, or another river.", " a natural stream of water larger than a creek", " A natural stream of water of considerable volume flowing in a definite course from higher to lower ground.", " A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", " A large natural stream of water (larger than a creek) flowing in a channel on the surface of the earth."]}, {"classname": "lake or sea", "gpt3": [" A large body of salt water.", " a large body of water surrounded by land", " A large body of water surrounded by land.", " A large body of water surrounded by land.", " A large body of water surrounded by land."]}] -------------------------------------------------------------------------------- /datasets/oxford_flowers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | from scipy.io import loadmat 5 | from collections import defaultdict 6 | 7 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 8 | from dassl.utils import read_json, mkdir_if_missing 9 | 10 | from .oxford_pets import OxfordPets 11 | 12 | 13 | @DATASET_REGISTRY.register() 14 | class OxfordFlowers(DatasetBase): 15 | 16 | dataset_dir = "oxford_flowers" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | self.image_dir = os.path.join(self.dataset_dir, "jpg") 22 | self.label_file = os.path.join(self.dataset_dir, "imagelabels.mat") 23 | self.lab2cname_file = os.path.join(self.dataset_dir, "cat_to_name.json") 24 | self.split_path = os.path.join(self.dataset_dir, 
"split_zhou_OxfordFlowers.json") 25 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 26 | mkdir_if_missing(self.split_fewshot_dir) 27 | 28 | if os.path.exists(self.split_path): 29 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 30 | else: 31 | train, val, test = self.read_data() 32 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 33 | 34 | num_shots = cfg.DATASET.NUM_SHOTS 35 | if num_shots >= 1: 36 | seed = cfg.SEED 37 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 38 | 39 | if os.path.exists(preprocessed): 40 | print(f"Loading preprocessed few-shot data from {preprocessed}") 41 | with open(preprocessed, "rb") as file: 42 | data = pickle.load(file) 43 | train, val = data["train"], data["val"] 44 | else: 45 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 46 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 47 | data = {"train": train, "val": val} 48 | print(f"Saving preprocessed few-shot data to {preprocessed}") 49 | with open(preprocessed, "wb") as file: 50 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 51 | 52 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 53 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 54 | 55 | super().__init__(train_x=train, val=val, test=test) 56 | 57 | def read_data(self): 58 | tracker = defaultdict(list) 59 | label_file = loadmat(self.label_file)["labels"][0] 60 | for i, label in enumerate(label_file): 61 | imname = f"image_{str(i + 1).zfill(5)}.jpg" 62 | impath = os.path.join(self.image_dir, imname) 63 | label = int(label) 64 | tracker[label].append(impath) 65 | 66 | print("Splitting data into 50% train, 20% val, and 30% test") 67 | 68 | def _collate(ims, y, c): 69 | items = [] 70 | for im in ims: 71 | item = Datum(impath=im, label=y - 1, classname=c) # convert to 0-based label 72 | items.append(item) 73 | return items 74 | 75 | lab2cname = read_json(self.lab2cname_file) 76 | train, val, test = [], [], [] 77 | for label, impaths in tracker.items(): 78 | random.shuffle(impaths) 79 | n_total = len(impaths) 80 | n_train = round(n_total * 0.5) 81 | n_val = round(n_total * 0.2) 82 | n_test = n_total - n_train - n_val 83 | assert n_train > 0 and n_val > 0 and n_test > 0 84 | cname = lab2cname[str(label)] 85 | train.extend(_collate(impaths[:n_train], label, cname)) 86 | val.extend(_collate(impaths[n_train : n_train + n_val], label, cname)) 87 | test.extend(_collate(impaths[n_train + n_val :], label, cname)) 88 | 89 | return train, val, test 90 | -------------------------------------------------------------------------------- /trainers/zsclip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from dassl.engine import TRAINER_REGISTRY, TrainerX 5 | from dassl.optim import build_optimizer, build_lr_scheduler 6 | 7 | from clip import clip 8 | from clip.model import convert_weights 9 | 10 | from .coop import load_clip_to_cpu 11 | from .imagenet_templates import IMAGENET_TEMPLATES, IMAGENET_TEMPLATES_SELECT 12 | 13 | CUSTOM_TEMPLATES = { 14 | "OxfordPets": "a photo of a {}, a type of pet.", 15 | "OxfordFlowers": "a photo of a {}, a type of flower.", 16 | "FGVCAircraft": "a photo of a {}, a type of aircraft.", 17 | "DescribableTextures": "{} texture.", 18 | "EuroSAT": "a centered satellite photo of {}.", 19 | "StanfordCars": "a photo of a {}.", 20 | "Food101": "a photo of {}, a 
type of food.", 21 | "SUN397": "a photo of a {}.", 22 | "Caltech101": "a photo of a {}.", 23 | "UCF101": "a photo of a person doing {}.", 24 | "ImageNet": "a photo of a {}.", 25 | "ImageNetSketch": "a photo of a {}.", 26 | "ImageNetV2": "a photo of a {}.", 27 | "ImageNetA": "a photo of a {}.", 28 | "ImageNetR": "a photo of a {}.", 29 | } 30 | 31 | 32 | @TRAINER_REGISTRY.register() 33 | class ZeroshotCLIP(TrainerX): 34 | def build_model(self): 35 | cfg = self.cfg 36 | classnames = self.dm.dataset.classnames 37 | 38 | print(f"Loading CLIP (backbone: {cfg.MODEL.BACKBONE.NAME})") 39 | clip_model = load_clip_to_cpu(cfg) 40 | clip_model.to(self.device) 41 | 42 | temp = CUSTOM_TEMPLATES[cfg.DATASET.NAME] 43 | prompts = [temp.format(c.replace("_", " ")) for c in classnames] 44 | print(f"Prompts: {prompts}") 45 | prompts = torch.cat([clip.tokenize(p) for p in prompts]) 46 | prompts = prompts.to(self.device) 47 | 48 | with torch.no_grad(): 49 | text_features = clip_model.encode_text(prompts) 50 | text_features = text_features / text_features.norm(dim=-1, keepdim=True) 51 | 52 | self.text_features = text_features 53 | self.clip_model = clip_model 54 | 55 | def model_inference(self, image): 56 | image_features = self.clip_model.encode_image(image) 57 | image_features = image_features / image_features.norm(dim=-1, keepdim=True) 58 | logit_scale = self.clip_model.logit_scale.exp() 59 | logits = logit_scale * image_features @ self.text_features.t() 60 | return logits 61 | 62 | 63 | @TRAINER_REGISTRY.register() 64 | class ZeroshotCLIP2(ZeroshotCLIP): 65 | """Prompt ensembling.""" 66 | 67 | # templates = IMAGENET_TEMPLATES 68 | templates = IMAGENET_TEMPLATES_SELECT 69 | 70 | def build_model(self): 71 | cfg = self.cfg 72 | classnames = self.dm.dataset.classnames 73 | 74 | print(f"Loading CLIP (backbone: {cfg.MODEL.BACKBONE.NAME})") 75 | clip_model = load_clip_to_cpu(cfg) 76 | clip_model.to(self.device) 77 | 78 | for params in clip_model.parameters(): 79 | params.requires_grad_(False) 80 | 81 | # add custom-made prompt 82 | if cfg.DATASET.NAME != "ImageNet": 83 | self.templates += [CUSTOM_TEMPLATES[cfg.DATASET.NAME]] 84 | 85 | num_temp = len(self.templates) 86 | print(f"Prompt ensembling (n={num_temp})") 87 | 88 | mean_text_features = 0 89 | for i, temp in enumerate(self.templates): 90 | prompts = [temp.format(c.replace("_", " ")) for c in classnames] 91 | prompts = torch.cat([clip.tokenize(p) for p in prompts]).to(self.device) 92 | text_features = clip_model.encode_text(prompts) 93 | text_features = text_features / text_features.norm(dim=-1, keepdim=True) 94 | mean_text_features = mean_text_features + text_features 95 | mean_text_features = mean_text_features / num_temp 96 | mean_text_features = mean_text_features / mean_text_features.norm(dim=-1, keepdim=True) 97 | 98 | self.text_features = mean_text_features 99 | self.clip_model = clip_model 100 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_single_elevater_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | # output_dir=./CoCoOp_single_task_20 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | # root=//tmp/coop_data 14 | root=/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/datasets 15 | output_dir=./CoCoOp_single_task_20 16 | 17 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 
'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 18 | CFG=$2 # config file 19 | NCTX=$3 # number of context tokens 20 | SHOTS=$4 # number of shots (5, 20, 50) 21 | 22 | # PRETRAIN_DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 23 | PRETRAIN_DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 24 | # PRETRAIN_DATASET="hateful-memes,cifar-10,mnist,oxford-flower-102,oxford-iiit-pets,resisc45_clip,country211,food-101,stanford-cars,caltech-101,dtd,voc-2007-classification,cifar-100,patch-camelyon,rendered-sst2,gtsrb,eurosat_clip,fer-2013,kitti-distance" 25 | DATASET=$6 26 | MODEL_DIR="--model-dir ${output_dir}/${PRETRAIN_DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp/" 27 | # for SEED in 1 2 3 28 | # for SEED in 1 29 | for SEED in $5 30 | do 31 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 32 | # if [ -d "$DIR" ]; then 33 | # echo "Oops! The results exist at ${DIR} (so skip this job)" 34 | # else 35 | if [ $TRAINER = "UPT" ]; then 36 | python3 train.py \ 37 | --root $root \ 38 | --seed ${SEED} \ 39 | --trainer MVLPT \ 40 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 41 | --output-dir ${DIR} \ 42 | --dataset ${DATASET} \ 43 | --shots ${SHOTS} \ 44 | ${MODEL_DIR} \ 45 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 46 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 47 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 48 | TRAINER.MVLPT.COOP.CSC False \ 49 | TEST.NO_TEST False \ 50 | TEST.FINAL_MODEL "best_val" \ 51 | TRAINER.CUT_CONTEXTLEN True 52 | elif [ $TRAINER = "VPT" ]; then 53 | python3 train.py \ 54 | --root $root \ 55 | --seed ${SEED} \ 56 | --trainer MVLPT \ 57 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 58 | --output-dir ${DIR} \ 59 | --dataset ${DATASET} \ 60 | --shots ${SHOTS} \ 61 | ${MODEL_DIR} \ 62 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 63 | TRAINER.MVLPT.COOP.N_CTX 0 \ 64 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 65 | TRAINER.MVLPT.COOP.CSC False \ 66 | TEST.NO_TEST False \ 67 | TEST.FINAL_MODEL "best_val" 68 | 69 | elif [ $TRAINER = "COCOOP" ]; then 70 | python3 train.py \ 71 | --root $root \ 72 | --seed ${SEED} \ 73 | --trainer MVLPT \ 74 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 75 | --output-dir ${DIR} \ 76 | --dataset ${DATASET} \ 77 | --shots ${SHOTS} \ 78 | TRAINER.MVLPT.COCOOP.N_CTX ${NCTX} \ 79 | TEST.NO_TEST False \ 80 | TEST.FINAL_MODEL "best_val" 81 | else 82 | python3 train.py \ 83 | --root $root \ 84 | --seed ${SEED} \ 85 | --trainer MVLPT \ 86 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 87 | --output-dir ${DIR} \ 88 | --dataset ${DATASET} \ 89 | --shots ${SHOTS} \ 90 | TRAINER.MVLPT.VPT.N_CTX 0 \ 91 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 92 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 93 | TRAINER.MVLPT.COOP.CSC False \ 94 | TEST.NO_TEST False \ 95 | TEST.FINAL_MODEL "best_val" \ 96 | TRAINER.CUT_CONTEXTLEN True 97 | fi 98 | done 99 | -------------------------------------------------------------------------------- /datasets/dtd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | 5 | from 
dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import listdir_nohidden, mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class DescribableTextures(DatasetBase): 13 | 14 | dataset_dir = "dtd" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "images") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_DescribableTextures.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | train, val, test = self.read_and_split_data(self.image_dir) 28 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 29 | 30 | num_shots = cfg.DATASET.NUM_SHOTS 31 | if num_shots >= 1: 32 | seed = cfg.SEED 33 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 34 | 35 | if os.path.exists(preprocessed): 36 | print(f"Loading preprocessed few-shot data from {preprocessed}") 37 | with open(preprocessed, "rb") as file: 38 | data = pickle.load(file) 39 | train, val = data["train"], data["val"] 40 | else: 41 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 42 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 43 | data = {"train": train, "val": val} 44 | print(f"Saving preprocessed few-shot data to {preprocessed}") 45 | with open(preprocessed, "wb") as file: 46 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 47 | 48 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 49 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 50 | 51 | super().__init__(train_x=train, val=val, test=test) 52 | 53 | @staticmethod 54 | def read_and_split_data(image_dir, p_trn=0.5, p_val=0.2, ignored=[], new_cnames=None): 55 | # The data are supposed to be organized into the following structure 56 | # ============= 57 | # images/ 58 | # dog/ 59 | # cat/ 60 | # horse/ 61 | # ============= 62 | categories = listdir_nohidden(image_dir) 63 | categories = [c for c in categories if c not in ignored] 64 | categories.sort() 65 | 66 | p_tst = 1 - p_trn - p_val 67 | print(f"Splitting into {p_trn:.0%} train, {p_val:.0%} val, and {p_tst:.0%} test") 68 | 69 | def _collate(ims, y, c): 70 | items = [] 71 | for im in ims: 72 | item = Datum(impath=im, label=y, classname=c) # is already 0-based 73 | items.append(item) 74 | return items 75 | 76 | train, val, test = [], [], [] 77 | for label, category in enumerate(categories): 78 | category_dir = os.path.join(image_dir, category) 79 | images = listdir_nohidden(category_dir) 80 | images = [os.path.join(category_dir, im) for im in images] 81 | random.shuffle(images) 82 | n_total = len(images) 83 | n_train = round(n_total * p_trn) 84 | n_val = round(n_total * p_val) 85 | n_test = n_total - n_train - n_val 86 | assert n_train > 0 and n_val > 0 and n_test > 0 87 | 88 | if new_cnames is not None and category in new_cnames: 89 | category = new_cnames[category] 90 | 91 | train.extend(_collate(images[:n_train], label, category)) 92 | val.extend(_collate(images[n_train : n_train + n_val], label, category)) 93 | test.extend(_collate(images[n_train + n_val :], label, category)) 94 | 95 | return train, val, test 
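The dataset wrappers in this repo, including `DescribableTextures` above, read only a few config keys (`DATASET.ROOT`, `DATASET.NUM_SHOTS`, `SEED`, `DATASET.SUBSAMPLE_CLASSES`). The sketch below instantiates one outside the full trainer, assuming a dassl-style yacs config; the root path is a placeholder and the `"all"` subsample value is an assumption to verify against the dassl/CoOp defaults.

```python
# Hedged sketch; a real run builds this config through dassl's train.py instead.
from yacs.config import CfgNode as CN

from datasets.dtd import DescribableTextures

cfg = CN()
cfg.SEED = 1
cfg.DATASET = CN()
cfg.DATASET.ROOT = "/path/to/datasets"   # parent folder of "dtd", "sun397", ...
cfg.DATASET.NUM_SHOTS = 16               # >= 1 triggers the few-shot cache seen above
cfg.DATASET.SUBSAMPLE_CLASSES = "all"    # assumed value; "base"/"new" subsample the classes

dataset = DescribableTextures(cfg)       # builds train/val/test lists of Datum objects
print(len(dataset.train_x), len(dataset.val), len(dataset.test))
```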
96 | -------------------------------------------------------------------------------- /scripts/read_record.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import glob 4 | out_name = 'coop_eval_baseline' 5 | # ckpt_folder='/tmp/select_5/CoOp/' 6 | # TRAINER="UPT" 7 | TRAINER="CoOp" 8 | # TRAINER="VPT" 9 | 10 | 11 | # SHOTS=20 12 | SHOTS=5 13 | # SHOTS=1 14 | CONFIG="vit_b16" 15 | NCTX=4 if TRAINER=="UPT" else 16 16 | # NCTX=4 17 | # eval_cat="IN1K_ADAPT" 18 | # eval_cat="COOP_ADAPT" 19 | # eval_cat="COOP_ADAPT_SEED" 20 | eval_cat="IN1KCOOP_ADAPT_A100" 21 | # eval_cat="IN1KCOOP_ADAPT_A100_SEED" 22 | # eval_cat="IN1KCOOP_ADAPT_ZEROSHOT" 23 | # eval_cat="IN1KCOOP_ADAPT_ZEROSHOT_SEED" 24 | 25 | # eval_cat="IN1K_ADAPT_ZERO_SHOT" 26 | # eval_cat="CLIP_ZEROSHOT" 27 | # eval_cat="EVAL_BEST" 28 | # eval_cat="COOP_ADAPT_ZEROSHOT" 29 | # eval_cat="COOP_ADAPT_ZEROSHOT_SEED" 30 | # eval_cat="COOP_ADAPT_A100" 31 | # eval_cat="COOP_ADAPT_A100_SEED" 32 | 33 | ckpt_folder=f'/tmp/outputs/COOP_ELEVATER/{TRAINER}/{eval_cat}/' 34 | ckpt_setting=f'/{CONFIG}_{SHOTS}shots/nctx{NCTX}_csc_ctp/' 35 | 36 | print(f'{ckpt_folder}/cifar-10/{ckpt_setting}') 37 | seeds = ["1", "2", "3"] 38 | # seeds = ["0"] 39 | if "ZERO" in eval_cat: 40 | accuracy_index = -1 41 | else: 42 | accuracy_index = -2 43 | # accuracy_index = -1 44 | # seeds = ["0"] 45 | # out_name = 'vpt_eval' 46 | # ckpt_folder='/tmp/select_5/UPT/' 47 | # ckpt_setting='vit_b16_20shots/nctx16_csc_ctp' 48 | COOP_ELEVATER_DATASET = ['hateful-memes', 'cifar-10', 'mnist', 'resisc45_clip', 'country211', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'fer-2013', 'kitti-distance'] 49 | 50 | def main(): 51 | # dataset = ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 52 | with open(f'./scripts/{out_name}.csv', 'w', encoding='UTF8') as f: 53 | writer = csv.writer(f) 54 | # dataset = ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 55 | dataset = COOP_ELEVATER_DATASET 56 | writer.writerow([" "]+dataset) 57 | missed = 0 58 | for seed in seeds: 59 | temp_row = [] 60 | temp_row.append(f"seed {seed}") 61 | 62 | for data1 in dataset: 63 | # temp_row.append(data1+" seed"+seed) 64 | 65 | # for data2 in dataset: 66 | #for seed in ["1", "2", "3"]: 67 | # with open("/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/evaluation/"+data1+"_"+data2+"/CoOp/vit_b16_20shots/nctx16_cscFalse_ctpmiddle/seed"+seed+"/log.txt") as open_file: 68 | missed_ = True 69 | log_files = glob.glob(f"{ckpt_folder}/{data1}/{ckpt_setting}/seed{seed}/log.txt*") 70 | 71 | for log_file in log_files: 72 | with open(log_file) as open_file: 73 | # with open(f"{ckpt_folder}/{data1}/{ckpt_setting}/seed{seed}/log.txt") as open_file: 74 | lines = open_file.readlines() 75 | # assert "results" in lines[accuracy_index] 76 | number = re.findall('([+-]?[0-9]*\.[0-9]*)', lines[accuracy_index]) 77 | # print(number, lines[-1]) 78 | if "results" in lines[accuracy_index] 
and "test" in lines[accuracy_index-2]: 79 | try: 80 | temp_row.append(float(number[0])) 81 | missed_ = False 82 | break 83 | except Exception as e: 84 | # temp_row.append(" ") 85 | continue 86 | if missed_: 87 | temp_row.append(" ") 88 | missed += 1 89 | print("missed", data1, "seed", seed) 90 | # break 91 | writer.writerow(temp_row) 92 | print(f"okay we missed {missed} entries") 93 | 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /datasets/imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from collections import OrderedDict 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import listdir_nohidden, mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNet(DatasetBase): 13 | 14 | dataset_dir = "imagenet" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = self.dataset_dir 20 | # self.image_dir = os.path.join(self.dataset_dir, "images") 21 | 22 | # self.preprocessed = os.path.join(self.dataset_dir, "preprocessed.pkl") 23 | # self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 24 | 25 | self.preprocessed = os.path.join(self.dataset_dir.replace('group', 'sheng'), "preprocessed.pkl") 26 | self.split_fewshot_dir = os.path.join(self.dataset_dir.replace('group', 'sheng'), "split_fewshot") 27 | mkdir_if_missing(self.split_fewshot_dir) 28 | 29 | if os.path.exists(self.preprocessed): 30 | with open(self.preprocessed, "rb") as f: 31 | preprocessed = pickle.load(f) 32 | train = preprocessed["train"] 33 | test = preprocessed["test"] 34 | else: 35 | # text_file = os.path.join(self.dataset_dir, "classnames.txt") 36 | 37 | # HACK: hack for trevor's group machine dir's 38 | text_file = "./scripts/classnames.txt" 39 | classnames = self.read_classnames(text_file) 40 | train = self.read_data(classnames, "train") 41 | # Follow standard practice to perform evaluation on the val set 42 | # Also used as the val set (so evaluate the last-step model) 43 | test = self.read_data(classnames, "val") 44 | 45 | preprocessed = {"train": train, "test": test} 46 | with open(self.preprocessed, "wb") as f: 47 | pickle.dump(preprocessed, f, protocol=pickle.HIGHEST_PROTOCOL) 48 | 49 | num_shots = cfg.DATASET.NUM_SHOTS 50 | if num_shots >= 1: 51 | seed = cfg.SEED 52 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 53 | 54 | if os.path.exists(preprocessed): 55 | print(f"Loading preprocessed few-shot data from {preprocessed}") 56 | with open(preprocessed, "rb") as file: 57 | data = pickle.load(file) 58 | train = data["train"] 59 | else: 60 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 61 | data = {"train": train} 62 | print(f"Saving preprocessed few-shot data to {preprocessed}") 63 | with open(preprocessed, "wb") as file: 64 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 65 | 66 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 67 | train, test = OxfordPets.subsample_classes(train, test, subsample=subsample) 68 | 69 | super().__init__(train_x=train, val=test, test=test) 70 | 71 | @staticmethod 72 | def read_classnames(text_file): 73 | """Return a dictionary containing 74 | key-value pairs of : . 
75 | """ 76 | classnames = OrderedDict() 77 | with open(text_file, "r") as f: 78 | lines = f.readlines() 79 | for line in lines: 80 | line = line.strip().split(" ") 81 | folder = line[0] 82 | classname = " ".join(line[1:]) 83 | classnames[folder] = classname 84 | return classnames 85 | 86 | def read_data(self, classnames, split_dir): 87 | split_dir = os.path.join(self.image_dir, split_dir) 88 | folders = sorted(f.name for f in os.scandir(split_dir) if f.is_dir()) 89 | items = [] 90 | 91 | for label, folder in enumerate(folders): 92 | imnames = listdir_nohidden(os.path.join(split_dir, folder)) 93 | classname = classnames[folder] 94 | for imname in imnames: 95 | impath = os.path.join(split_dir, folder, imname) 96 | item = Datum(impath=impath, label=label, classname=classname) 97 | items.append(item) 98 | 99 | return items 100 | --------------------------------------------------------------------------------
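Returning to the `HFPTTokenizer` defined in `trainers/vision_benchmark/datasets/hfpt_tokenizer.py` above, here is a minimal usage sketch; the `gpt2` model name is only an example and requires the corresponding Hugging Face tokenizer files to be available locally or downloadable.

```python
# Hedged sketch of calling the HFPTTokenizer shown earlier in this dump.
from trainers.vision_benchmark.datasets.hfpt_tokenizer import HFPTTokenizer

tokenizer = HFPTTokenizer(pt_name="gpt2")  # 'gpt' in the name triggers the EOS-padding path
tokens = tokenizer(["a photo of a dog.", "a photo of a cat."], context_length=77)
print(tokens.shape)                        # expected: torch.Size([2, 77])
```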