├── datasets ├── __init__.py ├── __pycache__ │ ├── dtd.cpython-38.pyc │ ├── bamboo.cpython-38.pyc │ ├── eurosat.cpython-38.pyc │ ├── food101.cpython-38.pyc │ ├── sun397.cpython-38.pyc │ ├── ucf101.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ ├── caltech101.cpython-38.pyc │ ├── imagenet.cpython-38.pyc │ ├── imagenet_a.cpython-38.pyc │ ├── imagenet_r.cpython-38.pyc │ ├── imagenetv2.cpython-38.pyc │ ├── imagenet_21k.cpython-38.pyc │ ├── oxford_pets.cpython-38.pyc │ ├── fgvc_aircraft.cpython-38.pyc │ ├── imagenet_sketch.cpython-38.pyc │ ├── oxford_flowers.cpython-38.pyc │ └── stanford_cars.cpython-38.pyc ├── imagenet_sketch.py ├── imagenetv2.py ├── imagenet_r.py ├── imagenet_a.py ├── food101.py ├── caltech101.py ├── fgvc_aircraft.py ├── eurosat.py ├── stanford_cars.py ├── sun397.py ├── ucf101.py ├── oxford_flowers.py ├── dtd.py └── imagenet.py ├── trainers ├── __init__.py ├── vision_benchmark │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ └── prepare_submit.py │ ├── common │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ └── constants.cpython-38.pyc │ │ ├── constants.py │ │ ├── utils.py │ │ └── data_class_base.py │ ├── optim │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── utils.py │ ├── __pycache__ │ │ └── __init__.cpython-38.pyc │ ├── datasets │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── metrics.cpython-38.pyc │ │ │ ├── prompts.cpython-38.pyc │ │ │ ├── hfpt_tokenizer.cpython-38.pyc │ │ │ └── simple_tokenizer.cpython-38.pyc │ │ ├── languages │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── prompt_engineering.py │ │ │ └── hfpt_tokenizer.py │ │ ├── __init__.py │ │ └── hfpt_tokenizer.py │ ├── evaluation │ │ ├── __pycache__ │ │ │ ├── feature.cpython-38.pyc │ │ │ └── __init__.cpython-38.pyc │ │ ├── __init__.py │ │ └── dataset.py │ ├── resources │ │ ├── datasets │ │ │ ├── gtsrb.yaml │ │ │ ├── hateful-memes.yaml │ │ │ ├── mnist.yaml │ │ │ ├── dtd.yaml │ │ │ ├── cifar10.yaml │ │ │ ├── cifar100.yaml │ │ │ ├── country211.yaml │ │ │ ├── eurosat-clip.yaml │ │ │ ├── fer2013.yaml │ │ │ ├── resisc45-clip.yaml │ │ │ ├── rendered-sst2.yaml │ │ │ ├── caltech101.yaml │ │ │ ├── flower102.yaml │ │ │ ├── oxford-iiit-pets.yaml │ │ │ ├── patchcamelyon.yaml │ │ │ ├── stanfordcar.yaml │ │ │ ├── voc2007classification.yaml │ │ │ ├── kitti-distance.yaml │ │ │ ├── food101.yaml │ │ │ └── fgvc-aircraft-2013b.yaml │ │ ├── model │ │ │ ├── example.yaml │ │ │ ├── clip_example.yaml │ │ │ ├── deit_base_patch16_224.yaml │ │ │ ├── vit_base_patch16_224.yaml │ │ │ ├── vit_base_patch32_224.yaml │ │ │ ├── mae_vitb16.yaml │ │ │ ├── mocov3_vitb16.yaml │ │ │ ├── vitb16_CLIP.yaml │ │ │ ├── vitb32_CLIP.yaml │ │ │ ├── vitb32_SLIP.yaml │ │ │ ├── vitb32_DeCLIP.yaml │ │ │ ├── vitb32_DeCLIP_YFCC15M.yaml │ │ │ ├── vitb32_FILIP.yaml │ │ │ └── clip_swin_tiny.yaml │ │ └── knowledge │ │ │ ├── gpt3 │ │ │ ├── GPT3_rendered-sst2.tsv │ │ │ ├── GPT3_patch-camelyon.tsv │ │ │ ├── GPT3_hateful-memes.tsv │ │ │ ├── GPT3_mnist.tsv │ │ │ ├── GPT3_kitti-distance.tsv │ │ │ ├── GPT3_fer-2013.tsv │ │ │ └── GPT3_eurosat_clip.tsv │ │ │ └── external │ │ │ ├── rendered-sst2_knowledge.tsv │ │ │ ├── patch-camelyon_knowledge.tsv │ │ │ ├── kitti-distance_knowledge.tsv │ │ │ ├── hateful-memes_knowledge.tsv │ │ │ ├── fer-2013_knowledge.tsv │ │ │ ├── eurosat_clip_knowledge.tsv │ │ │ ├── mnist_knowledge.tsv │ │ │ └── cifar-10_knowledge.tsv │ └── config │ │ ├── __init__.py │ │ └── models.py ├── __pycache__ │ ├── 
coop.cpython-38.pyc │ ├── mvlpt.cpython-38.pyc │ ├── cocoop.cpython-38.pyc │ ├── zsclip.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ └── imagenet_templates.cpython-38.pyc ├── imagenet_templates.py └── zsclip.py ├── clip ├── __init__.py ├── bpe_simple_vocab_16e6.txt.gz └── __pycache__ │ ├── clip.cpython-38.pyc │ ├── model.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ └── simple_tokenizer.cpython-38.pyc ├── configs ├── datasets │ ├── sun397.yaml │ ├── ucf101.yaml │ ├── eurosat.yaml │ ├── food101.yaml │ ├── imagenet.yaml │ ├── oxford_pets.yaml │ ├── caltech101.yaml │ ├── dtd.yaml │ ├── imagenet_a.yaml │ ├── imagenet_r.yaml │ ├── imagenetv2.yaml │ ├── oxford_flowers.yaml │ ├── fgvc_aircraft.yaml │ ├── stanford_cars.yaml │ └── imagenet_sketch.yaml └── trainers │ ├── CoOp │ ├── rn50_val.yaml │ ├── rn50.yaml │ ├── rn101.yaml │ ├── rn101_ep50.yaml │ ├── rn50_ep50.yaml │ ├── vit_b16_ep50.yaml │ ├── vit_b32.yaml │ ├── vit_b32_ep50.yaml │ ├── vit_l14.yaml │ ├── rn50_ep100.yaml │ ├── vit_b16_ep100.yaml │ ├── vit_l14_336.yaml │ ├── rn50_ctxv1.yaml │ ├── rn50_ep50_ctxv1.yaml │ ├── vit_b16_ctxv1.yaml │ ├── vit_b16_ep50_ctxv1.yaml │ ├── vit_b16_ep100_ctxv1.yaml │ └── vit_b16.yaml │ ├── MVLPT │ ├── rn50_val.yaml │ ├── rn50.yaml │ ├── rn101.yaml │ ├── rn50_ep50.yaml │ ├── rn101_ep50.yaml │ ├── vit_b32.yaml │ ├── vit_l14.yaml │ ├── rn50_ep100.yaml │ ├── vit_b16_ep100.yaml │ ├── vit_b16_ep50.yaml │ ├── vit_b32_ep50.yaml │ ├── vit_l14_336.yaml │ ├── rn50_ctxv1.yaml │ ├── rn50_ep50_ctxv1.yaml │ ├── vit_b16_ctxv1.yaml │ ├── vit_b16_ep50_ctxv1.yaml │ ├── vit_b16_ep100_ctxv1.yaml │ └── vit_b16.yaml │ └── CoCoOp │ ├── vit_b16_c4_ep10_batch1.yaml │ ├── vit_b16_c8_ep10_batch1.yaml │ ├── vit_b16_c16_ep10_batch1.yaml │ └── vit_b16_c4_ep10_batch1_ctxv1.yaml ├── figs ├── fig2-cropped-1.png ├── ablate_flops_all-1.png └── MVLPT_figures-cropped-1.png ├── scripts ├── cocoop │ ├── README.md │ ├── xd_train.sh │ ├── xd_test.sh │ ├── base2new_train.sh │ └── base2new_test.sh ├── coop │ ├── README.md │ ├── zeroshot.sh │ ├── eval.sh │ └── main.sh ├── mvlpt │ ├── zeroshot.sh │ ├── env_mvlpt.yml │ ├── main_mt_coopdata_cut.sh │ ├── main_single_coopdata_cut.sh │ ├── main_mt_elevater_cut.sh │ └── main_single_elevater_cut.sh ├── interpret_prompt.py ├── avg_ckpt.py ├── data.sh └── read_record.py ├── lpclip ├── linear_probe.sh ├── README.md └── feat_extractor.sh ├── requirements.txt ├── LICENSE └── README.md /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /configs/datasets/sun397.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "SUN397" 3 | -------------------------------------------------------------------------------- /configs/datasets/ucf101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "UCF101" 3 | -------------------------------------------------------------------------------- /configs/datasets/eurosat.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "EuroSAT" 3 | -------------------------------------------------------------------------------- /configs/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "Food101" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNet" 3 | -------------------------------------------------------------------------------- /configs/datasets/oxford_pets.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "OxfordPets" -------------------------------------------------------------------------------- /configs/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "Caltech101" 3 | -------------------------------------------------------------------------------- /configs/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "DescribableTextures" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetA" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetR" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetV2" 3 | -------------------------------------------------------------------------------- /configs/datasets/oxford_flowers.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "OxfordFlowers" -------------------------------------------------------------------------------- /configs/datasets/fgvc_aircraft.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "FGVCAircraft" 3 | -------------------------------------------------------------------------------- /configs/datasets/stanford_cars.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "StanfordCars" 3 | -------------------------------------------------------------------------------- /configs/datasets/imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | NAME: "ImageNetSketch" 3 | 
-------------------------------------------------------------------------------- /figs/fig2-cropped-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/fig2-cropped-1.png -------------------------------------------------------------------------------- /scripts/cocoop/README.md: -------------------------------------------------------------------------------- 1 | These scripts are only for reproducing the results on the CVPR'22 paper. -------------------------------------------------------------------------------- /scripts/coop/README.md: -------------------------------------------------------------------------------- 1 | These scripts are only for reproducing the results on the IJCV'22 paper. -------------------------------------------------------------------------------- /figs/ablate_flops_all-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/ablate_flops_all-1.png -------------------------------------------------------------------------------- /clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /figs/MVLPT_figures-cropped-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/figs/MVLPT_figures-cropped-1.png -------------------------------------------------------------------------------- /clip/__pycache__/clip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/clip.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_optimizer 2 | 3 | __all__ = ['build_optimizer'] 4 | -------------------------------------------------------------------------------- /clip/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /clip/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/dtd.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/dtd.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/coop.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/coop.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/mvlpt.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/mvlpt.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/bamboo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/bamboo.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/eurosat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/eurosat.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/food101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/food101.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/sun397.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/sun397.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/ucf101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/ucf101.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/cocoop.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/cocoop.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/zsclip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/zsclip.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/caltech101.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/caltech101.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_a.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_a.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_r.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_r.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenetv2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenetv2.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /clip/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/clip/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_21k.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_21k.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/oxford_pets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/oxford_pets.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/fgvc_aircraft.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/fgvc_aircraft.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/imagenet_sketch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/imagenet_sketch.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/oxford_flowers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/oxford_flowers.cpython-38.pyc -------------------------------------------------------------------------------- /datasets/__pycache__/stanford_cars.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/datasets/__pycache__/stanford_cars.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .comm import comm 2 | from .utils import create_logger 3 | 4 | 
__all__ = ['comm', 'create_logger'] 5 | -------------------------------------------------------------------------------- /trainers/__pycache__/imagenet_templates.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/__pycache__/imagenet_templates.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/common/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/__pycache__/constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/common/__pycache__/constants.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/metrics.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/prompts.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/prompts.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__pycache__/feature.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/evaluation/__pycache__/feature.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/gtsrb.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'gtsrb' 4 | ROOT: '../DATASET/gtsrb/' 5 | NUM_CLASSES: 43 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/hateful-memes.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'hateful-memes' 3 | ROOT: '../DATASET/hateful_memes/' 4 | NUM_CLASSES: 2 5 | TEST: 6 | METRIC: 'roc_auc' -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/mnist.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'mnist' 4 | ROOT: '../DATASET/mnist/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/hfpt_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/hfpt_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'dtd' 5 | ROOT: '../DATASET/dtd-v1/' 6 | NUM_CLASSES: 47 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sIncerass/MVLPT/HEAD/trainers/vision_benchmark/datasets/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/cifar10.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-10' 5 | ROOT: '../../DATASET/cifar10/' 6 | NUM_CLASSES: 10 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/cifar100.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-100' 5 | ROOT: '../DATASET/cifar100/' 6 | NUM_CLASSES: 100 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- 
/trainers/vision_benchmark/resources/datasets/country211.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'country211' 4 | ROOT: '../DATASET/country211/' 5 | NUM_CLASSES: 211 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/eurosat-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'eurosat_clip' 4 | ROOT: '../DATASET/eurosat_clip/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/fer2013.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'fer-2013' 5 | ROOT: '../DATASET/fer2013-v1/' 6 | NUM_CLASSES: 7 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/resisc45-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'resisc45_clip' 4 | ROOT: '../DATASET/resisc45_clip/' 5 | NUM_CLASSES: 45 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /trainers/vision_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .default import _C as config 2 | from .default import update_config 3 | from .models import MODEL_SPECS 4 | 5 | __all__ = ['config', 'update_config', 'MODEL_SPECS'] 6 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/rendered-sst2.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'rendered-sst2' 4 | ROOT: '../DATASET/rendered-sst2/' 5 | NUM_CLASSES: 2 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'caltech-101' 5 | ROOT: '../DATASET/caltech101-tf/' 6 | NUM_CLASSES: 102 7 | TEST: 8 | METRIC: 'mean-per-class' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/flower102.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-flower-102' 4 | ROOT: '../DATASET/flower102/' 5 | NUM_CLASSES: 102 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/oxford-iiit-pets.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-iiit-pets' 4 | ROOT: '../DATASET/pet37/' 5 | NUM_CLASSES: 37 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/patchcamelyon.yaml: 
-------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'patch-camelyon' 5 | ROOT: '../DATASET/patchcamelyon/' 6 | NUM_CLASSES: 2 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/stanfordcar.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'stanford-cars' 5 | ROOT: '../DATASET/stanfordcars/' 6 | NUM_CLASSES: 196 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/voc2007classification.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'voc-2007-classification' 4 | ROOT: '../DATASET/voc2007/' 5 | NUM_CLASSES: 20 6 | TEST: 7 | METRIC: '11point_mAP' 8 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/kitti-distance.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'kitti-distance' 4 | CENTER_CROP: false 5 | ROOT: '../DATASET/kitti_distance_20210923/' 6 | NUM_CLASSES: 4 7 | TEST: 8 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /lpclip/linear_probe.sh: -------------------------------------------------------------------------------- 1 | feature_dir=clip_feat 2 | 3 | for DATASET in OxfordPets 4 | do 5 | python linear_probe.py \ 6 | --dataset ${DATASET} \ 7 | --feature_dir ${feature_dir} \ 8 | --num_step 8 \ 9 | --num_run 3 10 | done 11 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'food-101' 6 | ROOT: '../DATASET/food101/' 7 | NUM_CLASSES: 101 8 | TEST: 9 | METRIC: 'accuracy' 10 | DEBUG: 11 | DEBUG: false 12 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/datasets/fgvc-aircraft-2013b.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'fgvc-aircraft-2013b-variants102' 6 | ROOT: '../DATASET/fgvc-aircraft-2013b-variants102/' 7 | NUM_CLASSES: 100 8 | TEST: 9 | METRIC: 'mean-per-class' 10 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: cls_example 12 | NUM_PARAMS_IN_M: 11 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/constants.py: -------------------------------------------------------------------------------- 1 | from vision_datasets import DatasetHub 2 | import pathlib 3 | 4 | 5 | def get_dataset_hub(): 6 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 
'resources' / 'datasets' / 'vision_datasets.json').read_text() 7 | hub = DatasetHub(vision_dataset_json) 8 | 9 | return hub 10 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompts import class_map, template_map, class_map_metric 2 | from .simple_tokenizer import SimpleTokenizer 3 | from .hfpt_tokenizer import HFPTTokenizer 4 | from .metrics import get_metric 5 | 6 | __all__ = ['class_map', 'template_map', 'SimpleTokenizer', 'HFPTTokenizer', 'class_map_metric', 'get_metric'] 7 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from typing import Union, List 6 | 7 | from .simple_tokenizer import SimpleTokenizer 8 | from .hfpt_tokenizer import HFPTTokenizer 9 | 10 | from .build import build_tokenizer 11 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature import extract_features, extract_text_features, construct_dataloader, \ 2 | construct_multitask_dataset 3 | 4 | __all__ = ['extract_features', 'linear_classifier', 'lr_classifier', 'extract_text_features', 'clip_zeroshot_evaluator', 'construct_dataloader', 'full_model_finetune', 'linear_classifier_contrast', 5 | 'construct_multitask_dataset'] 6 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_rendered-sst2.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "gpt3": [" Not positive or neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral."]}, {"classname": "positive", "gpt3": [" Not negative or neutral.", " Not negative or neutral.", " Not negative nor neutral.", " Not negative nor neutral.", " Not negative or neutral."]}] -------------------------------------------------------------------------------- /scripts/coop/zeroshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=ZeroshotCLIP 6 | DATASET=$1 7 | CFG=$2 # rn50, rn101, vit_b32 or vit_b16 8 | 9 | python train.py \ 10 | --root ${DATA} \ 11 | --trainer ${TRAINER} \ 12 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 13 | --config-file configs/trainers/CoOp/${CFG}.yaml \ 14 | --output-dir output/${TRAINER}/${CFG}/${DATASET} \ 15 | --eval-only -------------------------------------------------------------------------------- /scripts/mvlpt/zeroshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=ZeroshotCLIP 6 | DATASET=$1 7 | CFG=$2 # rn50, rn101, vit_b32 or vit_b16 8 | 9 | python train.py \ 10 | --root ${DATA} \ 11 | --trainer ${TRAINER} \ 12 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 13 | --config-file configs/trainers/CoOp/${CFG}.yaml \ 14 | --output-dir 
output/${TRAINER}/${CFG}/${DATASET} \ 15 | --eval-only -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_val.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 200 4 | TEST: 5 | BATCH_SIZE: 200 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | MODEL: 16 | BACKBONE: 17 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_val.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 200 4 | TEST: 5 | BATCH_SIZE: 200 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | MODEL: 16 | BACKBONE: 17 | NAME: "RN50" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/clip_example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_example 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' 16 | # Following configuration is needed for CLIP model. 17 | SPEC: 18 | TEXT: 19 | TOKENIZER: clip 20 | STYLE: clip 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | regex 3 | tqdm 4 | pytorch_lightning==1.4.0 5 | torchmetrics==0.6.0 6 | transformers==4.8.1 7 | Pillow==8.3.1 8 | tqdm==4.53.0 9 | ipdb==0.13.7 10 | numpy==1.19.2 11 | einops==0.3.0 12 | pyarrow==2.0.0 13 | sacred==0.8.2 14 | pandas==1.1.5 15 | git+https://github.com/rwightman/pytorch-image-models.git 16 | scipy 17 | tensorboardX 18 | opencv-python 19 | datasets 20 | nltk 21 | git+https://github.com/KaiyangZhou/Dassl.pytorch.git 22 | vision_datasets 23 | gdown -------------------------------------------------------------------------------- /lpclip/README.md: -------------------------------------------------------------------------------- 1 | # Linear Probe CLIP 2 | 3 | To run linear probe baselines, make sure that your current working directory is `lpclip/`. 4 | 5 | Step 1: Extract Features using the CLIP Image Encoder 6 | ```bash 7 | sh feat_extractor.sh 8 | ``` 9 | 10 | Step 2: Train few-shot linear probe 11 | ```bash 12 | sh linear_probe.sh 13 | ``` 14 | 15 | We follow the instructions stated in the Appendix A3 (pp.38) of [the original CLIP paper](https://arxiv.org/pdf/2103.00020.pdf), with a careful hyperparameter sweep. 16 | 17 | Note: please pull the latest Dassl (version >= `606a2c6`). 
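
As a rough illustration of the hyperparameter sweep mentioned in this README, the snippet below binary-searches the L2 regularization strength of a logistic-regression probe on pre-extracted features, in the spirit of Appendix A.3 of the CLIP paper. It is only a sketch: the function name and the in-memory feature arrays are illustrative assumptions and do not reflect the actual interface of `linear_probe.py` in this folder (only the `num_step` default mirrors the `--num_step 8` flag used in `linear_probe.sh`).

```python
# Hedged sketch of a CLIP-style linear-probe hyperparameter sweep.
# The function name and array inputs are illustrative assumptions, not the
# actual linear_probe.py interface in this repository.
from sklearn.linear_model import LogisticRegression


def sweep_linear_probe(train_feat, train_label, val_feat, val_label, num_step=8):
    """Binary-search log10(C) for an L2-regularized logistic-regression probe."""
    low, high = -6.0, 6.0          # search C over [1e-6, 1e6] on a log scale
    best_acc, best_clf = -1.0, None
    for _ in range(num_step):
        mid = (low + high) / 2.0
        acc_at = {}
        for log_c in (low, mid, high):
            clf = LogisticRegression(C=10.0 ** log_c, max_iter=1000)
            clf.fit(train_feat, train_label)
            acc_at[log_c] = clf.score(val_feat, val_label)
            if acc_at[log_c] > best_acc:
                best_acc, best_clf = acc_at[log_c], clf
        # keep the half of the interval whose endpoint performs better
        if acc_at[low] >= acc_at[high]:
            high = mid
        else:
            low = mid
    return best_clf, best_acc
```

The features here would be the ones written out by `feat_extractor.sh`; repeating the sweep over a few random few-shot splits (as the `--num_run` flag in `linear_probe.sh` suggests) and averaging validation accuracy gives a more stable choice of C.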
18 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/build.py: -------------------------------------------------------------------------------- 1 | from .hfpt_tokenizer import HFPTTokenizer 2 | from .simple_tokenizer import SimpleTokenizer 3 | 4 | 5 | def build_tokenizer(tokenizer_name): 6 | tokenizer = None 7 | if tokenizer_name == 'clip': 8 | tokenizer = SimpleTokenizer() 9 | elif 'hf_' in tokenizer_name: 10 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 11 | elif 'hfc_' in tokenizer_name: 12 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 13 | else: 14 | raise ValueError('Unknown tokenizer') 15 | 16 | return tokenizer 17 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/rendered-sst2_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "def_wiki": "Not positive nor neutral.", "path_wn": ["negative", "denial", "speech_act", "act", "event", "psychological_feature", "abstraction", "entity"], "def_wn": "a reply of denial"}, {"classname": "positive", "def_wiki": "Not negative or neutral.", "path_wn": ["positive", "adjective", "modifier", "content_word", "word", "language_unit", "part", "relation", "abstraction", "entity"], "def_wn": "the primary form of an adjective or adverb; denotes a quality without qualification, comparison, or relation to increase or diminution"}] -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn101.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | 
PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn101_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn101.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | 
NAME: "RN50" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b32.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b32_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_l14.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn101_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 
224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN101" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b32.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_l14.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14" -------------------------------------------------------------------------------- /lpclip/feat_extractor.sh: -------------------------------------------------------------------------------- 1 | # sh feat_extractor.sh 2 | DATA=/path/to/datasets 3 | OUTPUT='./clip_feat/' 4 | SEED=1 5 | 6 | # oxford_pets oxford_flowers fgvc_aircraft dtd eurosat stanford_cars food101 sun397 caltech101 ucf101 imagenet 7 | for DATASET in oxford_pets 8 | do 9 | for SPLIT in train val test 10 | do 11 | python feat_extractor.py \ 12 | --split ${SPLIT} \ 13 | --root ${DATA} \ 14 | --seed ${SEED} \ 15 | --dataset-config-file ../configs/datasets/${DATASET}.yaml \ 16 | --config-file ../configs/trainers/CoOp/rn50_val.yaml \ 17 | --output-dir ${OUTPUT} \ 18 | --eval-only 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 
1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep100.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b32_ep50.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 
| BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/32" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_l14_336.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (336, 336) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14@336px" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_l14_336.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (336, 336) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-L/14@336px" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/deit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/DEIT_BASE_PATCH16_224/' 3 | 4 | MODEL: 5 | NAME: deit_base_patch16_224 6 | NUM_PARAMS_IN_M: 86.5 7 | AUTHOR: 'timm' 8 | PRETRAINED_DATA: 'ImageNet1K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | 13 | TEST: 14 | BATCH_SIZE_PER_GPU: 128 15 | MODEL_FILE: '' 16 | 17 | TRAIN: 18 | BATCH_SIZE_PER_GPU: 64 19 | BEGIN_EPOCH: 0 20 | END_EPOCH: 10 21 | EXTRA_FINAL_TRAIN_EPOCH: 40 22 | OPTIMIZER: sgd 23 | WD: 0. 
24 | MOMENTUM: 0.9 25 | NESTEROV: false 26 | SHUFFLE: true 27 | LR_SCHEDULER: 28 | METHOD: 'WarmupCosine' 29 | WARMUP_EPOCH: 5 30 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/rn50_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/rn50_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "RN50" 30 | 31 | TRAINER: 32 | 
COOP: 33 | CTX_INIT: "a photo of a" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep50_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 50 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | 
-------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16_ep100_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16_ep100_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 100 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COOP: 33 | CTX_INIT: "a photo of a" 34 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/patch-camelyon_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "def_wiki": "Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue filled with lymphocytes and macrophages that collect and destroy bacteria, viruses and foreign matter from lymph. When the body is fighting an infection, these lymphocytes multiply rapidly and produce a characteristic swelling of the lymph nodes.", "path_wn": "", "def_wn": ""}, {"classname": "lymph node containing metastatic tumor tissue", "def_wiki": "Thin, woven, gauze-like fabric.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/VIT_BASE_PATCH16_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch16_224 10 | NUM_PARAMS_IN_M: 86.5 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 
28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c4_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 4 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c8_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 8 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vit_base_patch32_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VIT_BASE_PATCH32_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch32_224 10 | NUM_PARAMS_IN_M: 88.2 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 
28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c16_ep10_batch1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 16 34 | CTX_INIT: "" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /configs/trainers/CoOp/vit_b16.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | DATASET: 32 | VAL_SET: "" 33 | TEST_SET: "val" 34 | TRAIN_SET: "train" 35 | CENTER_CROP: False 36 | -------------------------------------------------------------------------------- /configs/trainers/MVLPT/vit_b16.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 32 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 200 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 5 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | DATASET: 32 | VAL_SET: "" 33 | TEST_SET: "val" 34 | TRAIN_SET: "train" 35 | CENTER_CROP: False 36 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/kitti-distance_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "def_wiki": "The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car nearby.", "def_wiki": "adjacent, near, close by", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car in the distance.", "def_wiki": "far away; a long 
distance away", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with no car.", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /configs/trainers/CoCoOp/vit_b16_c4_ep10_batch1_ctxv1.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | TRAIN_X: 3 | BATCH_SIZE: 1 4 | TEST: 5 | BATCH_SIZE: 100 6 | NUM_WORKERS: 8 7 | 8 | INPUT: 9 | SIZE: (224, 224) 10 | INTERPOLATION: "bicubic" 11 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 12 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 13 | TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"] 14 | 15 | OPTIM: 16 | NAME: "sgd" 17 | LR: 0.002 18 | MAX_EPOCH: 10 19 | LR_SCHEDULER: "cosine" 20 | WARMUP_EPOCH: 1 21 | WARMUP_TYPE: "constant" 22 | WARMUP_CONS_LR: 1e-5 23 | 24 | TRAIN: 25 | PRINT_FREQ: 20 26 | 27 | MODEL: 28 | BACKBONE: 29 | NAME: "ViT-B/16" 30 | 31 | TRAINER: 32 | COCOOP: 33 | N_CTX: 4 34 | CTX_INIT: "a photo of a" 35 | PREC: "fp16" -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/hateful-memes_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": ["meme", "acculturation", "content", "cognition", "psychological_feature", "abstraction", "entity"], "def_wn": "a cultural unit (an idea or value or pattern of behavior) that is passed from one person to another by non-genetic means (as by imitation)"}, {"classname": "hatespeech meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /scripts/cocoop/xd_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=imagenet 11 | SEED=$1 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 15 | SHOTS=16 16 | 17 | 18 | DIR=output/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/seed${SEED} 19 | if [ -d "$DIR" ]; then 20 | echo "Oops! 
The results exist at ${DIR} (so skip this job)" 21 | else 22 | python train.py \ 23 | --root ${DATA} \ 24 | --seed ${SEED} \ 25 | --trainer ${TRAINER} \ 26 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 27 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 28 | --output-dir ${DIR} \ 29 | DATASET.NUM_SHOTS ${SHOTS} 30 | fi -------------------------------------------------------------------------------- /scripts/coop/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=CoOp 6 | SHOTS=16 7 | NCTX=16 8 | CSC=False 9 | CTP=end 10 | 11 | DATASET=$1 12 | CFG=$2 13 | 14 | for SEED in 1 2 3 15 | do 16 | python train.py \ 17 | --root ${DATA} \ 18 | --seed ${SEED} \ 19 | --trainer ${TRAINER} \ 20 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 21 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 22 | --output-dir output/evaluation/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/${DATASET}/seed${SEED} \ 23 | --model-dir output/imagenet/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} \ 24 | --load-epoch 50 \ 25 | --eval-only \ 26 | TRAINER.COOP.N_CTX ${NCTX} \ 27 | TRAINER.COOP.CSC ${CSC} \ 28 | TRAINER.COOP.CLASS_TOKEN_POSITION ${CTP} 29 | done -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/mae_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mae_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/mocov3_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mocov3_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/moco-v3/vit-b-300ep/vit-b-300ep.pth.tar' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 
30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /scripts/cocoop/xd_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 15 | SHOTS=16 16 | 17 | 18 | DIR=output/evaluation/${TRAINER}/${CFG}_${SHOTS}shots/${DATASET}/seed${SEED} 19 | if [ -d "$DIR" ]; then 20 | echo "Oops! The results exist at ${DIR} (so skip this job)" 21 | else 22 | python train.py \ 23 | --root ${DATA} \ 24 | --seed ${SEED} \ 25 | --trainer ${TRAINER} \ 26 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 27 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 28 | --output-dir ${DIR} \ 29 | --model-dir output/imagenet/${TRAINER}/${CFG}_${SHOTS}shots/seed${SEED} \ 30 | --load-epoch 10 \ 31 | --eval-only 32 | fi -------------------------------------------------------------------------------- /scripts/cocoop/base2new_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ctxv1 # uncomment this when TRAINER=CoOp 15 | # CFG=vit_b16_ep50_ctxv1 # uncomment this when TRAINER=CoOp and DATASET=imagenet 16 | SHOTS=16 17 | 18 | 19 | DIR=output/base2new/train_base/${DATASET}/shots_${SHOTS}/${TRAINER}/${CFG}/seed${SEED} 20 | if [ -d "$DIR" ]; then 21 | echo "Oops! The results exist at ${DIR} (so skip this job)" 22 | else 23 | python train.py \ 24 | --root ${DATA} \ 25 | --seed ${SEED} \ 26 | --trainer ${TRAINER} \ 27 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 28 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 29 | --output-dir ${DIR} \ 30 | DATASET.NUM_SHOTS ${SHOTS} \ 31 | DATASET.SUBSAMPLE_CLASSES base 32 | fi -------------------------------------------------------------------------------- /scripts/cocoop/base2new_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 4 | 5 | # custom config 6 | DATA=/path/to/datasets 7 | TRAINER=CoCoOp 8 | # TRAINER=CoOp 9 | 10 | DATASET=$1 11 | SEED=$2 12 | 13 | CFG=vit_b16_c4_ep10_batch1_ctxv1 14 | # CFG=vit_b16_ctxv1 # uncomment this when TRAINER=CoOp 15 | SHOTS=16 16 | LOADEP=10 17 | SUB=new 18 | 19 | 20 | COMMON_DIR=${DATASET}/shots_${SHOTS}/${TRAINER}/${CFG}/seed${SEED} 21 | MODEL_DIR=output/base2new/train_base/${COMMON_DIR} 22 | DIR=output/base2new/test_${SUB}/${COMMON_DIR} 23 | if [ -d "$DIR" ]; then 24 | echo "Oops! 
The results exist at ${DIR} (so skip this job)" 25 | else 26 | python train.py \ 27 | --root ${DATA} \ 28 | --seed ${SEED} \ 29 | --trainer ${TRAINER} \ 30 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 31 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 32 | --output-dir ${DIR} \ 33 | --model-dir ${MODEL_DIR} \ 34 | --load-epoch ${LOADEP} \ 35 | --eval-only \ 36 | DATASET.NUM_SHOTS ${SHOTS} \ 37 | DATASET.SUBSAMPLE_CLASSES ${SUB} 38 | fi -------------------------------------------------------------------------------- /scripts/coop/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | DATA=/path/to/datasets 5 | TRAINER=CoOp 6 | 7 | DATASET=$1 8 | CFG=$2 # config file 9 | CTP=$3 # class token position (end or middle) 10 | NCTX=$4 # number of context tokens 11 | SHOTS=$5 # number of shots (1, 2, 4, 8, 16) 12 | CSC=$6 # class-specific context (False or True) 13 | 14 | for SEED in 1 2 3 15 | do 16 | DIR=output/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 17 | if [ -d "$DIR" ]; then 18 | echo "Oops! The results exist at ${DIR} (so skip this job)" 19 | else 20 | python train.py \ 21 | --root ${DATA} \ 22 | --seed ${SEED} \ 23 | --trainer ${TRAINER} \ 24 | --dataset-config-file configs/datasets/${DATASET}.yaml \ 25 | --config-file configs/trainers/${TRAINER}/${CFG}.yaml \ 26 | --output-dir ${DIR} \ 27 | TRAINER.COOP.N_CTX ${NCTX} \ 28 | TRAINER.COOP.CSC ${CSC} \ 29 | TRAINER.COOP.CLASS_TOKEN_POSITION ${CTP} \ 30 | DATASET.NUM_SHOTS ${SHOTS} 31 | fi 32 | done -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/16' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kaiyang Zhou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def log_arg_env_config(args, config, output_dir): 8 | logging.info("=> collecting env info (might take some time)") 9 | logging.info("\n" + get_pretty_env_info()) 10 | logging.info(pprint.pformat(args)) 11 | logging.info(config) 12 | logging.info(f'=> saving logging info into: {output_dir}') 13 | 14 | 15 | def submit_predictions(prediction_list, submit_by, config, track, task): 16 | from vision_benchmark.commands.submit_predictions import submit_predictions_to_leaderboard, submit_model_to_leaderboard 17 | 18 | submission = { 19 | 'dataset_name': config.DATASET.DATASET, 20 | 'model_name': config.MODEL.NAME, 21 | 'track': track, 22 | 'task': task, 23 | 'created_by': submit_by, 24 | 'predictions': [prediction_list] 25 | } 26 | 27 | logging.info('Submit model and predictions to leaderboard.') 28 | submit_predictions_to_leaderboard(submission) 29 | 30 | model_info = { 31 | "name": config.MODEL.NAME, 32 | "author": config.MODEL.AUTHOR, 33 | "num_params_in_millions": config.MODEL.NUM_PARAMS_IN_M, 34 | "pretrained_data": config.MODEL.PRETRAINED_DATA, 35 | "creation_time": config.MODEL.CREATION_TIME 36 | } 37 | 38 | submit_model_to_leaderboard(model_info) 39 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/fer-2013_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "def_wiki": "Displaying or feeling anger.", "path_wn": ["angry"], "def_wn": "feeling or showing anger"}, {"classname": "disgusted", "def_wiki": "Filled with disgust.", "path_wn": ["disgust", "dislike", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "fill with distaste"}, {"classname": "fearful", "def_wiki": "Frightening.", "path_wn": ["fearful"], "def_wn": "experiencing or showing fear"}, {"classname": "happy", "def_wiki": "Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", "path_wn": ["happy"], "def_wn": "enjoying or showing or marked by joy or pleasure"}, {"classname": "neutral", "def_wiki": "Not taking sides in a conflict such as war; nonaligned.", "path_wn": ["neutral", "person", "causal_agent", "physical_entity", "entity"], "def_wn": "one who does not side with any party in a war or dispute"}, {"classname": "sad", "def_wiki": "Emotionally negative.", "path_wn": ["sad"], "def_wn": "experiencing or showing sorrow or unhappiness; ; - Christina Rossetti"}, {"classname": "surprised", "def_wiki": "Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", "path_wn": ["surprise", "astonishment", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "cause to be surprised"}] -------------------------------------------------------------------------------- /datasets/imagenet_sketch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, 
DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | 9 | @DATASET_REGISTRY.register() 10 | class ImageNetSketch(DatasetBase): 11 | """ImageNet-Sketch. 12 | 13 | This dataset is used for testing only. 14 | """ 15 | 16 | dataset_dir = "imagenet-sketch" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | self.image_dir = os.path.join(self.dataset_dir, "images") 22 | 23 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 24 | classnames = ImageNet.read_classnames(text_file) 25 | 26 | data = self.read_data(classnames) 27 | 28 | super().__init__(train_x=data, test=data) 29 | 30 | def read_data(self, classnames): 31 | image_dir = self.image_dir 32 | folders = listdir_nohidden(image_dir, sort=True) 33 | items = [] 34 | 35 | for label, folder in enumerate(folders): 36 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 37 | classname = classnames[folder] 38 | for imname in imnames: 39 | impath = os.path.join(image_dir, folder, imname) 40 | item = Datum(impath=impath, label=label, classname=classname) 41 | items.append(item) 42 | 43 | return items 44 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pathlib import Path 6 | 7 | import os 8 | import logging 9 | import time 10 | 11 | from .comm import comm 12 | 13 | 14 | def setup_logger(final_output_dir, rank, phase): 15 | time_str = time.strftime('%Y-%m-%d-%H-%M') 16 | log_file = f'{phase}_{time_str}_rank{rank}.txt' 17 | final_log_file = os.path.join(final_output_dir, log_file) 18 | head = "%(asctime)-15s:[P:%(process)d]:" + comm.head + ' %(message)s' 19 | logging.basicConfig( 20 | filename=str(final_log_file), format=head 21 | ) 22 | logger = logging.getLogger() 23 | logger.setLevel(logging.INFO) 24 | console = logging.StreamHandler() 25 | console.setFormatter( 26 | logging.Formatter(head) 27 | ) 28 | logging.getLogger('').addHandler(console) 29 | 30 | 31 | def create_logger(cfg, phase='train'): 32 | root_output_dir = Path(cfg.OUTPUT_DIR) 33 | dataset = cfg.DATASET.DATASET 34 | cfg_name = cfg.NAME 35 | 36 | final_output_dir = root_output_dir / dataset / cfg_name 37 | 38 | print('=> creating {} ...'.format(root_output_dir)) 39 | root_output_dir.mkdir(parents=True, exist_ok=True) 40 | print('=> creating {} ...'.format(final_output_dir)) 41 | final_output_dir.mkdir(parents=True, exist_ok=True) 42 | 43 | print('=> setup logger ...') 44 | setup_logger(final_output_dir, cfg.RANK, phase) 45 | 46 | return str(final_output_dir) 47 | 48 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_patch-camelyon.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "gpt3": [" A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infection by trapping ger", " Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. 
They act as filters, with an internal honeycomb of connective tissue", " Thin, woven, gauze-like fabric.", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger"]}, {"classname": "lymph node containing metastatic tumor tissue", "gpt3": [" Thin, woven, gauze-like fabric.", " Thin, woven, gauze-like fabric.", " A small, oval, soft, elastic body of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of", " Thin, woven, gauze-like fabric.", " A small, oval, solid organ of the lymphatic system, distributed along the lymphatic vessels, that acts as a filter for bacteria, viruses, and foreign matter."]}] -------------------------------------------------------------------------------- /datasets/imagenetv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | 9 | @DATASET_REGISTRY.register() 10 | class ImageNetV2(DatasetBase): 11 | """ImageNetV2. 12 | 13 | This dataset is used for testing only. 14 | """ 15 | 16 | dataset_dir = "imagenetv2" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | image_dir = "imagenetv2-matched-frequency-format-val" 22 | self.image_dir = os.path.join(self.dataset_dir, image_dir) 23 | 24 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 25 | classnames = ImageNet.read_classnames(text_file) 26 | 27 | data = self.read_data(classnames) 28 | 29 | super().__init__(train_x=data, test=data) 30 | 31 | def read_data(self, classnames): 32 | image_dir = self.image_dir 33 | folders = list(classnames.keys()) 34 | items = [] 35 | 36 | for label in range(1000): 37 | class_dir = os.path.join(image_dir, str(label)) 38 | imnames = listdir_nohidden(class_dir) 39 | folder = folders[label] 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(class_dir, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /datasets/imagenet_r.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | TO_BE_IGNORED = ["README.txt"] 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNetR(DatasetBase): 13 | """ImageNet-R(endition). 14 | 15 | This dataset is used for testing only. 
16 | """ 17 | 18 | dataset_dir = "imagenet-rendition" 19 | 20 | def __init__(self, cfg): 21 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 22 | self.dataset_dir = os.path.join(root, self.dataset_dir) 23 | self.image_dir = os.path.join(self.dataset_dir, "imagenet-r") 24 | 25 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 26 | classnames = ImageNet.read_classnames(text_file) 27 | 28 | data = self.read_data(classnames) 29 | 30 | super().__init__(train_x=data, test=data) 31 | 32 | def read_data(self, classnames): 33 | image_dir = self.image_dir 34 | folders = listdir_nohidden(image_dir, sort=True) 35 | folders = [f for f in folders if f not in TO_BE_IGNORED] 36 | items = [] 37 | 38 | for label, folder in enumerate(folders): 39 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(image_dir, folder, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /datasets/imagenet_a.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 4 | from dassl.utils import listdir_nohidden 5 | 6 | from .imagenet import ImageNet 7 | 8 | TO_BE_IGNORED = ["README.txt"] 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNetA(DatasetBase): 13 | """ImageNet-A(dversarial). 14 | 15 | This dataset is used for testing only. 16 | """ 17 | 18 | dataset_dir = "imagenet-adversarial" 19 | 20 | def __init__(self, cfg): 21 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 22 | self.dataset_dir = os.path.join(root, self.dataset_dir) 23 | self.image_dir = os.path.join(self.dataset_dir, "imagenet-a") 24 | 25 | text_file = os.path.join(self.dataset_dir, "classnames.txt") 26 | classnames = ImageNet.read_classnames(text_file) 27 | 28 | data = self.read_data(classnames) 29 | 30 | super().__init__(train_x=data, test=data) 31 | 32 | def read_data(self, classnames): 33 | image_dir = self.image_dir 34 | folders = listdir_nohidden(image_dir, sort=True) 35 | folders = [f for f in folders if f not in TO_BE_IGNORED] 36 | items = [] 37 | 38 | for label, folder in enumerate(folders): 39 | imnames = listdir_nohidden(os.path.join(image_dir, folder)) 40 | classname = classnames[folder] 41 | for imname in imnames: 42 | impath = os.path.join(image_dir, folder, imname) 43 | item = Datum(impath=impath, label=label, classname=classname) 44 | items.append(item) 45 | 46 | return items 47 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_SLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'slip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'SLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for runing linear probe with Pytorch based linear model. 
15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | clip: 42 | use_allgather: False 43 | return_sim: True 44 | feature_dim: 768 45 | sim_dim: 256 46 | 47 | TEST: 48 | BATCH_SIZE_PER_GPU: 128 49 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/SLIP_YFCC15M_vitb32.pth.tar' 50 | 51 | TRAIN: 52 | BATCH_SIZE_PER_GPU: 64 53 | BEGIN_EPOCH: 0 54 | END_EPOCH: 10 55 | EXTRA_FINAL_TRAIN_EPOCH: 40 56 | OPTIMIZER: sgd 57 | WD: 0. 58 | MOMENTUM: 0.9 59 | NESTEROV: false 60 | SHUFFLE: true 61 | LR_SCHEDULER: 62 | METHOD: 'WarmupCosine' 63 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_DeCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 3072 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 3072 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 3072 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_DeCLIP_YFCC15M.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_yfcc_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 
15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 512 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_YFCC15M_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/config/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from yacs.config import CfgNode as CN 6 | 7 | # high_resolution_net related params for classification 8 | HIGH_RESOLUTION_NET = CN() 9 | HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] 10 | HIGH_RESOLUTION_NET.STEM_INPLANES = 64 11 | HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 12 | HIGH_RESOLUTION_NET.WITH_HEAD = True 13 | 14 | HIGH_RESOLUTION_NET.STAGE2 = CN() 15 | HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 16 | HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 17 | HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] 18 | HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] 19 | HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' 20 | HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'CAT' 21 | 22 | HIGH_RESOLUTION_NET.STAGE3 = CN() 23 | HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 24 | HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 25 | HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] 26 | HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] 27 | HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' 28 | HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'CAT' 29 | 30 | HIGH_RESOLUTION_NET.STAGE4 = CN() 31 | HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 32 | HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 33 | HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] 34 | HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] 35 | HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' 36 | HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'CAT' 37 | 38 | RESNEXT = CN() 39 | RESNEXT.NUM_LAYERS = 50 40 | RESNEXT.BASE_WIDTH = 4 41 | RESNEXT.CARDINALITY = 32 42 | RESNEXT.KERNEL_SIZE_STEM = 7 43 | 44 | RESNET = CN() 45 | RESNET.NUM_LAYERS = 50 46 | RESNET.KERNEL_SIZE_STEM = 7 47 | 48 | 49 | MODEL_SPECS = { 50 | 'cls_hrnet': HIGH_RESOLUTION_NET, 51 | } 52 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/vitb32_FILIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | 
NAME: 'filip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'FILIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with Pytorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 768 17 | DENSE_EVAL: true 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 32 21 | WIDTH: 384 22 | LAYERS: 12 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | SKIP_TOKENIZE: true 32 | DECLIP: 33 | image_encode: 34 | embed_dim: 768 35 | text_encode: 36 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 37 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 38 | text_model_utils: 39 | random: False 40 | freeze: False 41 | embed_dim: 768 42 | clip: 43 | mask_rate: 0.5 44 | patch_number: 14 45 | use_allgather: False 46 | text_mask_type: MLM 47 | return_nn_bank: False 48 | return_dense: True 49 | feature_dim: 768 50 | select_topk: True 51 | 52 | TEST: 53 | BATCH_SIZE_PER_GPU: 128 54 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/FILIP_YFCC15M_vitb32.pth.tar' 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/eurosat_clip_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "forest", "def_wiki": "A dense uncultivated tract of trees and undergrowth, larger than woods.", "path_wn": ["forest", "vegetation", "collection", "group", "abstraction", "entity"], "def_wn": "the trees and other plants in a large densely wooded area"}, {"classname": "brushland or shrubland", "def_wiki": "Land that is covered mostly with shrubs.", "path_wn": "", "def_wn": ""}, {"classname": "highway or road", "def_wiki": "A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard: a country road is the same as a country lane.", "path_wn": "", "def_wn": ""}, {"classname": "industrial buildings or commercial buildings", "def_wiki": "The act or process by which something is built; construction.", "path_wn": "", "def_wn": ""}, {"classname": "pasture land", "def_wiki": "land used for grazing animals", "path_wn": "", "def_wn": ""}, {"classname": "permanent crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "residential buildings or homes or apartments", "def_wiki": "A complete domicile occupying only part of a building, especially one for rent; a flat.", "path_wn": "", "def_wn": ""}, {"classname": "river", "def_wiki": "A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", "path_wn": ["river", "stream", "body_of_water", "thing", "physical_entity", "entity"], "def_wn": "a large natural stream of water (larger than a creek)"}, {"classname": "lake or sea", "def_wiki": "A large body of salt water.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_hateful-memes.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}, {"classname": "hatespeech meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}] -------------------------------------------------------------------------------- /scripts/interpret_prompt.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | 6 | from clip.simple_tokenizer import SimpleTokenizer 7 | from clip import clip 8 | 9 | 10 | def load_clip_to_cpu(backbone_name="RN50"): 11 | url = clip._MODELS[backbone_name] 12 | model_path = clip._download(url) 13 | 14 | try: 15 | # loading JIT archive 16 | model = torch.jit.load(model_path, map_location="cpu").eval() 17 | state_dict = None 18 | 19 | except RuntimeError: 20 | state_dict = torch.load(model_path, map_location="cpu") 21 | 22 | model = clip.build_model(state_dict or model.state_dict()) 23 | 24 | return model 25 | 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("fpath", type=str, help="Path to the learned prompt") 29 | parser.add_argument("topk", type=int, help="Select top-k similar words") 30 | args = parser.parse_args() 31 | 32 | fpath = args.fpath 33 | topk = args.topk 34 | 35 | assert os.path.exists(fpath) 36 | 37 | print(f"Return the top-{topk} matched words") 38 | 39 | tokenizer = SimpleTokenizer() 40 | clip_model = load_clip_to_cpu() 41 | token_embedding = clip_model.token_embedding.weight 42 | print(f"Size of token embedding: {token_embedding.shape}") 43 | 44 | prompt_learner = torch.load(fpath, map_location="cpu")["state_dict"] 45 | ctx = prompt_learner["ctx"] 46 | ctx = ctx.float() 47 | print(f"Size of context: {ctx.shape}") 48 | 49 | if ctx.dim() == 2: 50 | # Generic context 51 | distance = torch.cdist(ctx, token_embedding) 52 | print(f"Size of distance matrix: {distance.shape}") 53 | sorted_idxs = torch.argsort(distance, dim=1) 54 | sorted_idxs = sorted_idxs[:, :topk] 55 | 56 | for m, idxs in enumerate(sorted_idxs): 57 | words = [tokenizer.decoder[idx.item()] for idx in idxs] 58 | dist = [f"{distance[m, idx].item():.4f}" for idx in idxs] 59 | print(f"{m+1}: {words} {dist}") 60 | 61 | elif ctx.dim() == 3: 62 | # Class-specific context 63 | raise NotImplementedError 64 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_mnist.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "gpt3": [" 0.", " The number zero (0).", " A particle used for marking the following verb as an infinitive.", " 0.", " To be in a state of confusion."]}, {"classname": "1", "gpt3": [" 1.", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The act of ingesting.", " The number one (1)."]}, {"classname": "2", "gpt3": [" A particle used for marking the following verb as an infinitive.", " The number two (2).", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The first person to visit the moon."]}, {"classname": "3", "gpt3": [" 0.", " The CIA.", " Because, as, since.", " A particle used for marking the following verb as an infinitive.", " Because, as, since."]}, {"classname": "4", "gpt3": [" The number four (4).", " The first of the four basic operations of arithmetic, that is, the operation of finding the remainder when one number is divided by another.", " Because, as, since.", " A type of small, flat, round cake.", " To be in a state of disrepair."]}, {"classname": "5", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " A kind of animal."]}, {"classname": "6", "gpt3": [" A particle used 
for marking the following verb as an infinitive.", " To be able to.", " The first person to be killed in the novel.", " The number six (6).", " Because, as, since."]}, {"classname": "7", "gpt3": [" To be in a state of disrepair.", " To be ingested.", " To be in a state of confusion.", " To be in a state of being.", " Because, as, since."]}, {"classname": "8", "gpt3": [" To ingest; to be ingested.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " Because, as, since."]}, {"classname": "9", "gpt3": [" The number nine (9).", " A particle used for marking the preceding noun as a subject.", " The number nine (9).", " To be in a state of being.", " The first person to do something."]}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/mnist_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "def_wiki": "0.", "path_wn": ["nothing", "relative_quantity", "measure", "abstraction", "entity"], "def_wn": "a mathematical element that when added to another number yields the same number"}, {"classname": "1", "def_wiki": "The number one (1).", "path_wn": ["one", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the smallest whole number or a numeral representing this number"}, {"classname": "2", "def_wiki": "A particle used for marking the following verb as an infinitive.", "path_wn": ["two", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one or a numeral representing this number"}, {"classname": "3", "def_wiki": null, "path_wn": ["three", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one and one"}, {"classname": "4", "def_wiki": "Because, as, since.", "path_wn": ["four", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of three and one"}, {"classname": "5", "def_wiki": null, "path_wn": ["five", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of four and one"}, {"classname": "6", "def_wiki": "MI6; the agency or a particular agent.", "path_wn": ["six", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of five and one"}, {"classname": "7", "def_wiki": null, "path_wn": ["seven", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of six and one"}, {"classname": "8", "def_wiki": "To ingest; to be ingested.", "path_wn": ["eight", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of seven and one"}, {"classname": "9", "def_wiki": null, "path_wn": ["nine", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of eight and one"}] -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_kitti-distance.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a 
photo i took of a car on my left or right side.", "gpt3": [" The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", " beside, next to, by", " a wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project."]}, {"classname": "a photo i took with a car nearby.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " near; close", " Nearby; close by; close at hand; close to hand; close by; close to; close on; close to at hand; close to hand; close by; close to; close on; close to at hand; close to hand;"]}, {"classname": "a photo i took with a car in the distance.", "gpt3": [" far away; a long distance away", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " far away; a long distance away", " distant, far away, far off", " far away; a long distance away"]}, {"classname": "a photo i took with no car.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car; no cars", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation."]}] -------------------------------------------------------------------------------- /datasets/food101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class Food101(DatasetBase): 13 | 14 | dataset_dir = "food-101" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "images") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_Food101.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | train, val, test = DTD.read_and_split_data(self.image_dir) 28 | OxfordPets.save_split(train, val, test, self.split_path, 
self.image_dir) 29 | 30 | num_shots = cfg.DATASET.NUM_SHOTS 31 | if num_shots >= 1: 32 | seed = cfg.SEED 33 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 34 | 35 | if os.path.exists(preprocessed): 36 | print(f"Loading preprocessed few-shot data from {preprocessed}") 37 | with open(preprocessed, "rb") as file: 38 | data = pickle.load(file) 39 | train, val = data["train"], data["val"] 40 | else: 41 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 42 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 43 | data = {"train": train, "val": val} 44 | print(f"Saving preprocessed few-shot data to {preprocessed}") 45 | with open(preprocessed, "wb") as file: 46 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 47 | 48 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 49 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 50 | 51 | super().__init__(train_x=train, val=val, test=test) 52 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/model/clip_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_swin 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K_YFCC15M' 15 | CREATION_TIME: '2021-10-27' 16 | # Following configuration is needed for CLIP model. 17 | PRETRAINED: '' 18 | PRETRAINED_LAYERS: ['*'] 19 | SPEC: 20 | EMBED_DIM: 512 21 | GATHER_TENSORS: True 22 | TEXT: 23 | TOKENIZER: clip 24 | CONTEXT_LENGTH: 77 25 | WIDTH: 512 26 | HEADS: 8 27 | LAYERS: 12 28 | VISION: 29 | PATCH_SIZE: 4 30 | IN_CHANS: 3 31 | EMBED_DIM: 96 32 | DEPTHS: [2, 2, 6, 2] 33 | NUM_HEADS: [3, 6, 12, 24] 34 | WINDOW_SIZE: 7 35 | MLP_RATIO: 4. 36 | QKV_BIAS: True 37 | APE: False 38 | PATCH_NORM: True 39 | DROP_RATE: 0.0 40 | DROP_PATH_RATE: 0.0 41 | 42 | KNOWLEDGE: 43 | WORDNET: 44 | USE_HIERARCHY: False # False 45 | USE_DEFINITION: False # True 46 | 47 | # DATASET: 48 | # DATASET: 'imagenet' 49 | # ROOT: ../../data/zeroshot/classification/imagenet 50 | OUTPUT_DIR: /home/chunyl/azure_mount/chunyleu_output/cvinwild/ic_benchmark/debug/swin_tiny/unicl_imagenet21k 51 | # ../../output/hcl_exp/hcl_yfcc15m_half_imagenet22k_half/wordnet_h_true_d_false 52 | TEST: 53 | MODEL_FILE: '/home/chunyl/azure_mount/chunyleu_output/ckpts/benchmark/swin_tiny/unicl_imagenet21k/model_state_dict.pt' 54 | BATCH_SIZE_PER_GPU: 128 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 
63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 69 | 70 | # hcl_imagenet_21k_wiki 71 | # hcl_imagenet21k 72 | # hcl_yfcc15m_half_imagenet21k_half_multitask 73 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_half_imagenet22k_half/model_state_dict.pt' 74 | 75 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k_multitask/model_state_dict.pt' 76 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k/model_state_dict.pt' 77 | 78 | # hcl_imagenet22k hcl_yfcc15m hcl_yfcc15m_half_imagenet21k_half hcl_yfcc15m_half_imagenet22k_half hcl_yfcc15m_imagenet21k hcl_yfcc15m_imagenet22k hcl_yfcc15m_imagenet22k_multitask 79 | # hcl_imagenet1k 80 | -------------------------------------------------------------------------------- /scripts/avg_ckpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from dassl.utils import ( 4 | MetricMeter, AverageMeter, tolist_if_not, count_num_param, load_checkpoint, 5 | save_checkpoint, mkdir_if_missing, resume_from_checkpoint, 6 | load_pretrained_weights 7 | ) 8 | from collections import OrderedDict 9 | import os.path as osp 10 | 11 | seeds = [1, 2, 3] 12 | 13 | ckpt_dir = "//tmp//Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101/" 14 | # ckpt_path = f"{ckpt_dir}/VPT/vit_b16_5shots/nctx16_csc_ctp/" 15 | ckpt_path = f"{ckpt_dir}/CoOp/vit_b16_5shots/nctx16_csc_ctp/" 16 | 17 | ckpt_dir = "prompt_learner/" 18 | ckpt_name = "model-best.pth.tar" 19 | import numpy as np 20 | 21 | def average_ckpt(state_dict, ignore=['optimizer', 'scheduler']): 22 | new_dict = dict() 23 | print(state_dict['val_result'], state_dict['epoch']) 24 | for key in state_dict: 25 | if key in ignore: 26 | continue 27 | if isinstance(state_dict[key][0], int): 28 | new_dict[key] = int(np.average(state_dict[key])) 29 | elif isinstance(state_dict[key][0], float): 30 | new_dict[key] = np.average(state_dict[key]) 31 | elif isinstance(state_dict[key][0], dict): 32 | avg_dict = dict() 33 | for ckpt_id in range(len(state_dict[key])): 34 | for param_key in state_dict[key][ckpt_id]: 35 | if param_key not in avg_dict: 36 | avg_dict[param_key] = [] 37 | avg_dict[param_key].append( state_dict[key][ckpt_id][param_key] ) 38 | for param_key in avg_dict: 39 | # print(avg_dict[param_key][0].shape) 40 | avg_dict[param_key] = torch.stack( avg_dict[param_key] ).mean(dim=0) 41 | # print(avg_dict[param_key].shape) 42 | new_dict[key] = dict(avg_dict) 43 | return new_dict 44 | 45 | state = {} 46 | for seed in seeds: 47 | model_path = f"{ckpt_path}/seed{seed}/{ckpt_dir}{ckpt_name}" 48 | checkpoint = load_checkpoint(model_path) 49 | for key in checkpoint: 50 | if key not in state: state[key] = [] 51 | state[key].append( checkpoint[key] ) 52 | 53 | avg_ckpt = average_ckpt(state) 54 | 55 | print(avg_ckpt.keys()) 56 | print(avg_ckpt['val_result']) 57 | print(osp.join(ckpt_path, ckpt_dir)) 58 | save_checkpoint( 59 | { 60 | "state_dict": avg_ckpt['state_dict'], 61 | "epoch": avg_ckpt['epoch'], 62 | "val_result": avg_ckpt['val_result'], 63 | }, 64 | osp.join(ckpt_path, ckpt_dir), 65 | is_best=True, 66 | ) -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_fer-2013.tsv: -------------------------------------------------------------------------------- 1 | 
[{"classname": "angry", "gpt3": [" Feeling or showing anger.", " Displaying or feeling anger.", " Feeling or showing anger; wrathful; irate.", " Displaying or feeling anger.", " A feeling of displeasure and antagonism aroused by a wrong; wrath; ire; rage."]}, {"classname": "disgusted", "gpt3": [" Having a feeling of loathing or revulsion; repulsed.", " Feeling or displaying disgust.", " Affected with a feeling of loathing or repugnance; having a feeling of extreme repugnance or aversion.", " Filled with disgust.", " Feeling or showing a strong dislike of something."]}, {"classname": "fearful", "gpt3": [" Causing fear.", " Causing fear.", " Frightening.", " Frightening.", " Causing fear or dread; frightening."]}, {"classname": "happy", "gpt3": [" Emotionally positive.", " Feeling or showing pleasure and satisfaction.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous."]}, {"classname": "neutral", "gpt3": [" Not showing any emotion.", " Having no particular feeling or interest; not caring one way or the other; indifferent.", " Not taking sides in a conflict such as war; nonaligned.", " Neither positive nor negative.", " Neither good nor bad."]}, {"classname": "sad", "gpt3": [" Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; gloomy; dismal; dreary; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; melancholy; dismal; gloomy; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; d"]}, {"classname": "surprised", "gpt3": [" Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Astonished; astounded; amazed; dumbfounded; flabbergasted; flummoxed; flabbergasted; thunderstruck; dumbstruck; thunderstruck; dumbstruck; thunderstruck; dumbstruck;", " Surprised.", " Surprised."]}] -------------------------------------------------------------------------------- /datasets/caltech101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | IGNORED = ["BACKGROUND_Google", "Faces_easy"] 11 | NEW_CNAMES = { 12 | "airplanes": "airplane", 13 | "Faces": "face", 14 | "Leopards": "leopard", 15 | "Motorbikes": "motorbike", 16 | } 17 | 18 | 19 | @DATASET_REGISTRY.register() 20 | class Caltech101(DatasetBase): 21 | 22 | dataset_dir = "caltech-101" 23 | 24 | def __init__(self, cfg): 25 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 26 | self.dataset_dir = os.path.join(root, self.dataset_dir) 27 | 
self.image_dir = os.path.join(self.dataset_dir, "101_ObjectCategories") 28 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_Caltech101.json") 29 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 30 | mkdir_if_missing(self.split_fewshot_dir) 31 | 32 | if os.path.exists(self.split_path): 33 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 34 | else: 35 | train, val, test = DTD.read_and_split_data(self.image_dir, ignored=IGNORED, new_cnames=NEW_CNAMES) 36 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 37 | 38 | num_shots = cfg.DATASET.NUM_SHOTS 39 | if num_shots >= 1: 40 | seed = cfg.SEED 41 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 42 | 43 | if os.path.exists(preprocessed): 44 | print(f"Loading preprocessed few-shot data from {preprocessed}") 45 | with open(preprocessed, "rb") as file: 46 | data = pickle.load(file) 47 | train, val = data["train"], data["val"] 48 | else: 49 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 50 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 51 | data = {"train": train, "val": val} 52 | print(f"Saving preprocessed few-shot data to {preprocessed}") 53 | with open(preprocessed, "wb") as file: 54 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 55 | 56 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 57 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 58 | 59 | super().__init__(train_x=train, val=val, test=test) 60 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/commands/prepare_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | submit predictions to leaderboard service 3 | """ 4 | import argparse 5 | from collections import defaultdict 6 | import json 7 | import logging 8 | import pathlib 9 | import zipfile 10 | import itertools 11 | import numpy as np 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Submit predictions to leaderboard service.') 15 | parser.add_argument('--combine_path', required=True, help='Prediction json file path.', type=pathlib.Path) 16 | parser.add_argument('--combine_name', default='all_predictions', required=False, help='Output file name.', type=str) 17 | args = parser.parse_args() 18 | 19 | return args 20 | 21 | 22 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
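# For example, json_prec_dump({'score': 0.123456789}) returns '{"score": 0.123457}':
# the payload is serialized, re-parsed with floats rounded to `prec` digits, then serialized again.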
23 | def json_prec_dump(data, prec=6): 24 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 25 | 26 | 27 | def main(): 28 | logging.basicConfig(level=logging.INFO) 29 | args = parse_args() 30 | 31 | all_predictions = defaultdict(list) 32 | for prediction_file in args.combine_path.iterdir(): 33 | if prediction_file.suffix != '.json': 34 | print(f'Ignoring file {prediction_file.name} by suffix.') 35 | continue 36 | prediction_data = json.loads(prediction_file.read_text()) 37 | all_predictions[prediction_data['dataset_name']].append(prediction_data) 38 | 39 | all_combine_predictions = [] 40 | 41 | KNOWN_AVERAGE_KEYS = ['num_trainable_params'] 42 | KNOWN_MERGE_KEYS = ['rnd_seeds', 'predictions'] 43 | KNOWN_DIFF_KEYS = KNOWN_AVERAGE_KEYS + KNOWN_MERGE_KEYS 44 | 45 | for ds, prediction_data in all_predictions.items(): 46 | prediction_keys = list(prediction_data[0]) 47 | combined_dict = dict() 48 | for key in prediction_keys: 49 | values = [x[key] for x in prediction_data] 50 | if key not in KNOWN_DIFF_KEYS: 51 | assert all(x == values[0] for x in values) 52 | values = values[0] 53 | else: 54 | if key in KNOWN_MERGE_KEYS: 55 | values = list(itertools.chain.from_iterable(values)) 56 | elif key in KNOWN_AVERAGE_KEYS: 57 | values = np.asarray(values).mean() 58 | else: 59 | assert False 60 | combined_dict[key] = values 61 | all_combine_predictions.append(combined_dict) 62 | 63 | all_predictions = {"data": all_combine_predictions} 64 | all_predictions = json_prec_dump(all_predictions) 65 | save_path = args.combine_path / f'{args.combine_name}.zip' 66 | zf = zipfile.ZipFile(save_path, "w", zipfile.ZIP_DEFLATED) 67 | zf.writestr('all_predictions.json', all_predictions) 68 | zf.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/common/data_class_base.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import dataclasses 3 | 4 | 5 | class DataClassBase: 6 | def __post_init__(self): 7 | self.validate() 8 | 9 | @classmethod 10 | def from_dict(cls, data_content): 11 | c = {} 12 | for field in dataclasses.fields(cls): 13 | d_type = DataClassBase._get_dataclass_type(field.type) 14 | if field.name in data_content: 15 | c[field.name] = d_type.from_dict(data_content[field.name]) if d_type else data_content[field.name] 16 | 17 | assert len(data_content) == len(c), f"{data_content.keys()} vs {c.keys()}" 18 | return cls(**c) 19 | 20 | def to_dict(self, skip_default=True): 21 | result = {} 22 | for f in dataclasses.fields(self): 23 | value = getattr(self, f.name) 24 | if dataclasses.is_dataclass(value): 25 | value = value.to_dict() 26 | elif isinstance(value, (list, tuple)): 27 | value = type(value)(v.to_dict() if dataclasses.is_dataclass(v) else v for v in value) 28 | if not skip_default or value != f.default: 29 | result[f.name] = value 30 | return result 31 | 32 | def validate(self): 33 | # Check the field types. 
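        # e.g. a field annotated as Optional[int] passes with an int or None; any other
        # value type raises the TypeError below.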
34 | for field in dataclasses.fields(self): 35 | if hasattr(field.type, '__origin__') and field.type.__origin__ in (tuple, collections.abc.Sequence): 36 | expected_types = field.type.__origin__ 37 | elif hasattr(field.type, '__args__'): 38 | # Optional[].__args__ is (, NoneType) 39 | expected_types = field.type.__args__ 40 | else: 41 | expected_types = field.type 42 | 43 | if not isinstance(self.__dict__[field.name], expected_types): 44 | raise TypeError(f"Unexpected field type for {field.name}: Expected: {expected_types}. Actual: {type(self.__dict__[field.name])}") 45 | 46 | def _raise_value_error(self, config_name, msg=None): 47 | error_msg = f"Invalid {config_name}: {getattr(self, config_name)}." 48 | if msg: 49 | error_msg += ' ' + msg 50 | 51 | raise ValueError(error_msg) 52 | 53 | def _check_value(self, value_name, checker): 54 | value = getattr(self, value_name) 55 | if not checker(value): 56 | raise ValueError(f"Invalid {value_name}: {value}.") 57 | 58 | def _get_dataclass_type(field_type): 59 | """Returns dataclass type if the given type is dataclass or Optional[dataclass].""" 60 | if dataclasses.is_dataclass(field_type): 61 | return field_type 62 | if hasattr(field_type, '__args__'): 63 | args = field_type.__args__ 64 | if len(args) == 2 and type(None) in args: 65 | return next((t for t in args if dataclasses.is_dataclass(t)), None) 66 | return None 67 | -------------------------------------------------------------------------------- /trainers/imagenet_templates.py: -------------------------------------------------------------------------------- 1 | # source: https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb 2 | 3 | IMAGENET_TEMPLATES = [ 4 | "a bad photo of a {}.", 5 | "a photo of many {}.", 6 | "a sculpture of a {}.", 7 | "a photo of the hard to see {}.", 8 | "a low resolution photo of the {}.", 9 | "a rendering of a {}.", 10 | "graffiti of a {}.", 11 | "a bad photo of the {}.", 12 | "a cropped photo of the {}.", 13 | "a tattoo of a {}.", 14 | "the embroidered {}.", 15 | "a photo of a hard to see {}.", 16 | "a bright photo of a {}.", 17 | "a photo of a clean {}.", 18 | "a photo of a dirty {}.", 19 | "a dark photo of the {}.", 20 | "a drawing of a {}.", 21 | "a photo of my {}.", 22 | "the plastic {}.", 23 | "a photo of the cool {}.", 24 | "a close-up photo of a {}.", 25 | "a black and white photo of the {}.", 26 | "a painting of the {}.", 27 | "a painting of a {}.", 28 | "a pixelated photo of the {}.", 29 | "a sculpture of the {}.", 30 | "a bright photo of the {}.", 31 | "a cropped photo of a {}.", 32 | "a plastic {}.", 33 | "a photo of the dirty {}.", 34 | "a jpeg corrupted photo of a {}.", 35 | "a blurry photo of the {}.", 36 | "a photo of the {}.", 37 | "a good photo of the {}.", 38 | "a rendering of the {}.", 39 | "a {} in a video game.", 40 | "a photo of one {}.", 41 | "a doodle of a {}.", 42 | "a close-up photo of the {}.", 43 | "a photo of a {}.", 44 | "the origami {}.", 45 | "the {} in a video game.", 46 | "a sketch of a {}.", 47 | "a doodle of the {}.", 48 | "a origami {}.", 49 | "a low resolution photo of a {}.", 50 | "the toy {}.", 51 | "a rendition of the {}.", 52 | "a photo of the clean {}.", 53 | "a photo of a large {}.", 54 | "a rendition of a {}.", 55 | "a photo of a nice {}.", 56 | "a photo of a weird {}.", 57 | "a blurry photo of a {}.", 58 | "a cartoon {}.", 59 | "art of a {}.", 60 | "a sketch of the {}.", 61 | "a embroidered {}.", 62 | "a pixelated photo of a {}.", 63 | "itap of the {}.", 64 | "a jpeg corrupted photo of 
the {}.", 65 | "a good photo of a {}.", 66 | "a plushie {}.", 67 | "a photo of the nice {}.", 68 | "a photo of the small {}.", 69 | "a photo of the weird {}.", 70 | "the cartoon {}.", 71 | "art of the {}.", 72 | "a drawing of the {}.", 73 | "a photo of the large {}.", 74 | "a black and white photo of a {}.", 75 | "the plushie {}.", 76 | "a dark photo of a {}.", 77 | "itap of a {}.", 78 | "graffiti of the {}.", 79 | "a toy {}.", 80 | "itap of my {}.", 81 | "a photo of a cool {}.", 82 | "a photo of a small {}.", 83 | "a tattoo of the {}.", 84 | ] 85 | 86 | IMAGENET_TEMPLATES_SELECT = [ 87 | "itap of a {}.", 88 | "a bad photo of the {}.", 89 | "a origami {}.", 90 | "a photo of the large {}.", 91 | "a {} in a video game.", 92 | "art of the {}.", 93 | "a photo of the small {}.", 94 | ] 95 | -------------------------------------------------------------------------------- /scripts/data.sh: -------------------------------------------------------------------------------- 1 | DATA=/shared/sheng/coop_data/ 2 | mkdir -p $DATA 3 | # DATA=/work/tianjun/few-shot-learning/prompt-moe/CoOp/data/ 4 | cd $DATA 5 | 6 | # pip install gdown 7 | 8 | mkdir -p caltech-101 9 | cd caltech-101 10 | # wget http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz 11 | wget https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip 12 | unzip caltech-101.zip 13 | mv caltech-101/101_ObjectCategories.tar.gz . 14 | gdown 1hyarUivQE36mY6jSomru6Fjd-JzwcCzN 15 | tar -xvf 101_ObjectCategories.tar.gz 16 | cd $DATA 17 | 18 | mkdir -p oxford_pets 19 | cd oxford_pets 20 | wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz 21 | wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz 22 | gdown 1501r8Ber4nNKvmlFVQZ8SeUHTcdTTEqs 23 | tar -xvf images.tar.gz 24 | tar -xvf annotations.tar.gz 25 | cd $DATA 26 | 27 | mkdir -p stanford_cars 28 | cd stanford_cars 29 | wget http://ai.stanford.edu/~jkrause/car196/cars_train.tgz 30 | wget http://ai.stanford.edu/~jkrause/car196/cars_test.tgz 31 | wget https://ai.stanford.edu/~jkrause/cars/car_devkit.tgz 32 | wget http://ai.stanford.edu/~jkrause/car196/cars_test_annos_withlabels.mat 33 | gdown 1ObCFbaAgVu0I-k_Au-gIUcefirdAuizT 34 | tar -xvf cars_train.tgz 35 | tar -xvf cars_test.tgz 36 | tar -xvf car_devkit.tgz 37 | cd $DATA 38 | 39 | mkdir -p oxford_flowers 40 | cd oxford_flowers 41 | wget https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz 42 | wget https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat 43 | gdown 1AkcxCXeK_RCGCEC_GvmWxjcjaNhu-at0 44 | gdown 1Pp0sRXzZFZq15zVOzKjKBu4A9i01nozT 45 | tar -xvf 102flowers.tgz 46 | cd $DATA 47 | 48 | 49 | wget http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz 50 | tar -xvf food-101.tar.gz 51 | cd food-101 52 | gdown 1QK0tGi096I0Ba6kggatX1ee6dJFIcEJl 53 | cd $DATA 54 | 55 | wget https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz 56 | tar -xvf fgvc-aircraft-2013b.tar.gz 57 | mv fgvc-aircraft-2013b/data fgvc_aircraft 58 | cd $DATA 59 | 60 | mkdir -p sun397 61 | cd sun397 62 | wget http://vision.princeton.edu/projects/2010/SUN/SUN397.tar.gz 63 | wget https://vision.princeton.edu/projects/2010/SUN/download/Partitions.zip 64 | gdown 1y2RD81BYuiyvebdN-JymPfyWYcd8_MUq 65 | tar -xvf SUN397.tar.gz 66 | unzip Partitions.zip 67 | cd $DATA 68 | 69 | 70 | wget https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz 71 | tar -xvf dtd-r1.0.1.tar.gz 72 | cd dtd 73 | gdown 1u3_QfB467jqHgNXC00UIzbLZRQCg2S7x 74 | cd $DATA 75 | 76 | 
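# The remaining blocks follow the same recipe: fetch the official archive, then pull one more
# file via gdown (these IDs appear to be the pre-computed split/metadata files the dataset
# loaders expect, e.g. split_zhou_EuroSAT.json for EuroSAT).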
mkdir -p eurosat 77 | cd eurosat 78 | wget http://madm.dfki.de/files/sentinel/EuroSAT.zip 79 | unzip EuroSAT.zip 80 | gdown 1Ip7yaCWFi0eaOFUGga0lUdVi_DDQth1o 81 | cd $DATA 82 | 83 | mkdir -p ucf101 84 | cd ucf101 85 | gdown 10Jqome3vtUA2keJkNanAiFpgbyC9Hc2O 86 | unzip UCF-101-midframes.zip 87 | gdown 1I0S0q91hJfsV9Gf4xDIjgDq4AqBNJb1y 88 | cd $DATA 89 | 90 | mkdir -p imagenetv2 91 | cd imagenetv2 92 | wget https://s3-us-west-2.amazonaws.com/imagenetv2public/imagenetv2-matched-frequency.tar.gz 93 | tar -xvf imagenetv2-matched-frequency.tar.gz 94 | gdown 1-61f_ol79pViBFDG_IDlUQSwoLcn2XXF -------------------------------------------------------------------------------- /datasets/fgvc_aircraft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | 9 | 10 | @DATASET_REGISTRY.register() 11 | class FGVCAircraft(DatasetBase): 12 | 13 | dataset_dir = "fgvc_aircraft" 14 | 15 | def __init__(self, cfg): 16 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 17 | self.dataset_dir = os.path.join(root, self.dataset_dir) 18 | self.image_dir = os.path.join(self.dataset_dir, "images") 19 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 20 | mkdir_if_missing(self.split_fewshot_dir) 21 | 22 | classnames = [] 23 | with open(os.path.join(self.dataset_dir, "variants.txt"), "r") as f: 24 | lines = f.readlines() 25 | for line in lines: 26 | classnames.append(line.strip()) 27 | cname2lab = {c: i for i, c in enumerate(classnames)} 28 | 29 | train = self.read_data(cname2lab, "images_variant_train.txt") 30 | val = self.read_data(cname2lab, "images_variant_val.txt") 31 | test = self.read_data(cname2lab, "images_variant_test.txt") 32 | 33 | num_shots = cfg.DATASET.NUM_SHOTS 34 | if num_shots >= 1: 35 | seed = cfg.SEED 36 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 37 | 38 | if os.path.exists(preprocessed): 39 | print(f"Loading preprocessed few-shot data from {preprocessed}") 40 | with open(preprocessed, "rb") as file: 41 | data = pickle.load(file) 42 | train, val = data["train"], data["val"] 43 | else: 44 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 45 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 46 | data = {"train": train, "val": val} 47 | print(f"Saving preprocessed few-shot data to {preprocessed}") 48 | with open(preprocessed, "wb") as file: 49 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 50 | 51 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 52 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 53 | 54 | super().__init__(train_x=train, val=val, test=test) 55 | 56 | def read_data(self, cname2lab, split_file): 57 | filepath = os.path.join(self.dataset_dir, split_file) 58 | items = [] 59 | 60 | with open(filepath, "r") as f: 61 | lines = f.readlines() 62 | for line in lines: 63 | line = line.strip().split(" ") 64 | imname = line[0] + ".jpg" 65 | classname = " ".join(line[1:]) 66 | impath = os.path.join(self.image_dir, imname) 67 | label = cname2lab[classname] 68 | item = Datum(impath=impath, label=label, classname=classname) 69 | items.append(item) 70 | 71 | return items 72 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/evaluation/dataset.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.utils.data 4 | from PIL import Image 5 | from torchvision import transforms 6 | 7 | 8 | class Voc2007Classification(torch.utils.data.Dataset): 9 | def __init__(self, data_root, image_set="train", transform=None): 10 | """ 11 | Pascal voc2007 training/validation data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 12 | test data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 13 | """ 14 | self.data_root = self._update_path(data_root, image_set) 15 | self.transform = transform 16 | self.labels = self._read_annotation(image_set) 17 | self.images = list(self.labels.keys()) 18 | 19 | @staticmethod 20 | def _update_path(data_root, image_set): 21 | if image_set == "train" or image_set == "val": 22 | data_root += "train/VOCdevkit/VOC2007" 23 | elif image_set == "test": 24 | data_root += "test/VOCdevkit 2/VOC2007" 25 | else: 26 | raise Exception("Incorrect image set!") 27 | return data_root 28 | 29 | def __getitem__(self, index): 30 | img_path = os.path.join(self.data_root, 'JPEGImages/' + self.images[index] + '.jpg') 31 | image = Image.open(img_path).convert("RGB") 32 | if self.transform is not None: 33 | image = self.transform(image) 34 | else: 35 | image = transforms.ToTensor()(image) 36 | label = self.labels[self.images[index]] 37 | label = torch.LongTensor(label) 38 | return image, label 39 | 40 | def __len__(self): 41 | return len(self.images) 42 | 43 | def _read_annotation(self, image_set="train"): 44 | """ 45 | Annotation interpolation, refer to: 46 | http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00093000000000000000 47 | """ 48 | object_categories = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 49 | 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 50 | 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 51 | annotation_folder = os.path.join(self.data_root, "ImageSets/Main/") 52 | files = [file_name for file_name in os.listdir(annotation_folder) if file_name.endswith("_" + image_set + ".txt")] 53 | labels_all = dict() 54 | for file_name in files: 55 | label_str = file_name.split("_")[0] 56 | label_int = object_categories.index(label_str) 57 | with open(annotation_folder + "/" + file_name, "r") as fread: 58 | for line in fread.readlines(): 59 | index = line[:6] 60 | if index not in labels_all.keys(): 61 | labels_all[index] = [0] * len(object_categories) 62 | flag = 1 63 | if line[7:9] and int(line[7:9]) != 1: 64 | flag = -1 65 | if flag == 1: 66 | labels_all[index][label_int] = 1 67 | return labels_all 68 | 69 | -------------------------------------------------------------------------------- /datasets/eurosat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | from .dtd import DescribableTextures as DTD 9 | 10 | NEW_CNAMES = { 11 | "AnnualCrop": "Annual Crop Land", 12 | "Forest": "Forest", 13 | "HerbaceousVegetation": "Herbaceous Vegetation Land", 14 | "Highway": "Highway or Road", 15 | "Industrial": "Industrial Buildings", 16 | "Pasture": "Pasture Land", 17 | "PermanentCrop": "Permanent Crop Land", 18 | "Residential": "Residential Buildings", 19 | "River": "River", 20 | "SeaLake": "Sea or Lake", 21 | } 22 | 
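# NEW_CNAMES maps the raw EuroSAT folder names to human-readable class names, e.g. images under
# "AnnualCrop" are exposed to the text prompts as "Annual Crop Land". (Note that update_classname()
# below refers to NEW_CLASSNAMES, which is undefined; it should use this NEW_CNAMES dict.)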
23 | 24 | @DATASET_REGISTRY.register() 25 | class EuroSAT(DatasetBase): 26 | 27 | dataset_dir = "eurosat" 28 | 29 | def __init__(self, cfg): 30 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 31 | self.dataset_dir = os.path.join(root, self.dataset_dir) 32 | self.image_dir = os.path.join(self.dataset_dir, "2750") 33 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_EuroSAT.json") 34 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 35 | mkdir_if_missing(self.split_fewshot_dir) 36 | 37 | if os.path.exists(self.split_path): 38 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 39 | else: 40 | train, val, test = DTD.read_and_split_data(self.image_dir, new_cnames=NEW_CNAMES) 41 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 42 | 43 | num_shots = cfg.DATASET.NUM_SHOTS 44 | if num_shots >= 1: 45 | seed = cfg.SEED 46 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 47 | 48 | if os.path.exists(preprocessed): 49 | print(f"Loading preprocessed few-shot data from {preprocessed}") 50 | with open(preprocessed, "rb") as file: 51 | data = pickle.load(file) 52 | train, val = data["train"], data["val"] 53 | else: 54 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 55 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 56 | data = {"train": train, "val": val} 57 | print(f"Saving preprocessed few-shot data to {preprocessed}") 58 | with open(preprocessed, "wb") as file: 59 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 60 | 61 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 62 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 63 | 64 | super().__init__(train_x=train, val=val, test=test) 65 | 66 | def update_classname(self, dataset_old): 67 | dataset_new = [] 68 | for item_old in dataset_old: 69 | cname_old = item_old.classname 70 | cname_new = NEW_CLASSNAMES[cname_old] 71 | item_new = Datum(impath=item_old.impath, label=item_old.label, classname=cname_new) 72 | dataset_new.append(item_new) 73 | return dataset_new 74 | -------------------------------------------------------------------------------- /scripts/mvlpt/env_mvlpt.yml: -------------------------------------------------------------------------------- 1 | name: mvlpt 2 | channels: 3 | - defaults 4 | dependencies: 5 | - conda-forge/linux-64::_libgcc_mutex==0.1=main 6 | - defaults/linux-64::ca-certificates==2022.07.19=h06a4308_0 7 | - defaults/linux-64::intel-openmp==2021.2.0=h06a4308_610 8 | - defaults/linux-64::libstdcxx-ng==9.3.0=hd4cf53a_17 9 | - pytorch/noarch::pytorch-mutex==1.0=cuda 10 | - defaults/linux-64::libgomp==9.3.0=h5101ec6_17 11 | - defaults/linux-64::mkl==2021.2.0=h06a4308_296 12 | - defaults/linux-64::_openmp_mutex==4.5=1_gnu 13 | - conda-forge/linux-64::blas==1.0=mkl 14 | - defaults/linux-64::libgcc-ng==9.3.0=h5101ec6_17 15 | - conda-forge/linux-64::bzip2==1.0.8=h7f98852_4 16 | - defaults/linux-64::cudatoolkit==11.3.1=h2bc3f7f_2 17 | - conda-forge/linux-64::gmp==6.2.1=h58526e2_0 18 | - defaults/linux-64::jpeg==9b=h024ee3a_2 19 | - conda-forge/linux-64::lame==3.100=h7f98852_1001 20 | - defaults/linux-64::libffi==3.2.1=hf484d3e_1007 21 | - conda-forge/linux-64::libiconv==1.16=h516909a_0 22 | - conda-forge/linux-64::libuv==1.41.0=h7f98852_0 23 | - conda-forge/linux-64::libwebp-base==1.2.0=h7f98852_2 24 | - conda-forge/linux-64::lz4-c==1.9.3=h9c3ff4c_0 25 | - defaults/linux-64::ncurses==6.2=he6710b0_1 26 | - 
conda-forge/linux-64::nettle==3.6=he412f7d_0 27 | - defaults/linux-64::ninja-base==1.10.2=hd09550d_5 28 | - defaults/linux-64::openssl==1.1.1q=h7f8727e_0 29 | - defaults/linux-64::xz==5.2.5=h7b6447c_0 30 | - defaults/linux-64::zlib==1.2.11=h7b6447c_3 31 | - conda-forge/linux-64::gnutls==3.6.13=h85f3911_1 32 | - defaults/linux-64::libedit==3.1.20210216=h27cfd23_1 33 | - conda-forge/linux-64::libpng==1.6.37=h21135ba_2 34 | - conda-forge/linux-64::openh264==2.1.1=h780b84a_0 35 | - defaults/linux-64::readline==7.0=h7b6447c_5 36 | - defaults/linux-64::tk==8.6.10=hbc83047_0 37 | - conda-forge/linux-64::zstd==1.4.9=ha95c52a_0 38 | - conda-forge/linux-64::freetype==2.10.4=h0708190_1 39 | - defaults/linux-64::libtiff==4.2.0=h85742a9_0 40 | - defaults/linux-64::sqlite==3.33.0=h62c20be_0 41 | - pytorch/linux-64::ffmpeg==4.3=hf484d3e_0 42 | - defaults/linux-64::lcms2==2.12=h3be6417_0 43 | - defaults/linux-64::python==3.8.0=h0371630_2 44 | - defaults/linux-64::certifi==2022.6.15=py38h06a4308_0 45 | - defaults/linux-64::ninja==1.10.2=h06a4308_5 46 | - conda-forge/noarch::olefile==0.46=pyh9f0ad1d_1 47 | - conda-forge/linux-64::python_abi==3.8=1_cp38 48 | - conda-forge/noarch::six==1.16.0=pyh6c4a22f_0 49 | - conda-forge/noarch::typing_extensions==4.3.0=pyha770c72_0 50 | - defaults/noarch::wheel==0.36.2=pyhd3eb1b0_0 51 | - conda-forge/linux-64::mkl-service==2.4.0=py38h497a2fe_0 52 | - defaults/linux-64::pillow==8.2.0=py38he98fc37_0 53 | - pytorch/linux-64::pytorch==1.10.0=py3.8_cuda11.3_cudnn8.2.0_0 54 | - defaults/linux-64::setuptools==52.0.0=py38h06a4308_0 55 | - defaults/linux-64::numpy-base==1.20.2=py38hfae3a4d_0 56 | - defaults/linux-64::pip==21.1.2=py38h06a4308_0 57 | - conda-forge/linux-64::mkl_random==1.2.2=py38h1abd341_0 58 | - defaults/linux-64::mkl_fft==1.3.0=py38h42c9631_2 59 | - defaults/linux-64::numpy==1.20.2=py38h2d18471_0 60 | - pytorch/linux-64::torchaudio==0.10.0=py38_cu113 61 | - pytorch/linux-64::torchvision==0.11.0=py38_cu113 62 | prefix: /home/sheng/anaconda3/envs/mvlpt 63 | 64 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_mt_coopdata_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=~/opensource/ckpt/ 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | root=//tmp/coop_data 14 | 15 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 16 | CFG=$2 # config file 17 | NCTX=$3 # number of context tokens 18 | SHOTS=$4 # number of shots (5, 20, 50) 19 | 20 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 21 | DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | # for SEED in 1 2 3 23 | # for SEED in 1 24 | for SEED in $5 25 | do 26 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 27 | # if [ -d "$DIR" ]; then 28 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 29 | # else 30 | if [ $TRAINER = "UPT" ]; then 31 | python3 train.py \ 32 | --root $root \ 33 | --seed ${SEED} \ 34 | --trainer MVLPT \ 35 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 36 | --output-dir ${DIR} \ 37 | --dataset ${DATASET} \ 38 | --shots ${SHOTS} \ 39 | --dataset-coop \ 40 | --multi-task \ 41 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 42 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 43 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 44 | TRAINER.MVLPT.COOP.CSC False \ 45 | TEST.NO_TEST False \ 46 | TEST.FINAL_MODEL "best_val" \ 47 | TRAINER.CUT_CONTEXTLEN True 48 | elif [ $TRAINER = "VPT" ]; then 49 | python3 train.py \ 50 | --root $root \ 51 | --seed ${SEED} \ 52 | --trainer MVLPT \ 53 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 54 | --output-dir ${DIR} \ 55 | --dataset ${DATASET} \ 56 | --shots ${SHOTS} \ 57 | --dataset-coop \ 58 | --multi-task \ 59 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 60 | TRAINER.MVLPT.COOP.N_CTX 0 \ 61 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 62 | TRAINER.MVLPT.COOP.CSC False \ 63 | TEST.NO_TEST False \ 64 | TEST.FINAL_MODEL "best_val" 65 | else 66 | python3 train.py \ 67 | --root $root \ 68 | --seed ${SEED} \ 69 | --trainer MVLPT \ 70 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 71 | --output-dir ${DIR} \ 72 | --dataset ${DATASET} \ 73 | --shots ${SHOTS} \ 74 | --dataset-coop \ 75 | --multi-task \ 76 | TRAINER.MVLPT.VPT.N_CTX 0 \ 77 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 78 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 79 | TRAINER.MVLPT.COOP.CSC False \ 80 | TEST.NO_TEST False \ 81 | TEST.FINAL_MODEL "best_val" \ 82 | TRAINER.CUT_CONTEXTLEN True 83 | fi 84 | done 85 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_single_coopdata_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=~/opensource/ckpt/ 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | root=//tmp/coop_data 14 | 15 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 16 | CFG=$2 # config file 17 | NCTX=$3 # number of context tokens 18 | SHOTS=$4 # number of shots (5, 20, 50) 19 | 20 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 21 | # DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | DATASET=$6 23 | MODEL_DIR="--model-dir ${output_dir}/${PRETRAIN_DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp/" 24 | # MODEL_DIR="" 25 | # for SEED in 1 2 3 26 | # for SEED in 1 27 | for SEED in $5 28 | do 29 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 30 | # if [ -d "$DIR" ]; then 31 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 32 | # else 33 | if [ $TRAINER = "UPT" ]; then 34 | python3 train.py \ 35 | --root $root \ 36 | --seed ${SEED} \ 37 | --trainer MVLPT \ 38 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 39 | --output-dir ${DIR} \ 40 | --dataset ${DATASET} \ 41 | --shots ${SHOTS} \ 42 | --dataset-coop \ 43 | ${MODEL_DIR} \ 44 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 45 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 46 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 47 | TRAINER.MVLPT.COOP.CSC False \ 48 | TEST.NO_TEST False \ 49 | TEST.FINAL_MODEL "best_val" \ 50 | TRAINER.CUT_CONTEXTLEN True 51 | elif [ $TRAINER = "VPT" ]; then 52 | python3 train.py \ 53 | --root $root \ 54 | --seed ${SEED} \ 55 | --trainer MVLPT \ 56 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 57 | --output-dir ${DIR} \ 58 | --dataset ${DATASET} \ 59 | --shots ${SHOTS} \ 60 | --dataset-coop \ 61 | ${MODEL_DIR} \ 62 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 63 | TRAINER.MVLPT.COOP.N_CTX 0 \ 64 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 65 | TRAINER.MVLPT.COOP.CSC False \ 66 | TEST.NO_TEST False \ 67 | TEST.FINAL_MODEL "best_val" 68 | else 69 | python3 train.py \ 70 | --root $root \ 71 | --seed ${SEED} \ 72 | --trainer MVLPT \ 73 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 74 | --output-dir ${DIR} \ 75 | --dataset ${DATASET} \ 76 | --shots ${SHOTS} \ 77 | --dataset-coop \ 78 | ${MODEL_DIR} \ 79 | TRAINER.MVLPT.VPT.N_CTX 0 \ 80 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 81 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 82 | TRAINER.MVLPT.COOP.CSC False \ 83 | TEST.NO_TEST False \ 84 | TEST.FINAL_MODEL "best_val" \ 85 | TRAINER.CUT_CONTEXTLEN True 86 | fi 87 | done 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multitask Prompt Learning for Vision-Language Models 2 | 3 | This repo contains the codebase of a series of research projects focused on adapting vision-language models like [CLIP](https://arxiv.org/abs/2103.00020) to downstream datasets via *multitask prompt learning*: 4 | 5 | * [Multitask Vision-Language Prompt Tuning](https://arxiv.org/pdf/2211.11720.pdf) 6 | 7 |
                                                (a) CoOp                         (b) VPT                         (c) UPT
14 |
15 | 16 | ## How to Install 17 | This code is built on top of the toolbox [Dassl.pytorch](https://github.com/KaiyangZhou/Dassl.pytorch) and [CoOp](https://github.com/KaiyangZhou/CoOp) so you need to install the [`dassl`](https://github.com/KaiyangZhou/Dassl.pytorch#installation) and [PyTorch](https://pytorch.org/) environment first. After that, run `pip install -r requirements.txt` under `MVLPT/` to install a few more packages required by [CLIP](https://github.com/openai/CLIP) (this should be done when `dassl` is activated). Then, you are ready to go. 18 | 19 | Follow [DATASETS.md](DATASETS.md) to install the datasets from [CoOp](https://github.com/KaiyangZhou/CoOp/tree/main/datasets) for multitask source prompt initialization or run the following script after install `gdown`. 20 | ```bash 21 | bash scripts/data.sh 22 | ``` 23 | 24 | Note that the dataset for target [ELEVATER](https://arxiv.org/pdf/2204.08790.pdf) benchmark will be downloaded automatically in `MVLPT/trainers/vision_benchmark/`. 25 | ## How to Run 26 | 27 | Click a paper below to see the detailed instructions on how to run the code to reproduce the results. 28 | 29 | * [Multitask Vision-Language Prompt Tuning](MVLPT.md) 30 | 31 | ## Models and Results 32 | 33 | - The pre-trained weights of MVLPT (MCoOp, MVPT, MUPT) on 11 tasks based on ViT-B/16 and ViT-B/32 can be downloaded altogether via this [link](https://drive.google.com/file/d/1YWVLsVcsTEP_z3ehIDgGpFTNalTG_1IE/view?usp=sharing). The weights can be used to reproduce the results in Table 1 of MVLPT's paper (i.e., the results on ImageNet and its four variants with domain shift). To load the weights and run the evaluation code, you will need to specify `--model-dir` and `--load-epoch` (see this [script](https://github.com/sIncerass/MVLPT/blob/main/scripts/mvlpt/main_single_elevater_cut.sh) for example). 34 | 35 |


38 | 39 | 40 | ## Citation 41 | If you use this code in your research, please kindly cite the following papers 42 | 43 | ```bash 44 | @article{shen2022mvlpt, 45 | title={Multitask Vision-Language Prompt Tuning}, 46 | author = {Shen, Sheng and Yang, Shijia and Zhang, Tianjun and Zhai, Bohan and Gonzalez, Joseph E. and Keutzer, Kurt and Darrell, Trevor}, 47 | journal={arXiv preprint arXiv:2211.11720}, 48 | year={2022} 49 | } 50 | ``` 51 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name=None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqend = [] 64 | 65 | max_length = context_length 66 | 67 | if (self.added_cls_token > 0): 68 | seqstart = self.get_sot_token_list() 69 | max_length = max_length - 1 70 | 71 | if (self.added_sep_token > 0): 72 | seqend = self.get_eot_token_list() 73 | max_length = max_length - 1 74 | 75 | tokens = self.tokenizer( 76 | texts, padding=padding, 77 | truncation=True, 78 | max_length=max_length 79 | )['input_ids'] 80 | 81 | for i in range(len(tokens)): 82 | tokens[i] = seqstart + tokens[i] + seqend 83 | 84 | if (self.gpt_special_case): 85 | for i in range(len(tokens)): 86 | tokens[i][-1] = self.get_eot_token() 87 | 88 | result = torch.Tensor(tokens).type(torch.LongTensor) 89 | 90 | return result 91 | 92 | def get_vocab_size(self): 93 | return self.tokenizer.vocab_size 94 | 95 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 96 | return self.tokenize(texts, context_length) 97 | -------------------------------------------------------------------------------- /datasets/stanford_cars.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from scipy.io import loadmat 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class StanfordCars(DatasetBase): 13 | 14 | dataset_dir = "stanford_cars" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_StanfordCars.json") 20 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 21 | mkdir_if_missing(self.split_fewshot_dir) 22 | 23 | if os.path.exists(self.split_path): 24 | train, val, test = OxfordPets.read_split(self.split_path, self.dataset_dir) 25 | else: 26 | trainval_file = os.path.join(self.dataset_dir, "devkit", "cars_train_annos.mat") 27 | test_file = os.path.join(self.dataset_dir, "cars_test_annos_withlabels.mat") 28 | meta_file = os.path.join(self.dataset_dir, "devkit", "cars_meta.mat") 29 | trainval = self.read_data("cars_train", trainval_file, meta_file) 30 | test = self.read_data("cars_test", test_file, meta_file) 31 | train, val = OxfordPets.split_trainval(trainval) 32 | OxfordPets.save_split(train, val, test, self.split_path, self.dataset_dir) 33 | 34 | num_shots = cfg.DATASET.NUM_SHOTS 35 | if num_shots >= 1: 36 | seed = cfg.SEED 37 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 38 | 39 | if os.path.exists(preprocessed): 40 | print(f"Loading preprocessed few-shot data from {preprocessed}") 41 | with open(preprocessed, "rb") as file: 42 | data = pickle.load(file) 43 | train, val = data["train"], data["val"] 44 | else: 45 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 46 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 47 | data = {"train": train, "val": val} 48 | print(f"Saving preprocessed few-shot data to {preprocessed}") 49 | with open(preprocessed, "wb") as file: 50 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 
51 | 52 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 53 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 54 | 55 | super().__init__(train_x=train, val=val, test=test) 56 | 57 | def read_data(self, image_dir, anno_file, meta_file): 58 | anno_file = loadmat(anno_file)["annotations"][0] 59 | meta_file = loadmat(meta_file)["class_names"][0] 60 | items = [] 61 | 62 | for i in range(len(anno_file)): 63 | imname = anno_file[i]["fname"][0] 64 | impath = os.path.join(self.dataset_dir, image_dir, imname) 65 | label = anno_file[i]["class"][0, 0] 66 | label = int(label) - 1 # convert to 0-based index 67 | classname = meta_file[label][0] 68 | names = classname.split(" ") 69 | year = names.pop(-1) 70 | names.insert(0, year) 71 | classname = " ".join(names) 72 | item = Datum(impath=impath, label=label, classname=classname) 73 | items.append(item) 74 | 75 | return items 76 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/prompt_engineering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def get_prompt_templates(): 6 | prompt_templates = [ 7 | '{}.', 8 | 'a photo of a {}.', 9 | 'a bad photo of a {}.', 10 | 'a photo of many {}.', 11 | 'a sculpture of a {}.', 12 | 'a photo of the hard to see {}.', 13 | 'a low resolution photo of the {}.', 14 | 'a rendering of a {}.', 15 | 'graffiti of a {}.', 16 | 'a bad photo of the {}.', 17 | 'a cropped photo of the {}.', 18 | 'a tattoo of a {}.', 19 | 'the embroidered {}.', 20 | 'a photo of a hard to see {}.', 21 | 'a bright photo of a {}.', 22 | 'a photo of a clean {}.', 23 | 'a photo of a dirty {}.', 24 | 'a dark photo of the {}.', 25 | 'a drawing of a {}.', 26 | 'a photo of my {}.', 27 | 'the plastic {}.', 28 | 'a photo of the cool {}.', 29 | 'a close-up photo of a {}.', 30 | 'a black and white photo of the {}.', 31 | 'a painting of the {}.', 32 | 'a painting of a {}.', 33 | 'a pixelated photo of the {}.', 34 | 'a sculpture of the {}.', 35 | 'a bright photo of the {}.', 36 | 'a cropped photo of a {}.', 37 | 'a plastic {}.', 38 | 'a photo of the dirty {}.', 39 | 'a jpeg corrupted photo of a {}.', 40 | 'a blurry photo of the {}.', 41 | 'a photo of the {}.', 42 | 'a good photo of the {}.', 43 | 'a rendering of the {}.', 44 | 'a {} in a video game.', 45 | 'a photo of one {}.', 46 | 'a doodle of a {}.', 47 | 'a close-up photo of the {}.', 48 | 'the origami {}.', 49 | 'the {} in a video game.', 50 | 'a sketch of a {}.', 51 | 'a doodle of the {}.', 52 | 'a origami {}.', 53 | 'a low resolution photo of a {}.', 54 | 'the toy {}.', 55 | 'a rendition of the {}.', 56 | 'a photo of the clean {}.', 57 | 'a photo of a large {}.', 58 | 'a rendition of a {}.', 59 | 'a photo of a nice {}.', 60 | 'a photo of a weird {}.', 61 | 'a blurry photo of a {}.', 62 | 'a cartoon {}.', 63 | 'art of a {}.', 64 | 'a sketch of the {}.', 65 | 'a embroidered {}.', 66 | 'a pixelated photo of a {}.', 67 | 'itap of the {}.', 68 | 'a jpeg corrupted photo of the {}.', 69 | 'a good photo of a {}.', 70 | 'a plushie {}.', 71 | 'a photo of the nice {}.', 72 | 'a photo of the small {}.', 73 | 'a photo of the weird {}.', 74 | 'the cartoon {}.', 75 | 'art of the {}.', 76 | 'a drawing of the {}.', 77 | 'a photo of the large {}.', 78 | 'a black and white photo of a {}.', 79 | 'the plushie {}.', 80 | 'a dark photo of a {}.', 81 | 'itap of a {}.', 82 | 'graffiti of the {}.', 83 | 'a toy {}.', 84 | 'itap of my 
{}.', 85 | 'a photo of a cool {}.', 86 | 'a photo of a small {}.', 87 | 'a tattoo of the {}.', 88 | ] 89 | return prompt_templates 90 | 91 | 92 | def prompt_engineering(classnames): 93 | prompt_templates = get_prompt_templates() 94 | temp_idx = np.random.randint(len(prompt_templates)) 95 | 96 | if isinstance(classnames, list): 97 | classname = random.choice(classnames) 98 | else: 99 | classname = classnames 100 | 101 | return prompt_templates[temp_idx].replace('{}', classname.replace(',', '').replace('+', ' ')) 102 | -------------------------------------------------------------------------------- /datasets/sun397.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 5 | from dassl.utils import mkdir_if_missing 6 | 7 | from .oxford_pets import OxfordPets 8 | 9 | 10 | @DATASET_REGISTRY.register() 11 | class SUN397(DatasetBase): 12 | 13 | dataset_dir = "sun397" 14 | 15 | def __init__(self, cfg): 16 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 17 | self.dataset_dir = os.path.join(root, self.dataset_dir) 18 | self.image_dir = os.path.join(self.dataset_dir, "SUN397") 19 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_SUN397.json") 20 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 21 | mkdir_if_missing(self.split_fewshot_dir) 22 | 23 | if os.path.exists(self.split_path): 24 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 25 | else: 26 | classnames = [] 27 | with open(os.path.join(self.dataset_dir, "ClassName.txt"), "r") as f: 28 | lines = f.readlines() 29 | for line in lines: 30 | line = line.strip()[1:] # remove / 31 | classnames.append(line) 32 | cname2lab = {c: i for i, c in enumerate(classnames)} 33 | trainval = self.read_data(cname2lab, "Training_01.txt") 34 | test = self.read_data(cname2lab, "Testing_01.txt") 35 | train, val = OxfordPets.split_trainval(trainval) 36 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 37 | 38 | num_shots = cfg.DATASET.NUM_SHOTS 39 | if num_shots >= 1: 40 | seed = cfg.SEED 41 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 42 | 43 | if os.path.exists(preprocessed): 44 | print(f"Loading preprocessed few-shot data from {preprocessed}") 45 | with open(preprocessed, "rb") as file: 46 | data = pickle.load(file) 47 | train, val = data["train"], data["val"] 48 | else: 49 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 50 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 51 | data = {"train": train, "val": val} 52 | print(f"Saving preprocessed few-shot data to {preprocessed}") 53 | with open(preprocessed, "wb") as file: 54 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 55 | 56 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 57 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 58 | 59 | super().__init__(train_x=train, val=val, test=test) 60 | 61 | def read_data(self, cname2lab, text_file): 62 | text_file = os.path.join(self.dataset_dir, text_file) 63 | items = [] 64 | 65 | with open(text_file, "r") as f: 66 | lines = f.readlines() 67 | for line in lines: 68 | imname = line.strip()[1:] # remove / 69 | classname = os.path.dirname(imname) 70 | label = cname2lab[classname] 71 | impath = os.path.join(self.image_dir, imname) 72 | 73 | names = classname.split("/")[1:] # remove 1st letter 74 | names 
= names[::-1] # put words like indoor/outdoor at first 75 | classname = " ".join(names) 76 | 77 | item = Datum(impath=impath, label=label, classname=classname) 78 | items.append(item) 79 | 80 | return items 81 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/datasets/languages/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name = None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqtok = [] 64 | seqend = [] 65 | 66 | max_length = context_length 67 | 68 | if (self.added_cls_token > 0): 69 | seqstart = self.get_sot_token_list() 70 | max_length = max_length - 1 71 | 72 | if (self.added_sep_token > 0): 73 | seqend = self.get_eot_token_list() 74 | max_length = max_length - 1 75 | 76 | tokens = self.tokenizer( 77 | texts, padding=padding, 78 | truncation=True, 79 | max_length=max_length 80 | )['input_ids'] 81 | 82 | for i in range(len(tokens)): 83 | tokens[i] = seqstart + tokens[i] + seqend 84 | 85 | if (self.gpt_special_case): 86 | for i in range(len(tokens)): 87 | tokens[i][-1] = self.get_eot_token() 88 | 89 | #print(str(tokens)) 90 | 91 | result = torch.Tensor(tokens).type(torch.LongTensor) 92 | 93 | return result 94 | 95 | def get_vocab_size(self): 96 | return self.tokenizer.vocab_size 97 | 98 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 99 | return self.tokenize(texts, context_length) 100 | -------------------------------------------------------------------------------- /datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import re 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class UCF101(DatasetBase): 13 | 14 | dataset_dir = "ucf101" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "UCF-101-midframes") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_UCF101.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | cname2lab = {} 28 | filepath = os.path.join(self.dataset_dir, "ucfTrainTestlist/classInd.txt") 29 | with open(filepath, "r") as f: 30 | lines = f.readlines() 31 | for line in lines: 32 | label, classname = line.strip().split(" ") 33 | label = int(label) - 1 # conver to 0-based index 34 | cname2lab[classname] = label 35 | 36 | trainval = self.read_data(cname2lab, "ucfTrainTestlist/trainlist01.txt") 37 | test = self.read_data(cname2lab, "ucfTrainTestlist/testlist01.txt") 38 | train, val = OxfordPets.split_trainval(trainval) 39 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 40 | 41 | num_shots = cfg.DATASET.NUM_SHOTS 42 | if num_shots >= 1: 43 | seed = cfg.SEED 44 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 45 | 46 | if os.path.exists(preprocessed): 47 | print(f"Loading preprocessed few-shot data from {preprocessed}") 48 | with open(preprocessed, "rb") as file: 49 | data = pickle.load(file) 50 | train, val = data["train"], data["val"] 51 | else: 52 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 53 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 54 | data = {"train": train, "val": 
val} 55 | print(f"Saving preprocessed few-shot data to {preprocessed}") 56 | with open(preprocessed, "wb") as file: 57 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 58 | 59 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 60 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 61 | 62 | super().__init__(train_x=train, val=val, test=test) 63 | 64 | def read_data(self, cname2lab, text_file): 65 | text_file = os.path.join(self.dataset_dir, text_file) 66 | items = [] 67 | 68 | with open(text_file, "r") as f: 69 | lines = f.readlines() 70 | for line in lines: 71 | line = line.strip().split(" ")[0] # trainlist: filename, label 72 | action, filename = line.split("/") 73 | label = cname2lab[action] 74 | 75 | elements = re.findall("[A-Z][^A-Z]*", action) 76 | renamed_action = "_".join(elements) 77 | 78 | filename = filename.replace(".avi", ".jpg") 79 | impath = os.path.join(self.image_dir, renamed_action, filename) 80 | 81 | item = Datum(impath=impath, label=label, classname=renamed_action) 82 | items.append(item) 83 | 84 | return items 85 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_mt_elevater_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | output_dir=./CoCoOp_mt_20 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | # root=/tmp//coop_data 14 | root=/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/datasets 15 | 16 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 17 | CFG=$2 # config file 18 | NCTX=$3 # number of context tokens 19 | SHOTS=$4 # number of shots (5, 20, 50) 20 | 21 | # DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 22 | # DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 23 | DATASET="hateful-memes,cifar-10,mnist,oxford-flower-102,oxford-iiit-pets,resisc45_clip,country211,food-101,stanford-cars,caltech-101,dtd,voc-2007-classification,cifar-100,fgvc-aircraft-2013b-variants102,patch-camelyon,rendered-sst2,gtsrb,eurosat_clip,fer-2013,kitti-distance" 24 | for SEED in 1 2 3 25 | # for SEED in 1 26 | # for SEED in $5 27 | do 28 | DIR=$output_dir/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 29 | # if [ -d "$DIR" ]; then 30 | # echo "Oops! 
The results exist at ${DIR} (so skip this job)" 31 | # else 32 | if [ $TRAINER = "UPT" ]; then 33 | python3 train.py \ 34 | --root $root \ 35 | --seed ${SEED} \ 36 | --trainer MVLPT \ 37 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 38 | --output-dir ${DIR} \ 39 | --dataset ${DATASET} \ 40 | --shots ${SHOTS} \ 41 | --multi-task \ 42 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 43 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 44 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 45 | TRAINER.MVLPT.COOP.CSC False \ 46 | TEST.NO_TEST False \ 47 | TEST.FINAL_MODEL "best_val" \ 48 | TRAINER.CUT_CONTEXTLEN True 49 | elif [ $TRAINER = "VPT" ]; then 50 | python3 train.py \ 51 | --root $root \ 52 | --seed ${SEED} \ 53 | --trainer MVLPT \ 54 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 55 | --output-dir ${DIR} \ 56 | --dataset ${DATASET} \ 57 | --shots ${SHOTS} \ 58 | --multi-task \ 59 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 60 | TRAINER.MVLPT.COOP.N_CTX 0 \ 61 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 62 | TRAINER.MVLPT.COOP.CSC False \ 63 | TEST.NO_TEST False \ 64 | TEST.FINAL_MODEL "best_val" 65 | 66 | elif [ $TRAINER = "COCOOP" ]; then 67 | python3 train.py \ 68 | --root $root \ 69 | --seed ${SEED} \ 70 | --trainer MVLPT \ 71 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 72 | --output-dir ${DIR} \ 73 | --dataset ${DATASET} \ 74 | --shots ${SHOTS} \ 75 | --multi-task \ 76 | TRAINER.MVLPT.COCOOP.N_CTX ${NCTX} \ 77 | TEST.NO_TEST False \ 78 | TEST.FINAL_MODEL "best_val" 79 | else 80 | python3 train.py \ 81 | --root $root \ 82 | --seed ${SEED} \ 83 | --trainer MVLPT \ 84 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 85 | --output-dir ${DIR} \ 86 | --dataset ${DATASET} \ 87 | --shots ${SHOTS} \ 88 | --multi-task \ 89 | TRAINER.MVLPT.VPT.N_CTX 0 \ 90 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 91 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 92 | TRAINER.MVLPT.COOP.CSC False \ 93 | TEST.NO_TEST False \ 94 | TEST.FINAL_MODEL "best_val" \ 95 | TRAINER.CUT_CONTEXTLEN True 96 | fi 97 | done 98 | -------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "automobile", "def_wiki": "A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. 
A car or motorcar.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "deer", "def_wiki": "A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", "path_wn": ["deer", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "distinguished from Bovidae by the male's having solid deciduous antlers"}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "frog", "def_wiki": "A small tailless amphibian of the order Anura that typically hops.", "path_wn": ["frog", "amphibian", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "any of various tailless stout-bodied amphibians with long hind limbs for leaping; semiaquatic and terrestrial species"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "ship", "def_wiki": "A water-borne vessel generally larger than a boat.", "path_wn": ["ship", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vessel that carries passengers or freight"}, {"classname": "truck", "def_wiki": "A small wheel or roller, specifically the wheel of a gun carriage.", "path_wn": ["truck", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an automotive vehicle suitable for hauling"}] 
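Despite the `.tsv` extension, the knowledge file above is stored as a single JSON list with fields such as `classname`, `def_wiki`, `path_wn`, and `def_wn`. Below is a minimal loading sketch under that assumption; the relative path simply points at the file shown here.

```python
# Hedged sketch: assumes each *_knowledge.tsv holds one JSON list, as shown above.
import json

path = "trainers/vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv"
with open(path, "r", encoding="utf-8") as f:
    entries = json.load(f)

# Map each class name to its WordNet definition for quick lookup.
name2def = {entry["classname"]: entry["def_wn"] for entry in entries}
print(name2def["airplane"])
```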
-------------------------------------------------------------------------------- /trainers/vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "gpt3": [" arable land", " arable land", " land used for growing crops that are harvested once a year", " Land that is used to grow crops for one year.", " arable land"]}, {"classname": "forest", "gpt3": [" a large area of land covered with trees", " land covered with trees", " a tract of land covered with trees and underbrush", " A large area of land covered with trees.", " A tract of land covered with trees and undergrowth, larger than woods."]}, {"classname": "brushland or shrubland", "gpt3": [" A land area covered with low-growing woody plants, such as bushes, small trees, and shrubs.", " Land that is covered mostly with shrubs.", " land covered with bushes, shrubs, and small trees.", " land covered with low, scrubby vegetation", " land covered with low, scrubby vegetation, especially thorny bushes."]}, {"classname": "highway or road", "gpt3": [" A way or course for the passage of vehicles, persons, and merchandise, usually including paved or graveled surface, curbs, and usually sidewalks.", " a way (usually public) for the transportation of people or goods", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard", " A way or path for the travel of people or vehicles."]}, {"classname": "industrial buildings or commercial buildings", "gpt3": [" The act or process by which something is built; construction.", " A building or group of buildings where goods are produced, processed or manufactured, or where services are provided.", " A building or group of buildings where goods are manufactured, processed or repaired, or goods and services are sold.", " The act or process by which something is built; construction.", " Buildings used for manufacturing or storing and selling goods."]}, {"classname": "pasture land", "gpt3": [" grassland used for grazing livestock", " Land used for grazing.", " land used for grazing animals", " Land used for grazing.", " Land used for grazing."]}, {"classname": "permanent crop land", "gpt3": [" land used for growing crops that are not harvested annually", " arable land", " Land that is used for growing crops that are not intended for sale or for feeding livestock.", " Land used for growing crops that are not intended for harvest in the current year.", " land used for growing crops that are not harvested annually, such as trees, shrubs, and vines."]}, {"classname": "residential buildings or homes or apartments", "gpt3": [" A complete domicile occupying only part of a building, especially one for rent; a flat.", " A building or group of buildings containing a number of homes.", " The act or process by which something is built; construction.", " buildings where people live", " A complete domicile occupying only part of a building, especially one for rent; a flat."]}, {"classname": "river", "gpt3": [" A large natural stream of water flowing in a channel to the sea, a lake, or another river.", " a natural stream of water larger than a creek", " A natural stream of water of considerable volume flowing in a definite course from higher to lower ground.", " A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", " A large natural stream of water (larger than a creek) flowing in a channel on the surface of the earth."]}, {"classname": "lake or sea", "gpt3": [" A large body of salt water.", " a large body of water surrounded by land", " A large body of water surrounded by land.", " A large body of water surrounded by land.", " A large body of water surrounded by land."]}] -------------------------------------------------------------------------------- /datasets/oxford_flowers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | from scipy.io import loadmat 5 | from collections import defaultdict 6 | 7 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 8 | from dassl.utils import read_json, mkdir_if_missing 9 | 10 | from .oxford_pets import OxfordPets 11 | 12 | 13 | @DATASET_REGISTRY.register() 14 | class OxfordFlowers(DatasetBase): 15 | 16 | dataset_dir = "oxford_flowers" 17 | 18 | def __init__(self, cfg): 19 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 20 | self.dataset_dir = os.path.join(root, self.dataset_dir) 21 | self.image_dir = os.path.join(self.dataset_dir, "jpg") 22 | self.label_file = os.path.join(self.dataset_dir, "imagelabels.mat") 23 | self.lab2cname_file = os.path.join(self.dataset_dir, "cat_to_name.json") 24 | self.split_path = os.path.join(self.dataset_dir, 
"split_zhou_OxfordFlowers.json") 25 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 26 | mkdir_if_missing(self.split_fewshot_dir) 27 | 28 | if os.path.exists(self.split_path): 29 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 30 | else: 31 | train, val, test = self.read_data() 32 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 33 | 34 | num_shots = cfg.DATASET.NUM_SHOTS 35 | if num_shots >= 1: 36 | seed = cfg.SEED 37 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 38 | 39 | if os.path.exists(preprocessed): 40 | print(f"Loading preprocessed few-shot data from {preprocessed}") 41 | with open(preprocessed, "rb") as file: 42 | data = pickle.load(file) 43 | train, val = data["train"], data["val"] 44 | else: 45 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 46 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 47 | data = {"train": train, "val": val} 48 | print(f"Saving preprocessed few-shot data to {preprocessed}") 49 | with open(preprocessed, "wb") as file: 50 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 51 | 52 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 53 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 54 | 55 | super().__init__(train_x=train, val=val, test=test) 56 | 57 | def read_data(self): 58 | tracker = defaultdict(list) 59 | label_file = loadmat(self.label_file)["labels"][0] 60 | for i, label in enumerate(label_file): 61 | imname = f"image_{str(i + 1).zfill(5)}.jpg" 62 | impath = os.path.join(self.image_dir, imname) 63 | label = int(label) 64 | tracker[label].append(impath) 65 | 66 | print("Splitting data into 50% train, 20% val, and 30% test") 67 | 68 | def _collate(ims, y, c): 69 | items = [] 70 | for im in ims: 71 | item = Datum(impath=im, label=y - 1, classname=c) # convert to 0-based label 72 | items.append(item) 73 | return items 74 | 75 | lab2cname = read_json(self.lab2cname_file) 76 | train, val, test = [], [], [] 77 | for label, impaths in tracker.items(): 78 | random.shuffle(impaths) 79 | n_total = len(impaths) 80 | n_train = round(n_total * 0.5) 81 | n_val = round(n_total * 0.2) 82 | n_test = n_total - n_train - n_val 83 | assert n_train > 0 and n_val > 0 and n_test > 0 84 | cname = lab2cname[str(label)] 85 | train.extend(_collate(impaths[:n_train], label, cname)) 86 | val.extend(_collate(impaths[n_train : n_train + n_val], label, cname)) 87 | test.extend(_collate(impaths[n_train + n_val :], label, cname)) 88 | 89 | return train, val, test 90 | -------------------------------------------------------------------------------- /trainers/zsclip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from dassl.engine import TRAINER_REGISTRY, TrainerX 5 | from dassl.optim import build_optimizer, build_lr_scheduler 6 | 7 | from clip import clip 8 | from clip.model import convert_weights 9 | 10 | from .coop import load_clip_to_cpu 11 | from .imagenet_templates import IMAGENET_TEMPLATES, IMAGENET_TEMPLATES_SELECT 12 | 13 | CUSTOM_TEMPLATES = { 14 | "OxfordPets": "a photo of a {}, a type of pet.", 15 | "OxfordFlowers": "a photo of a {}, a type of flower.", 16 | "FGVCAircraft": "a photo of a {}, a type of aircraft.", 17 | "DescribableTextures": "{} texture.", 18 | "EuroSAT": "a centered satellite photo of {}.", 19 | "StanfordCars": "a photo of a {}.", 20 | "Food101": "a photo of {}, a 
type of food.", 21 | "SUN397": "a photo of a {}.", 22 | "Caltech101": "a photo of a {}.", 23 | "UCF101": "a photo of a person doing {}.", 24 | "ImageNet": "a photo of a {}.", 25 | "ImageNetSketch": "a photo of a {}.", 26 | "ImageNetV2": "a photo of a {}.", 27 | "ImageNetA": "a photo of a {}.", 28 | "ImageNetR": "a photo of a {}.", 29 | } 30 | 31 | 32 | @TRAINER_REGISTRY.register() 33 | class ZeroshotCLIP(TrainerX): 34 | def build_model(self): 35 | cfg = self.cfg 36 | classnames = self.dm.dataset.classnames 37 | 38 | print(f"Loading CLIP (backbone: {cfg.MODEL.BACKBONE.NAME})") 39 | clip_model = load_clip_to_cpu(cfg) 40 | clip_model.to(self.device) 41 | 42 | temp = CUSTOM_TEMPLATES[cfg.DATASET.NAME] 43 | prompts = [temp.format(c.replace("_", " ")) for c in classnames] 44 | print(f"Prompts: {prompts}") 45 | prompts = torch.cat([clip.tokenize(p) for p in prompts]) 46 | prompts = prompts.to(self.device) 47 | 48 | with torch.no_grad(): 49 | text_features = clip_model.encode_text(prompts) 50 | text_features = text_features / text_features.norm(dim=-1, keepdim=True) 51 | 52 | self.text_features = text_features 53 | self.clip_model = clip_model 54 | 55 | def model_inference(self, image): 56 | image_features = self.clip_model.encode_image(image) 57 | image_features = image_features / image_features.norm(dim=-1, keepdim=True) 58 | logit_scale = self.clip_model.logit_scale.exp() 59 | logits = logit_scale * image_features @ self.text_features.t() 60 | return logits 61 | 62 | 63 | @TRAINER_REGISTRY.register() 64 | class ZeroshotCLIP2(ZeroshotCLIP): 65 | """Prompt ensembling.""" 66 | 67 | # templates = IMAGENET_TEMPLATES 68 | templates = IMAGENET_TEMPLATES_SELECT 69 | 70 | def build_model(self): 71 | cfg = self.cfg 72 | classnames = self.dm.dataset.classnames 73 | 74 | print(f"Loading CLIP (backbone: {cfg.MODEL.BACKBONE.NAME})") 75 | clip_model = load_clip_to_cpu(cfg) 76 | clip_model.to(self.device) 77 | 78 | for params in clip_model.parameters(): 79 | params.requires_grad_(False) 80 | 81 | # add custom-made prompt 82 | if cfg.DATASET.NAME != "ImageNet": 83 | self.templates += [CUSTOM_TEMPLATES[cfg.DATASET.NAME]] 84 | 85 | num_temp = len(self.templates) 86 | print(f"Prompt ensembling (n={num_temp})") 87 | 88 | mean_text_features = 0 89 | for i, temp in enumerate(self.templates): 90 | prompts = [temp.format(c.replace("_", " ")) for c in classnames] 91 | prompts = torch.cat([clip.tokenize(p) for p in prompts]).to(self.device) 92 | text_features = clip_model.encode_text(prompts) 93 | text_features = text_features / text_features.norm(dim=-1, keepdim=True) 94 | mean_text_features = mean_text_features + text_features 95 | mean_text_features = mean_text_features / num_temp 96 | mean_text_features = mean_text_features / mean_text_features.norm(dim=-1, keepdim=True) 97 | 98 | self.text_features = mean_text_features 99 | self.clip_model = clip_model 100 | -------------------------------------------------------------------------------- /scripts/mvlpt/main_single_elevater_cut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # custom config 4 | # DATA=/path/to/datasets 5 | #TRAINER=UPT 6 | #TRAINER=VPT 7 | # TRAINER=CoOp 8 | TRAINER=$1 9 | 10 | # output_dir=./CoCoOp_single_task_20 11 | #root=/shared/sheng/coop_data 12 | # root=/tmp/ic/ 13 | # root=//tmp/coop_data 14 | root=/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/datasets 15 | output_dir=./CoCoOp_single_task_20 16 | 17 | # DATASET=$1 # ['hateful-memes', 'cifar-10', 'mnist', 
'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 18 | CFG=$2 # config file 19 | NCTX=$3 # number of context tokens 20 | SHOTS=$4 # number of shots (5, 20, 50) 21 | 22 | # PRETRAIN_DATASET="Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 23 | PRETRAIN_DATASET="ImageNet,Caltech101,Food101,StanfordCars,OxfordPets,OxfordFlowers,FGVCAircraft,SUN397,DescribableTextures,EuroSAT,UCF101" 24 | # PRETRAIN_DATASET="hateful-memes,cifar-10,mnist,oxford-flower-102,oxford-iiit-pets,resisc45_clip,country211,food-101,stanford-cars,caltech-101,dtd,voc-2007-classification,cifar-100,patch-camelyon,rendered-sst2,gtsrb,eurosat_clip,fer-2013,kitti-distance" 25 | DATASET=$6 26 | MODEL_DIR="--model-dir ${output_dir}/${PRETRAIN_DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp/" 27 | # for SEED in 1 2 3 28 | # for SEED in 1 29 | for SEED in $5 30 | do 31 | DIR=$output_dir/${DATASET}/${TRAINER}/${CFG}_${SHOTS}shots/nctx${NCTX}_csc${CSC}_ctp${CTP}/seed${SEED} 32 | # if [ -d "$DIR" ]; then 33 | # echo "Oops! The results exist at ${DIR} (so skip this job)" 34 | # else 35 | if [ $TRAINER = "UPT" ]; then 36 | python3 train.py \ 37 | --root $root \ 38 | --seed ${SEED} \ 39 | --trainer MVLPT \ 40 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 41 | --output-dir ${DIR} \ 42 | --dataset ${DATASET} \ 43 | --shots ${SHOTS} \ 44 | ${MODEL_DIR} \ 45 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 46 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 47 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 48 | TRAINER.MVLPT.COOP.CSC False \ 49 | TEST.NO_TEST False \ 50 | TEST.FINAL_MODEL "best_val" \ 51 | TRAINER.CUT_CONTEXTLEN True 52 | elif [ $TRAINER = "VPT" ]; then 53 | python3 train.py \ 54 | --root $root \ 55 | --seed ${SEED} \ 56 | --trainer MVLPT \ 57 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 58 | --output-dir ${DIR} \ 59 | --dataset ${DATASET} \ 60 | --shots ${SHOTS} \ 61 | ${MODEL_DIR} \ 62 | TRAINER.MVLPT.VPT.N_CTX ${NCTX} \ 63 | TRAINER.MVLPT.COOP.N_CTX 0 \ 64 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 65 | TRAINER.MVLPT.COOP.CSC False \ 66 | TEST.NO_TEST False \ 67 | TEST.FINAL_MODEL "best_val" 68 | 69 | elif [ $TRAINER = "COCOOP" ]; then 70 | python3 train.py \ 71 | --root $root \ 72 | --seed ${SEED} \ 73 | --trainer MVLPT \ 74 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 75 | --output-dir ${DIR} \ 76 | --dataset ${DATASET} \ 77 | --shots ${SHOTS} \ 78 | TRAINER.MVLPT.COCOOP.N_CTX ${NCTX} \ 79 | TEST.NO_TEST False \ 80 | TEST.FINAL_MODEL "best_val" 81 | else 82 | python3 train.py \ 83 | --root $root \ 84 | --seed ${SEED} \ 85 | --trainer MVLPT \ 86 | --config-file configs/trainers/MVLPT/${CFG}.yaml \ 87 | --output-dir ${DIR} \ 88 | --dataset ${DATASET} \ 89 | --shots ${SHOTS} \ 90 | TRAINER.MVLPT.VPT.N_CTX 0 \ 91 | TRAINER.MVLPT.COOP.N_CTX ${NCTX} \ 92 | TRAINER.MVLPT.COOP.CLASS_TOKEN_POSITION 'middle' \ 93 | TRAINER.MVLPT.COOP.CSC False \ 94 | TEST.NO_TEST False \ 95 | TEST.FINAL_MODEL "best_val" \ 96 | TRAINER.CUT_CONTEXTLEN True 97 | fi 98 | done 99 | -------------------------------------------------------------------------------- /datasets/dtd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | 5 | from 
dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import listdir_nohidden, mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class DescribableTextures(DatasetBase): 13 | 14 | dataset_dir = "dtd" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = os.path.join(self.dataset_dir, "images") 20 | self.split_path = os.path.join(self.dataset_dir, "split_zhou_DescribableTextures.json") 21 | self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 22 | mkdir_if_missing(self.split_fewshot_dir) 23 | 24 | if os.path.exists(self.split_path): 25 | train, val, test = OxfordPets.read_split(self.split_path, self.image_dir) 26 | else: 27 | train, val, test = self.read_and_split_data(self.image_dir) 28 | OxfordPets.save_split(train, val, test, self.split_path, self.image_dir) 29 | 30 | num_shots = cfg.DATASET.NUM_SHOTS 31 | if num_shots >= 1: 32 | seed = cfg.SEED 33 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 34 | 35 | if os.path.exists(preprocessed): 36 | print(f"Loading preprocessed few-shot data from {preprocessed}") 37 | with open(preprocessed, "rb") as file: 38 | data = pickle.load(file) 39 | train, val = data["train"], data["val"] 40 | else: 41 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 42 | val = self.generate_fewshot_dataset(val, num_shots=min(num_shots, 4)) 43 | data = {"train": train, "val": val} 44 | print(f"Saving preprocessed few-shot data to {preprocessed}") 45 | with open(preprocessed, "wb") as file: 46 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 47 | 48 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 49 | train, val, test = OxfordPets.subsample_classes(train, val, test, subsample=subsample) 50 | 51 | super().__init__(train_x=train, val=val, test=test) 52 | 53 | @staticmethod 54 | def read_and_split_data(image_dir, p_trn=0.5, p_val=0.2, ignored=[], new_cnames=None): 55 | # The data are supposed to be organized into the following structure 56 | # ============= 57 | # images/ 58 | # dog/ 59 | # cat/ 60 | # horse/ 61 | # ============= 62 | categories = listdir_nohidden(image_dir) 63 | categories = [c for c in categories if c not in ignored] 64 | categories.sort() 65 | 66 | p_tst = 1 - p_trn - p_val 67 | print(f"Splitting into {p_trn:.0%} train, {p_val:.0%} val, and {p_tst:.0%} test") 68 | 69 | def _collate(ims, y, c): 70 | items = [] 71 | for im in ims: 72 | item = Datum(impath=im, label=y, classname=c) # is already 0-based 73 | items.append(item) 74 | return items 75 | 76 | train, val, test = [], [], [] 77 | for label, category in enumerate(categories): 78 | category_dir = os.path.join(image_dir, category) 79 | images = listdir_nohidden(category_dir) 80 | images = [os.path.join(category_dir, im) for im in images] 81 | random.shuffle(images) 82 | n_total = len(images) 83 | n_train = round(n_total * p_trn) 84 | n_val = round(n_total * p_val) 85 | n_test = n_total - n_train - n_val 86 | assert n_train > 0 and n_val > 0 and n_test > 0 87 | 88 | if new_cnames is not None and category in new_cnames: 89 | category = new_cnames[category] 90 | 91 | train.extend(_collate(images[:n_train], label, category)) 92 | val.extend(_collate(images[n_train : n_train + n_val], label, category)) 93 | test.extend(_collate(images[n_train + n_val :], label, category)) 94 | 95 | return train, val, test 
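The dataset wrappers in this repo, including `DescribableTextures` above, read only a few config keys (`DATASET.ROOT`, `DATASET.NUM_SHOTS`, `SEED`, `DATASET.SUBSAMPLE_CLASSES`). The sketch below instantiates one outside the full trainer, assuming a dassl-style yacs config; the root path is a placeholder and the `"all"` subsample value is an assumption to verify against the dassl/CoOp defaults.

```python
# Hedged sketch; a real run builds this config through dassl's train.py instead.
from yacs.config import CfgNode as CN

from datasets.dtd import DescribableTextures

cfg = CN()
cfg.SEED = 1
cfg.DATASET = CN()
cfg.DATASET.ROOT = "/path/to/datasets"   # parent folder of "dtd", "sun397", ...
cfg.DATASET.NUM_SHOTS = 16               # >= 1 triggers the few-shot cache seen above
cfg.DATASET.SUBSAMPLE_CLASSES = "all"    # assumed value; "base"/"new" subsample the classes

dataset = DescribableTextures(cfg)       # builds train/val/test lists of Datum objects
print(len(dataset.train_x), len(dataset.val), len(dataset.test))
```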
96 | -------------------------------------------------------------------------------- /scripts/read_record.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import glob 4 | out_name = 'coop_eval_baseline' 5 | # ckpt_folder='/tmp/select_5/CoOp/' 6 | # TRAINER="UPT" 7 | TRAINER="CoOp" 8 | # TRAINER="VPT" 9 | 10 | 11 | # SHOTS=20 12 | SHOTS=5 13 | # SHOTS=1 14 | CONFIG="vit_b16" 15 | NCTX=4 if TRAINER=="UPT" else 16 16 | # NCTX=4 17 | # eval_cat="IN1K_ADAPT" 18 | # eval_cat="COOP_ADAPT" 19 | # eval_cat="COOP_ADAPT_SEED" 20 | eval_cat="IN1KCOOP_ADAPT_A100" 21 | # eval_cat="IN1KCOOP_ADAPT_A100_SEED" 22 | # eval_cat="IN1KCOOP_ADAPT_ZEROSHOT" 23 | # eval_cat="IN1KCOOP_ADAPT_ZEROSHOT_SEED" 24 | 25 | # eval_cat="IN1K_ADAPT_ZERO_SHOT" 26 | # eval_cat="CLIP_ZEROSHOT" 27 | # eval_cat="EVAL_BEST" 28 | # eval_cat="COOP_ADAPT_ZEROSHOT" 29 | # eval_cat="COOP_ADAPT_ZEROSHOT_SEED" 30 | # eval_cat="COOP_ADAPT_A100" 31 | # eval_cat="COOP_ADAPT_A100_SEED" 32 | 33 | ckpt_folder=f'/tmp/outputs/COOP_ELEVATER/{TRAINER}/{eval_cat}/' 34 | ckpt_setting=f'/{CONFIG}_{SHOTS}shots/nctx{NCTX}_csc_ctp/' 35 | 36 | print(f'{ckpt_folder}/cifar-10/{ckpt_setting}') 37 | seeds = ["1", "2", "3"] 38 | # seeds = ["0"] 39 | if "ZERO" in eval_cat: 40 | accuracy_index = -1 41 | else: 42 | accuracy_index = -2 43 | # accuracy_index = -1 44 | # seeds = ["0"] 45 | # out_name = 'vpt_eval' 46 | # ckpt_folder='/tmp/select_5/UPT/' 47 | # ckpt_setting='vit_b16_20shots/nctx16_csc_ctp' 48 | COOP_ELEVATER_DATASET = ['hateful-memes', 'cifar-10', 'mnist', 'resisc45_clip', 'country211', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'fer-2013', 'kitti-distance'] 49 | 50 | def main(): 51 | # dataset = ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 52 | with open(f'./scripts/{out_name}.csv', 'w', encoding='UTF8') as f: 53 | writer = csv.writer(f) 54 | # dataset = ['hateful-memes', 'cifar-10', 'mnist', 'oxford-flower-102', 'oxford-iiit-pets', 'resisc45_clip', 'country211', 'food-101', 'stanford-cars', 'fgvc-aircraft-2013b-variants102', 'caltech-101', 'dtd', 'voc-2007-classification', 'cifar-100', 'patch-camelyon', 'rendered-sst2', 'gtsrb', 'eurosat_clip', 'fer-2013', 'kitti-distance'] 55 | dataset = COOP_ELEVATER_DATASET 56 | writer.writerow([" "]+dataset) 57 | missed = 0 58 | for seed in seeds: 59 | temp_row = [] 60 | temp_row.append(f"seed {seed}") 61 | 62 | for data1 in dataset: 63 | # temp_row.append(data1+" seed"+seed) 64 | 65 | # for data2 in dataset: 66 | #for seed in ["1", "2", "3"]: 67 | # with open("/rscratch/shijiayang/Prompt/new0/prompt-moe/CoOp/outputs/evaluation/"+data1+"_"+data2+"/CoOp/vit_b16_20shots/nctx16_cscFalse_ctpmiddle/seed"+seed+"/log.txt") as open_file: 68 | missed_ = True 69 | log_files = glob.glob(f"{ckpt_folder}/{data1}/{ckpt_setting}/seed{seed}/log.txt*") 70 | 71 | for log_file in log_files: 72 | with open(log_file) as open_file: 73 | # with open(f"{ckpt_folder}/{data1}/{ckpt_setting}/seed{seed}/log.txt") as open_file: 74 | lines = open_file.readlines() 75 | # assert "results" in lines[accuracy_index] 76 | number = re.findall('([+-]?[0-9]*\.[0-9]*)', lines[accuracy_index]) 77 | # print(number, lines[-1]) 78 | if "results" in lines[accuracy_index] 
and "test" in lines[accuracy_index-2]: 79 | try: 80 | temp_row.append(float(number[0])) 81 | missed_ = False 82 | break 83 | except Exception as e: 84 | # temp_row.append(" ") 85 | continue 86 | if missed_: 87 | temp_row.append(" ") 88 | missed += 1 89 | print("missed", data1, "seed", seed) 90 | # break 91 | writer.writerow(temp_row) 92 | print(f"okay we missed {missed} entries") 93 | 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /datasets/imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from collections import OrderedDict 4 | 5 | from dassl.data.datasets import DATASET_REGISTRY, Datum, DatasetBase 6 | from dassl.utils import listdir_nohidden, mkdir_if_missing 7 | 8 | from .oxford_pets import OxfordPets 9 | 10 | 11 | @DATASET_REGISTRY.register() 12 | class ImageNet(DatasetBase): 13 | 14 | dataset_dir = "imagenet" 15 | 16 | def __init__(self, cfg): 17 | root = os.path.abspath(os.path.expanduser(cfg.DATASET.ROOT)) 18 | self.dataset_dir = os.path.join(root, self.dataset_dir) 19 | self.image_dir = self.dataset_dir 20 | # self.image_dir = os.path.join(self.dataset_dir, "images") 21 | 22 | # self.preprocessed = os.path.join(self.dataset_dir, "preprocessed.pkl") 23 | # self.split_fewshot_dir = os.path.join(self.dataset_dir, "split_fewshot") 24 | 25 | self.preprocessed = os.path.join(self.dataset_dir.replace('group', 'sheng'), "preprocessed.pkl") 26 | self.split_fewshot_dir = os.path.join(self.dataset_dir.replace('group', 'sheng'), "split_fewshot") 27 | mkdir_if_missing(self.split_fewshot_dir) 28 | 29 | if os.path.exists(self.preprocessed): 30 | with open(self.preprocessed, "rb") as f: 31 | preprocessed = pickle.load(f) 32 | train = preprocessed["train"] 33 | test = preprocessed["test"] 34 | else: 35 | # text_file = os.path.join(self.dataset_dir, "classnames.txt") 36 | 37 | # HACK: hack for trevor's group machine dir's 38 | text_file = "./scripts/classnames.txt" 39 | classnames = self.read_classnames(text_file) 40 | train = self.read_data(classnames, "train") 41 | # Follow standard practice to perform evaluation on the val set 42 | # Also used as the val set (so evaluate the last-step model) 43 | test = self.read_data(classnames, "val") 44 | 45 | preprocessed = {"train": train, "test": test} 46 | with open(self.preprocessed, "wb") as f: 47 | pickle.dump(preprocessed, f, protocol=pickle.HIGHEST_PROTOCOL) 48 | 49 | num_shots = cfg.DATASET.NUM_SHOTS 50 | if num_shots >= 1: 51 | seed = cfg.SEED 52 | preprocessed = os.path.join(self.split_fewshot_dir, f"shot_{num_shots}-seed_{seed}.pkl") 53 | 54 | if os.path.exists(preprocessed): 55 | print(f"Loading preprocessed few-shot data from {preprocessed}") 56 | with open(preprocessed, "rb") as file: 57 | data = pickle.load(file) 58 | train = data["train"] 59 | else: 60 | train = self.generate_fewshot_dataset(train, num_shots=num_shots) 61 | data = {"train": train} 62 | print(f"Saving preprocessed few-shot data to {preprocessed}") 63 | with open(preprocessed, "wb") as file: 64 | pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL) 65 | 66 | subsample = cfg.DATASET.SUBSAMPLE_CLASSES 67 | train, test = OxfordPets.subsample_classes(train, test, subsample=subsample) 68 | 69 | super().__init__(train_x=train, val=test, test=test) 70 | 71 | @staticmethod 72 | def read_classnames(text_file): 73 | """Return a dictionary containing 74 | key-value pairs of : . 
75 | """ 76 | classnames = OrderedDict() 77 | with open(text_file, "r") as f: 78 | lines = f.readlines() 79 | for line in lines: 80 | line = line.strip().split(" ") 81 | folder = line[0] 82 | classname = " ".join(line[1:]) 83 | classnames[folder] = classname 84 | return classnames 85 | 86 | def read_data(self, classnames, split_dir): 87 | split_dir = os.path.join(self.image_dir, split_dir) 88 | folders = sorted(f.name for f in os.scandir(split_dir) if f.is_dir()) 89 | items = [] 90 | 91 | for label, folder in enumerate(folders): 92 | imnames = listdir_nohidden(os.path.join(split_dir, folder)) 93 | classname = classnames[folder] 94 | for imname in imnames: 95 | impath = os.path.join(split_dir, folder, imname) 96 | item = Datum(impath=impath, label=label, classname=classname) 97 | items.append(item) 98 | 99 | return items 100 | --------------------------------------------------------------------------------
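Returning to the `HFPTTokenizer` defined in `trainers/vision_benchmark/datasets/hfpt_tokenizer.py` above, here is a minimal usage sketch; the `gpt2` model name is only an example and requires the corresponding Hugging Face tokenizer files to be available locally or downloadable.

```python
# Hedged sketch of calling the HFPTTokenizer shown earlier in this dump.
from trainers.vision_benchmark.datasets.hfpt_tokenizer import HFPTTokenizer

tokenizer = HFPTTokenizer(pt_name="gpt2")  # 'gpt' in the name triggers the EOS-padding path
tokens = tokenizer(["a photo of a dog.", "a photo of a cat."], context_length=77)
print(tokens.shape)                        # expected: torch.Size([2, 77])
```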