├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── run.sh ├── run_gpt3.sh ├── run_multi.sh ├── setup.py ├── submission_file_readme.md ├── tox.ini └── vision_benchmark ├── __init__.py ├── commands ├── __init__.py ├── extract_gpt3_knowledge.py ├── finetune.py ├── linear_probe.py ├── prepare_submit.py └── zeroshot.py ├── common ├── __init__.py ├── constants.py ├── data_class_base.py ├── prediction_submission.py └── utils.py ├── config ├── __init__.py ├── default.py └── models.py ├── datasets ├── __init__.py ├── bpe_simple_vocab_16e6.txt.gz ├── hfpt_tokenizer.py ├── languages │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── build.py │ ├── hfpt_tokenizer.py │ ├── prompt_engineering.py │ └── simple_tokenizer.py ├── prompts.py └── simple_tokenizer.py ├── evaluation ├── __init__.py ├── clip_zeroshot_evaluator.py ├── dataset.py ├── feature.py ├── full_model_finetune.py └── metric.py ├── models ├── __init__.py ├── clip_example.py ├── clip_react.py ├── clip_swin.py ├── cls_example.py ├── cls_swin.py ├── declip.py ├── declip_model │ ├── __init__.py │ ├── clip.py │ ├── declip.py │ ├── defilip.py │ ├── filip.py │ ├── image_encoder │ │ ├── base_transformer.py │ │ └── visual_transformer.py │ ├── slip.py │ ├── text_encoder │ │ ├── base_transformer.py │ │ └── text_transformer.py │ └── utils │ │ ├── nnclr_modules │ │ ├── __init__.py │ │ ├── memory_bank.py │ │ ├── memory_bank_cuda.py │ │ └── nn_memory_bank.py │ │ └── text_utils │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── mask_tokens.py │ │ └── simple_tokenizer.py ├── mae.py └── mocov3.py ├── optim ├── __init__.py └── build.py ├── resources ├── datasets │ ├── caltech101.yaml │ ├── cifar10.yaml │ ├── cifar100.yaml │ ├── country211.yaml │ ├── dtd.yaml │ ├── eurosat-clip.yaml │ ├── fer2013.yaml │ ├── fgvc-aircraft-2013b.yaml │ ├── flower102.yaml │ ├── food101.yaml │ ├── gtsrb.yaml │ ├── hateful-memes.yaml │ ├── imagenet-1k.yaml │ ├── kitti-distance.yaml │ ├── mnist.yaml │ ├── oxford-iiit-pets.yaml │ ├── patchcamelyon.yaml │ ├── rendered-sst2.yaml │ ├── resisc45-clip.yaml │ ├── stanfordcar.yaml │ ├── vision_datasets.json │ └── voc2007classification.yaml ├── knowledge │ ├── external │ │ ├── caltech-101_knowledge.tsv │ │ ├── cifar-100_knowledge.tsv │ │ ├── cifar-10_knowledge.tsv │ │ ├── country211_knowledge.tsv │ │ ├── dtd_knowledge.tsv │ │ ├── eurosat_clip_knowledge.tsv │ │ ├── fer-2013_knowledge.tsv │ │ ├── fgvc-aircraft-2013b-variants102_knowledge.tsv │ │ ├── food-101_knowledge.tsv │ │ ├── gtsrb_knowledge.tsv │ │ ├── hateful-memes_knowledge.tsv │ │ ├── imagenet-1k_knowledge.tsv │ │ ├── kitti-distance_knowledge.tsv │ │ ├── mnist_knowledge.tsv │ │ ├── oxford-flower-102_knowledge.tsv │ │ ├── oxford-iiit-pets_knowledge.tsv │ │ ├── patch-camelyon_knowledge.tsv │ │ ├── rendered-sst2_knowledge.tsv │ │ ├── resisc45_clip_knowledge.tsv │ │ ├── stanford-cars_knowledge.tsv │ │ └── voc-2007-classification_knowledge.tsv │ └── gpt3 │ │ ├── GPT3_caltech-101.tsv │ │ ├── GPT3_cifar-10.tsv │ │ ├── GPT3_cifar-100.tsv │ │ ├── GPT3_country211.tsv │ │ ├── GPT3_dtd.tsv │ │ ├── GPT3_eurosat_clip.tsv │ │ ├── GPT3_fer-2013.tsv │ │ ├── GPT3_fgvc-aircraft-2013b-variants102.tsv │ │ ├── GPT3_food-101.tsv │ │ ├── GPT3_gtsrb.tsv │ │ ├── GPT3_hateful-memes.tsv │ │ ├── GPT3_imagenet-1k.tsv │ │ ├── GPT3_kitti-distance.tsv │ │ ├── GPT3_mnist.tsv │ │ ├── GPT3_oxford-flower-102.tsv │ │ ├── GPT3_oxford-iiit-pets.tsv │ │ ├── GPT3_patch-camelyon.tsv │ │ ├── GPT3_rendered-sst2.tsv │ │ ├── GPT3_resisc45_clip.tsv │ │ ├── GPT3_stanford-cars.tsv │ │ └── 
GPT3_voc-2007-classification.tsv └── model │ ├── clip_example.yaml │ ├── clip_swin_tiny.yaml │ ├── deit_base_patch16_224.yaml │ ├── example.yaml │ ├── mae_vitb16.yaml │ ├── mocov3_vitb16.yaml │ ├── react_vitG14_OpenCLIP.yaml │ ├── react_vitb16_CLIP.yaml │ ├── react_vitb32_CLIP.yaml │ ├── react_vitl14_CLIP.yaml │ ├── vit_base_patch16_224.yaml │ ├── vit_base_patch32_224.yaml │ ├── vitb16_CLIP.yaml │ ├── vitb32_CLIP.yaml │ ├── vitb32_DeCLIP.yaml │ ├── vitb32_DeCLIP_YFCC15M.yaml │ ├── vitb32_FILIP.yaml │ └── vitb32_SLIP.yaml └── utils ├── __init__.py ├── comm.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | .idea 3 | *.iml 4 | out 5 | gen 6 | 7 | ### Vim template 8 | [._]*.s[a-w][a-z] 9 | [._]s[a-w][a-z] 10 | *.un~ 11 | Session.vim 12 | .netrwhist 13 | *~ 14 | 15 | ### IPythonNotebook template 16 | # Temporary data 17 | .ipynb_checkpoints/ 18 | 19 | ### Python template 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | env/ 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | #lib/ 38 | #lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *,cover 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # *.ipynb 80 | *.params 81 | .vscode/ 82 | *.code-workspace/ 83 | 84 | lib/pycocotools/_mask.c 85 | lib/nms/cpu_nms.c 86 | 87 | OUTPUT 88 | OUTPUT/* 89 | models/* 90 | DATASET 91 | DATASET/* 92 | # external/ 93 | 94 | outputs 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Computer-Vision-in-the-Wild 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | yacs~=0.1.8 2 | scikit-learn 3 | timm~=0.4.12 4 | numpy~=1.21.0 5 | sharedmem 6 | git+https://github.com/openai/CLIP.git 7 | git+https://github.com/haotian-liu/CLIP_vlp.git 8 | torch~=1.7.0 9 | PyYAML~=5.4.1 10 | Pillow~=9.0.1 11 | torchvision~=0.8.0 12 | vision-evaluation==0.2.9 13 | vision-datasets==0.2.17 14 | tqdm~=4.62.3 15 | transformers~=4.11.3 16 | protobuf~=3.20.1 17 | ftfy~=6.1.1 18 | nltk~=3.7 19 | openai # to call gpt3 for knowledge extraction 20 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | ############## Configuration section begins ################## 2 | 3 | # Model Config: [vitb32_CLIP, vitb16_CLIP, mae_vitb16, mocov3_vitb16, vit_base_patch16_224, vit_base_patch32_224, deit_base_patch16_224] 4 | model_cfg=vitb32_CLIP 5 | 6 | # Mode: [linear_probe, finetune, zeroshot] 7 | mode=zeroshot 8 | 9 | # Use FP32 [default: True] 10 | use_fp32=True 11 | 12 | # Dataset: [caltech101] 13 | dataset=caltech101 14 | 15 | # Model checkpoint 16 | model_ckpt=. 17 | 18 | # output directory 19 | output_dir=./outputs 20 | 21 | ############ Configurations for hyperparameter tuning begin ############ 22 | # set to True to disable the automatic hyperparameter tuning 23 | # and set the learning rate and weight accordingly below 24 | # This option is only effective for linear probe and finetuning. 25 | 26 | disable_hyperparameter_tuning=False 27 | learning_rate=0.1 28 | l2_weight_decay=1e-6 29 | 30 | ############ Configurations for hyperparameter tuning end ############ 31 | 32 | ############ Configurations for linear_probe/finetune begin ############ 33 | 34 | # Random seed: [0,1,2] 35 | random_seed=0 36 | 37 | # Shots: {5, 20, 50} for few shot, and -1 for full-shot 38 | num_shots=5 39 | 40 | # Whether to init the linear head with the text encoder 41 | init_head_with_text_encoder=True 42 | 43 | # whether to merge the encoder and the linear head 44 | merge_encoder_and_proj=False 45 | 46 | ############ Configurations for linear_probe/finetune end ############ 47 | 48 | ############ Configurations for adding knowledge begin ############ 49 | # Please change the knowledge source accordingly. 50 | 51 | use_wordnet_hierachy=False 52 | use_wordnet_definition=False 53 | use_wiktionary_definition=False 54 | use_gpt3=False 55 | use_gpt3_count=0 56 | 57 | ############ Configurations for adding knowledge end ############ 58 | 59 | ############## Configuration section ends ################## 60 | 61 | 62 | # Launching the job...... 
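# Illustration (not executed): with the default settings above, the zeroshot branch
# below expands to roughly:
#   python commands/zeroshot.py --ds resources/datasets/caltech101.yaml \
#     --model resources/model/vitb32_CLIP.yaml MODEL.CLIP_FP32 True \
#     DATASET.ROOT ./outputs/datasets OUTPUT_DIR ./outputs/vitb32_CLIP/log ... TEST.MODEL_FILE .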
63 | 64 | cd vision_benchmark 65 | 66 | if [[ "$mode" = "linear_probe" ]]; then 67 | python commands/linear_probe.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.FREEZE_IMAGE_BACKBONE True TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 68 | elif [[ "$mode" = "finetune" ]]; then 69 | python commands/finetune.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 70 | 71 | elif [[ "$mode" = "zeroshot" ]]; then 72 | python commands/zeroshot.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml MODEL.CLIP_FP32 $use_fp32 DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 73 | else 74 | echo Unknown mode! Please check and set mode to one of {linear_probe, finetune, zeroshot}. 
75 | exit -1 76 | fi; -------------------------------------------------------------------------------- /run_gpt3.sh: -------------------------------------------------------------------------------- 1 | 2 | output_dir=/home/chunyl/project/OUTPUT_DIR/GPT3 # the path that the generated gpt3 knowledge is saved 3 | apikey=XXXX # Please use your GPT3 API key 4 | 5 | ds='cifar10' 6 | # ['eurosat-clip','country211','kitti-distance','oxford-iiit-pets','ping-attack-on-titan-plus','ping-whiskey-plus','rendered-sst2','resisc45-clip','voc2007classification','caltech101','cifar10','cifar100','dtd','fer2013','fgvc-aircraft-2013b','flower102','food101','gtsrb','hateful-memes','mnist','patchcamelyon','stanfordcar'] 7 | 8 | 9 | cd vision_benchmark 10 | 11 | 12 | python commands/extract_gpt3_knowledge.py --ds resources/datasets/$ds.yaml --apikey $apikey --n_shot 3 --n_ensemble 5 \ 13 | --target local DATASET.ROOT $output_dir/datasets/ds OUTPUT_DIR $output_dir/log 14 | 15 | 16 | 17 | # pip install openai 18 | # pip install nltk, spacy 19 | # python -m spacy download en -------------------------------------------------------------------------------- /run_multi.sh: -------------------------------------------------------------------------------- 1 | ############## Configuration section begins ################## 2 | 3 | # Model Config: [vitb32_CLIP, vitb16_CLIP, mae_vitb16, mocov3_vitb16, vit_base_patch16_224, vit_base_patch32_224, deit_base_patch16_224] 4 | model_cfg=vitb32_CLIP 5 | 6 | # Mode: [linear_probe, finetune, zeroshot] 7 | mode=zeroshot 8 | 9 | # Use FP32 [default: True] 10 | use_fp32=True 11 | 12 | # Dataset: [caltech101] 13 | dataset=$DATASET 14 | 15 | # Model checkpoint 16 | model_ckpt=. 17 | 18 | # output directory 19 | output_dir=$OUTPUT_DIR 20 | 21 | ############ Configurations for hyperparameter tuning begin ############ 22 | # set to True to disable the automatic hyperparameter tuning 23 | # and set the learning rate and weight accordingly below 24 | # This option is only effective for linear probe and finetuning. 25 | 26 | disable_hyperparameter_tuning=False 27 | learning_rate=0.1 28 | l2_weight_decay=1e-6 29 | 30 | ############ Configurations for hyperparameter tuning end ############ 31 | 32 | ############ Configurations for linear_probe/finetune begin ############ 33 | 34 | # Random seed: [0,1,2] 35 | random_seed=0 36 | 37 | # Shots: {5, 20, 50} for few shot, and -1 for full-shot 38 | num_shots=5 39 | 40 | # Whether to init the linear head with the text encoder 41 | init_head_with_text_encoder=True 42 | 43 | # whether to merge the encoder and the linear head 44 | merge_encoder_and_proj=False 45 | 46 | ############ Configurations for linear_probe/finetune end ############ 47 | 48 | ############ Configurations for adding knowledge begin ############ 49 | # Please change the knowledge source accordingly. 50 | 51 | use_wordnet_hierachy=False 52 | use_wordnet_definition=False 53 | use_wiktionary_definition=False 54 | use_gpt3=False 55 | use_gpt3_count=0 56 | 57 | ############ Configurations for adding knowledge end ############ 58 | 59 | ############## Configuration section ends ################## 60 | 61 | 62 | # Launching the job...... 
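# Usage sketch (assumed): unlike run.sh, this script reads the dataset and output
# directory from the environment ($DATASET, $OUTPUT_DIR), so an outer sweep could be:
#   for ds in caltech101 cifar10 cifar100; do
#     DATASET=$ds OUTPUT_DIR=./outputs bash run_multi.sh
#   done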
63 | 64 | cd vision_benchmark 65 | 66 | if [[ "$mode" = "linear_probe" ]]; then 67 | python commands/linear_probe.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.FREEZE_IMAGE_BACKBONE True TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 68 | elif [[ "$mode" = "finetune" ]]; then 69 | python commands/finetune.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 70 | 71 | elif [[ "$mode" = "zeroshot" ]]; then 72 | python commands/zeroshot.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml MODEL.CLIP_FP32 $use_fp32 DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 73 | else 74 | echo Unknown mode! Please check and set mode to one of {linear_probe, finetune, zeroshot}. 
75 | exit -1 76 | fi; -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | VERSION = '0.0.1' 4 | 5 | setuptools.setup(name='vision_benchmark', 6 | author='chunyl', 7 | author_email='chunyl@microsoft.com', 8 | version=VERSION, 9 | python_requires='>=3.6', 10 | packages=setuptools.find_packages(exclude=['test', 'test.*']), 11 | package_data={'': ['resources/*']}, 12 | install_requires=[ 13 | 'yacs~=0.1.8', 14 | 'scikit-learn', 15 | 'timm>=0.3.4', 16 | 'numpy>=1.18.0', 17 | 'sharedmem', 18 | 'torch>=1.7.0', 19 | 'PyYAML~=5.4.1', 20 | 'Pillow', 21 | 'torchvision>=0.8.0', 22 | 'vision-datasets>=0.2.0', 23 | 'vision-evaluation>=0.2.2', 24 | 'tqdm~=4.62.3', 25 | 'transformers~=4.11.3' 26 | ], 27 | entry_points={ 28 | 'console_scripts': [ 29 | 'vb_linear_probe=vision_benchmark.commands.linear_probe:main', 30 | 'vb_zero_shot_eval=vision_benchmark.commands.zeroshot_eval:main', 31 | 'vb_eval=vision_benchmark.commands.eval:main', 32 | 'vb_submit_to_leaderboard=vision_benchmark.commands.submit_predictions:main', 33 | 'vb_image_caption_eval=vision_benchmark.commands.image_caption_eval:main', 34 | ] 35 | }) 36 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git,build,dist,venv,.idea 3 | max-line-length = 200 4 | 5 | [pytest] 6 | junit_family = xunit2 -------------------------------------------------------------------------------- /vision_benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/commands/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/commands/finetune.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Probe with sklearn Logistic Regression or linear model. 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import logging 10 | 11 | import numpy as np 12 | import os 13 | import random 14 | 15 | from vision_datasets import DatasetTypes 16 | from vision_benchmark.common.constants import get_dataset_hub 17 | from vision_benchmark.utils import comm, create_logger 18 | from vision_benchmark.evaluation import construct_dataloader, full_model_finetune 19 | from vision_benchmark.config import config, update_config 20 | # These 2 lines are a walk-around for "Too many open files error". 
Refer: https://github.com/pytorch/pytorch/issues/11201 21 | import torch.multiprocessing 22 | from vision_benchmark.common.utils import log_arg_env_config, submit_predictions 23 | 24 | torch.multiprocessing.set_sharing_strategy('file_system') 25 | 26 | MULTILABEL_DATASETS = {"chestx-ray8"} 27 | 28 | 29 | def add_finetuning_args(parser): 30 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 31 | parser.add_argument('--model', required=True, help='Evaluation model configure file name', type=str) 32 | parser.add_argument('--submit-predictions', help='submit predictions and model info to leaderboard.', default=False, action='store_true') 33 | parser.add_argument('--submit-by', help='Person who submits the results.', type=str) 34 | parser.add_argument('--no-tuning', help='No hyperparameter-tuning.', default=False, type=lambda x:x.lower()=="true") 35 | parser.add_argument('--l2', help='(Inverse) L2 regularization strength. This option is only useful when option --no-tuning is True.', default=0.316, type=float) 36 | parser.add_argument('--lr', help='Test with a specific learning rate. This option is only useful when option --no-tuning is True.', default=0.001, type=float) 37 | parser.add_argument('--run', help='Run id', default=1, type=int) 38 | parser.add_argument('--fix_seed', help='Fix the random seed. [-1] not fixing the seeds', default=0, type=int) 39 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, action='store_true') 40 | 41 | parser.add_argument('opts', 42 | help="Modify config options using the command-line", 43 | default=None, 44 | nargs=argparse.REMAINDER) 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser(description='Test a classification model, with finetuning.') 49 | add_finetuning_args(parser) 50 | args = parser.parse_args() 51 | 52 | args.cfg = args.ds 53 | update_config(config, args) 54 | args.cfg = args.model 55 | update_config(config, args) 56 | config.defrost() 57 | config.NAME = '' 58 | config.freeze() 59 | 60 | if args.submit_predictions: 61 | assert args.submit_by 62 | 63 | if args.fix_seed != -1: 64 | random.seed(args.fix_seed) 65 | np.random.seed(args.fix_seed) 66 | torch.manual_seed(args.fix_seed) 67 | torch.cuda.manual_seed_all(args.fix_seed) 68 | 69 | n_samples = str(config.DATASET.NUM_SAMPLES_PER_CLASS) if config.DATASET.NUM_SAMPLES_PER_CLASS > 0 else 'full' 70 | exp_name = 'finetuning_' + n_samples 71 | if config.TRAIN.TWO_LR: exp_name += '_two_lr' 72 | final_output_dir = create_logger(config, exp_name) 73 | 74 | if config.DATASET.NUM_SAMPLES_PER_CLASS == 1: 75 | config.defrost() 76 | config.DATASET.NUM_SAMPLES_PER_CLASS = 2 77 | config.DATASET.MERGE_TRAIN_VAL_FINAL_RUN = False 78 | config.freeze() 79 | 80 | if comm.is_main_process(): 81 | log_arg_env_config(args, config, final_output_dir) 82 | 83 | if config.DATASET.DATASET == 'patch-camelyon' and config.DATASET.NUM_SAMPLES_PER_CLASS == -1: 84 | # deal with patch camelyon large dataset (search using 10000-shot subset, final run with the full dataset) 85 | logging.info(f'Detecting large dataset with {config.DATASET.NUM_SAMPLES_PER_CLASS}-shot.') 86 | config.defrost() 87 | config.DATASET.NUM_SAMPLES_PER_CLASS = 10000 88 | config.freeze() 89 | logging.info(f'Used the subset ({config.DATASET.NUM_SAMPLES_PER_CLASS}-shot) to train the model.') 90 | 91 | logging.info(f'{config.DATASET.DATASET} is a dataset.') 92 | train_dataloader, val_dataloader, test_dataloader = construct_dataloader(config) 93 | 94 
| # Run full model finetuning 95 | logging.info('Finetuning with full model. This may take several minutes to hours depending on the size of your data.') 96 | best_acc, model_info = full_model_finetune(train_dataloader, val_dataloader, test_dataloader, args.no_tuning, args.lr, args.l2, config) 97 | 98 | test_predictions = model_info['best_logits'] 99 | 100 | if args.save_predictions: 101 | import json 102 | 103 | # a hack to control the json dump float accuracy 104 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 105 | def json_prec_dump(data, prec=6): 106 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 107 | 108 | results_dict = { 109 | 'model_name': config.MODEL.NAME, 110 | 'dataset_name': config.DATASET.DATASET, 111 | 'num_trainable_params': model_info.get('n_trainable_params', None), 112 | 'num_params': model_info.get('n_params', None), 113 | 'num_visual_params': model_info.get('n_visual_params', None), 114 | 'num_backbone_params': model_info.get('n_backbone_params', None), 115 | 'n_shot': config.DATASET.NUM_SAMPLES_PER_CLASS, 116 | 'rnd_seeds': [config.DATASET.RANDOM_SEED_SAMPLING], 117 | 'predictions': [test_predictions.tolist()], 118 | } 119 | json_string = json_prec_dump(results_dict) 120 | 121 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 122 | os.makedirs(prediction_folder, exist_ok=True) 123 | with open(os.path.join(prediction_folder, f'seed{config.DATASET.RANDOM_SEED_SAMPLING}_{config.DATASET.DATASET}.json' ) , 'w') as outfile: 124 | outfile.write(json_string) 125 | 126 | if __name__ == '__main__': 127 | main() 128 | -------------------------------------------------------------------------------- /vision_benchmark/commands/linear_probe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Probe with sklearn Logistic Regression or linear model. 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import logging 10 | 11 | import numpy as np 12 | import random 13 | 14 | import os 15 | 16 | from vision_benchmark.utils import comm, create_logger 17 | from vision_benchmark.evaluation import construct_dataloader, full_model_finetune 18 | from vision_benchmark.config import config, update_config 19 | # These 2 lines are a walk-around for "Too many open files error". Refer: https://github.com/pytorch/pytorch/issues/11201 20 | import torch.multiprocessing 21 | from vision_benchmark.common.utils import log_arg_env_config, submit_predictions 22 | 23 | torch.multiprocessing.set_sharing_strategy('file_system') 24 | 25 | MULTILABEL_DATASETS = {"chestx-ray8"} 26 | 27 | 28 | def add_linear_probing_args(parser): 29 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 30 | parser.add_argument('--model', required=True, help='Evaluation model configure file name', type=str) 31 | parser.add_argument('--submit-predictions', help='submit predictions and model info to leaderboard.', default=False, action='store_true') 32 | parser.add_argument('--submit-by', help='Person who submits the results.', type=str) 33 | 34 | parser.add_argument('--no-tuning', help='No hyperparameter-tuning.', default=False, type=lambda x:x.lower()=="true") 35 | parser.add_argument('--l2', help='(Inverse) L2 regularization strength. 
This option is only useful when option --no-tuning is True.', default=0.316, type=float) 36 | parser.add_argument('--lr', help='Test with a specific learning rate. This option is only useful when option --no-tuning is True.', default=0.001, type=float) 37 | parser.add_argument('--run', help='Run id', default=1, type=int) 38 | parser.add_argument('--fix_seed', help='Fix the random seed. [-1] not fixing the seeds', default=0, type=int) 39 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, action='store_true') 40 | 41 | parser.add_argument('opts', 42 | help="Modify config options using the command-line", 43 | default=None, 44 | nargs=argparse.REMAINDER) 45 | 46 | def main(): 47 | parser = argparse.ArgumentParser(description='Test a classification model, with linear probing.') 48 | add_linear_probing_args(parser) 49 | args = parser.parse_args() 50 | 51 | args.cfg = args.ds 52 | update_config(config, args) 53 | args.cfg = args.model 54 | update_config(config, args) 55 | config.defrost() 56 | config.NAME = '' 57 | config.freeze() 58 | 59 | if args.submit_predictions: 60 | assert args.submit_by 61 | 62 | if args.fix_seed != -1: 63 | random.seed(args.fix_seed) 64 | np.random.seed(args.fix_seed) 65 | torch.manual_seed(args.fix_seed) 66 | torch.cuda.manual_seed_all(args.fix_seed) 67 | 68 | n_samples = str(config.DATASET.NUM_SAMPLES_PER_CLASS) if config.DATASET.NUM_SAMPLES_PER_CLASS >= 0 else 'full' 69 | exp_name = 'linear_probe_' + n_samples 70 | 71 | if config.DATASET.NUM_SAMPLES_PER_CLASS == 1: 72 | config.defrost() 73 | config.DATASET.NUM_SAMPLES_PER_CLASS = 2 74 | config.DATASET.MERGE_TRAIN_VAL_FINAL_RUN = False 75 | config.freeze() 76 | 77 | # Follow MAE's design choice: not using global pool in linear probe 78 | if config.MODEL.NAME.startswith('mae_'): 79 | config.defrost() 80 | config.MODEL.SPEC.GLOBAL_POOL = False 81 | config.freeze() 82 | 83 | final_output_dir = create_logger(config, exp_name) 84 | if comm.is_main_process(): 85 | log_arg_env_config(args, config, final_output_dir) 86 | 87 | if config.DATASET.DATASET == 'patch-camelyon' and config.DATASET.NUM_SAMPLES_PER_CLASS == -1: 88 | # deal with patch camelyon large dataset (search using 10000-shot subset, final run with the full dataset) 89 | logging.info(f'Detecting large dataset with {config.DATASET.NUM_SAMPLES_PER_CLASS}-shot.') 90 | config.defrost() 91 | config.DATASET.NUM_SAMPLES_PER_CLASS = 10000 92 | config.freeze() 93 | logging.info(f'Used the subset ({config.DATASET.NUM_SAMPLES_PER_CLASS}-shot) to train the model.') 94 | 95 | # Run linear probe 96 | train_dataloader, val_dataloader, test_dataloader = construct_dataloader(config) 97 | 98 | best_acc, model_info = full_model_finetune(train_dataloader, val_dataloader, test_dataloader, args.no_tuning, args.lr, args.l2, config) 99 | test_predictions = model_info['best_logits'] 100 | 101 | if args.save_predictions: 102 | import json 103 | 104 | # a hack to control the json dump float accuracy 105 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
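        # Illustration: json_prec_dump({'p': 0.123456789}) == '{"p": 0.123457}',
        # i.e. floats are re-serialized with `prec` decimal places to keep the prediction files small.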
106 | def json_prec_dump(data, prec=6): 107 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 108 | 109 | results_dict = { 110 | 'model_name': config.MODEL.NAME, 111 | 'dataset_name': config.DATASET.DATASET, 112 | 'num_trainable_params': model_info.get('n_trainable_params', None), 113 | 'num_params': model_info.get('n_params', None), 114 | 'num_visual_params': model_info.get('n_visual_params', None), 115 | 'num_backbone_params': model_info.get('n_backbone_params', None), 116 | 'n_shot': config.DATASET.NUM_SAMPLES_PER_CLASS, 117 | 'rnd_seeds': [config.DATASET.RANDOM_SEED_SAMPLING], 118 | 'predictions': [test_predictions.tolist()], 119 | } 120 | json_string = json_prec_dump(results_dict) 121 | 122 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 123 | os.makedirs(prediction_folder, exist_ok=True) 124 | with open(os.path.join(prediction_folder, f'seed{config.DATASET.RANDOM_SEED_SAMPLING}_{config.DATASET.DATASET}.json' ) , 'w') as outfile: 125 | outfile.write(json_string) 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /vision_benchmark/commands/prepare_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | submit predictions to leaderboard service 3 | """ 4 | import argparse 5 | from collections import defaultdict 6 | import json 7 | import logging 8 | import pathlib 9 | import zipfile 10 | import itertools 11 | import numpy as np 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Submit predictions to leaderboard service.') 15 | parser.add_argument('--combine_path', required=True, help='Prediction json file path.', type=pathlib.Path) 16 | parser.add_argument('--combine_name', default='all_predictions', required=False, help='Output file name.', type=str) 17 | args = parser.parse_args() 18 | 19 | return args 20 | 21 | 22 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
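# Usage sketch (paths hypothetical): combine the per-seed prediction JSONs written by
# linear_probe.py / finetune.py / zeroshot.py into a single submission archive:
#   python commands/prepare_submit.py --combine_path ./outputs/vitb32_CLIP/log/predictions/linear_probe_5 --combine_name all_predictions
# This writes <combine_path>/all_predictions.zip containing all_predictions.json.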
23 | def json_prec_dump(data, prec=6): 24 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 25 | 26 | 27 | def main(): 28 | logging.basicConfig(level=logging.INFO) 29 | args = parse_args() 30 | 31 | all_predictions = defaultdict(list) 32 | for prediction_file in args.combine_path.iterdir(): 33 | if prediction_file.suffix != '.json': 34 | print(f'Ignoring file {prediction_file.name} by suffix.') 35 | continue 36 | prediction_data = json.loads(prediction_file.read_text()) 37 | all_predictions[prediction_data['dataset_name']].append(prediction_data) 38 | 39 | all_combine_predictions = [] 40 | 41 | KNOWN_AVERAGE_KEYS = ['num_trainable_params'] 42 | KNOWN_MERGE_KEYS = ['rnd_seeds', 'predictions'] 43 | KNOWN_DIFF_KEYS = KNOWN_AVERAGE_KEYS + KNOWN_MERGE_KEYS 44 | 45 | for ds, prediction_data in all_predictions.items(): 46 | prediction_keys = list(prediction_data[0]) 47 | combined_dict = dict() 48 | for key in prediction_keys: 49 | values = [x[key] for x in prediction_data] 50 | if key not in KNOWN_DIFF_KEYS: 51 | assert all(x == values[0] for x in values) 52 | values = values[0] 53 | else: 54 | if key in KNOWN_MERGE_KEYS: 55 | values = list(itertools.chain.from_iterable(values)) 56 | elif key in KNOWN_AVERAGE_KEYS: 57 | values = np.asarray(values).mean() 58 | else: 59 | assert False 60 | combined_dict[key] = values 61 | all_combine_predictions.append(combined_dict) 62 | 63 | all_predictions = {"data": all_combine_predictions} 64 | all_predictions = json_prec_dump(all_predictions) 65 | save_path = args.combine_path / f'{args.combine_name}.zip' 66 | zf = zipfile.ZipFile(save_path, "w", zipfile.ZIP_DEFLATED) 67 | zf.writestr('all_predictions.json', all_predictions) 68 | zf.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /vision_benchmark/commands/zeroshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zero shot evaluation. 
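Image features are compared against text embeddings of the class prompts,
optionally enriched with WordNet / Wiktionary / GPT-3 knowledge.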
3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import os 9 | import argparse 10 | import logging 11 | 12 | import numpy as np 13 | 14 | from vision_benchmark.common.utils import log_arg_env_config 15 | from vision_benchmark.utils import comm, create_logger 16 | from vision_benchmark.datasets import SimpleTokenizer, HFPTTokenizer 17 | from vision_benchmark.evaluation import extract_features, extract_text_features, clip_zeroshot_evaluator 18 | from vision_benchmark.config import config, update_config 19 | 20 | 21 | def add_zero_shot_args(parser): 22 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 23 | parser.add_argument('--model', required=True, help='Clip model configure file name', type=str) 24 | parser.add_argument('--text_feature_only', help='consider text feature or not.', default=False, action='store_true') 25 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, type=lambda x: (str(x).lower() == 'true')) 26 | parser.add_argument('opts', 27 | help="Modify config options using the command-line", 28 | default=None, 29 | nargs=argparse.REMAINDER) 30 | 31 | def load_or_extract_features(args, cfg): 32 | if cfg.MODEL.SPEC.TEXT.TOKENIZER == 'clip': 33 | tokenizer = SimpleTokenizer() 34 | elif 'hf_' in cfg.MODEL.SPEC.TEXT.TOKENIZER: 35 | tokenizer = HFPTTokenizer(pt_name=cfg.MODEL.SPEC.TEXT.TOKENIZER[3:]) 36 | else: 37 | tokenizer = None 38 | 39 | # Load or extract image features. 40 | feature_file = os.path.join(cfg.DATASET.ROOT, 'zeroshot_features_' + cfg.MODEL.NAME.replace('/', '') + f'_wiki_{cfg.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}' + f'_gpt3_{cfg.KNOWLEDGE.GPT3.USE_GPT3}' + '.npy') 41 | logging.info(f'feature_file: {feature_file}') 42 | if os.path.exists(feature_file): 43 | logging.info('Loading features from existing files.') 44 | with open(feature_file, 'rb') as fread: 45 | image_features = np.load(fread) 46 | text_features = np.load(fread) 47 | image_labels = np.load(fread) 48 | else: 49 | image_features, image_labels = extract_features(cfg, test_split_only=True) 50 | text_features = extract_text_features(cfg, tokenizer, args) 51 | logging.info(f'Test size is {image_features.shape[0]}.') 52 | 53 | return image_features, text_features, image_labels 54 | 55 | def load_or_extract_text_features(args, cfg): 56 | if cfg.MODEL.SPEC.TEXT.TOKENIZER == 'clip': 57 | tokenizer = SimpleTokenizer() 58 | elif 'hf_' in cfg.MODEL.SPEC.TEXT.TOKENIZER: 59 | tokenizer = HFPTTokenizer(pt_name=cfg.MODEL.SPEC.TEXT.TOKENIZER[3:]) 60 | else: 61 | tokenizer = None 62 | 63 | # Load or extract image features. 
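    # Note: this branch caches text features only; the file name below encodes the model
    # name and the wiki/GPT-3 knowledge flags so features from different settings do not collide.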
64 | feature_file = os.path.join(cfg.DATASET.ROOT, 'zeroshot_text_features_' + cfg.MODEL.NAME.replace('/', '') + f'_wiki_{cfg.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}' + f'_gpt3_{cfg.KNOWLEDGE.GPT3.USE_GPT3}' + '.npy') 65 | logging.info(f'feature_file: {feature_file}') 66 | if os.path.exists(feature_file): 67 | logging.info('Loading features from existing files.') 68 | with open(feature_file, 'rb') as fread: 69 | text_features = np.load(fread) 70 | else: 71 | wiki_dict, gpt3_dict = extract_text_features(cfg, tokenizer, args) 72 | logging.info(f'Test size is {len(wiki_dict)}.') 73 | 74 | return wiki_dict, gpt3_dict 75 | 76 | def main(): 77 | parser = argparse.ArgumentParser(description='Zero-shot evaluation script.') 78 | add_zero_shot_args(parser) 79 | args = parser.parse_args() 80 | 81 | args.cfg = args.ds 82 | update_config(config, args) 83 | args.cfg = args.model 84 | update_config(config, args) 85 | config.defrost() 86 | config.NAME = "" 87 | config.freeze() 88 | 89 | exp_name = 'zeroshot_eval_' + f'wiki_{config.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}_wnh_{config.KNOWLEDGE.WORDNET.USE_HIERARCHY}_wnd_{config.KNOWLEDGE.WORDNET.USE_DEFINITION}_gpt3_{config.KNOWLEDGE.GPT3.USE_GPT3}' 90 | exp_name += f'agg_{config.KNOWLEDGE.AGGREGATION.MEHTOD}_gpt3count_{config.KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS}' 91 | final_output_dir = create_logger(config, exp_name) 92 | 93 | if comm.is_main_process(): 94 | log_arg_env_config(args, config, final_output_dir) 95 | 96 | if args.text_feature_only: 97 | wiki_dict, gpt3_dict = load_or_extract_text_features(args, config) 98 | 99 | else: 100 | image_features, text_features, image_labels = load_or_extract_features(args, config) 101 | result, test_predictions, metric = clip_zeroshot_evaluator(image_features, text_features, image_labels, config) 102 | msg = f'=> TEST: {metric} {100 * result:.3f}% ' 103 | logging.info(msg) 104 | 105 | if args.save_predictions: 106 | import json 107 | 108 | # a hack to control the json dump float accuracy 109 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
110 | def json_prec_dump(data, prec=6): 111 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 112 | 113 | results_dict = { 114 | 'model_name': f'CLIP-{config.MODEL.NAME}', 115 | 'dataset_name': config.DATASET.DATASET, 116 | 'num_trainable_params': 0, 117 | 'num_params': config.MODEL.STATS.get('n_params', None), 118 | 'num_visual_params': config.MODEL.STATS.get('n_visual_params', None), 119 | 'num_backbone_params': config.MODEL.STATS.get('n_backbone_params', None), 120 | 'n_shot': 0, 121 | 'rnd_seeds': [0], 122 | 'predictions': [test_predictions.cpu().data.numpy().tolist()], 123 | } 124 | json_string = json_prec_dump(results_dict) 125 | 126 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 127 | os.makedirs(prediction_folder, exist_ok=True) 128 | with open(os.path.join(prediction_folder, f'{config.DATASET.DATASET}.json' ) , 'w') as outfile: 129 | outfile.write(json_string) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /vision_benchmark/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/common/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/common/constants.py: -------------------------------------------------------------------------------- 1 | from vision_datasets import DatasetHub 2 | import pathlib 3 | 4 | VISION_DATASET_STORAGE = 'https://cvinthewildeus.blob.core.windows.net/datasets?sp=r&st=2023-08-28T01:41:20Z&se=3023-08-28T09:41:20Z&sv=2022-11-02&sr=c&sig=Msoq5dIl%2Fve6F01edGr8jgcZUt7rtsuJ896xvstSNfM%3D' 5 | 6 | 7 | def get_dataset_hub(): 8 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 'resources' / 'datasets' / 'vision_datasets.json').read_text() 9 | hub = DatasetHub(vision_dataset_json) 10 | 11 | return hub 12 | -------------------------------------------------------------------------------- /vision_benchmark/common/data_class_base.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import dataclasses 3 | 4 | 5 | class DataClassBase: 6 | def __post_init__(self): 7 | self.validate() 8 | 9 | @classmethod 10 | def from_dict(cls, data_content): 11 | c = {} 12 | for field in dataclasses.fields(cls): 13 | d_type = DataClassBase._get_dataclass_type(field.type) 14 | if field.name in data_content: 15 | c[field.name] = d_type.from_dict(data_content[field.name]) if d_type else data_content[field.name] 16 | 17 | assert len(data_content) == len(c), f"{data_content.keys()} vs {c.keys()}" 18 | return cls(**c) 19 | 20 | def to_dict(self, skip_default=True): 21 | result = {} 22 | for f in dataclasses.fields(self): 23 | value = getattr(self, f.name) 24 | if dataclasses.is_dataclass(value): 25 | value = value.to_dict() 26 | elif isinstance(value, (list, tuple)): 27 | value = type(value)(v.to_dict() if dataclasses.is_dataclass(v) else v for v in value) 28 | if not skip_default or value != f.default: 29 | result[f.name] = value 30 | return result 31 | 32 | def validate(self): 33 | # Check the field types. 
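        # Illustration: a plain annotation such as `dataset_name: str` is checked directly
        # with isinstance, while Optional[...] annotations are unpacked via __args__
        # (e.g. Optional[str] -> (str, NoneType)) before the isinstance check below.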
34 | for field in dataclasses.fields(self): 35 | if hasattr(field.type, '__origin__') and field.type.__origin__ in (tuple, collections.abc.Sequence): 36 | expected_types = field.type.__origin__ 37 | elif hasattr(field.type, '__args__'): 38 | # Optional[].__args__ is (, NoneType) 39 | expected_types = field.type.__args__ 40 | else: 41 | expected_types = field.type 42 | 43 | if not isinstance(self.__dict__[field.name], expected_types): 44 | raise TypeError(f"Unexpected field type for {field.name}: Expected: {expected_types}. Actual: {type(self.__dict__[field.name])}") 45 | 46 | def _raise_value_error(self, config_name, msg=None): 47 | error_msg = f"Invalid {config_name}: {getattr(self, config_name)}." 48 | if msg: 49 | error_msg += ' ' + msg 50 | 51 | raise ValueError(error_msg) 52 | 53 | def _check_value(self, value_name, checker): 54 | value = getattr(self, value_name) 55 | if not checker(value): 56 | raise ValueError(f"Invalid {value_name}: {value}.") 57 | 58 | def _get_dataclass_type(field_type): 59 | """Returns dataclass type if the given type is dataclass or Optional[dataclass].""" 60 | if dataclasses.is_dataclass(field_type): 61 | return field_type 62 | if hasattr(field_type, '__args__'): 63 | args = field_type.__args__ 64 | if len(args) == 2 and type(None) in args: 65 | return next((t for t in args if dataclasses.is_dataclass(t)), None) 66 | return None 67 | -------------------------------------------------------------------------------- /vision_benchmark/common/prediction_submission.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | import logging 4 | import math 5 | import pathlib 6 | from typing import List 7 | 8 | from .data_class_base import DataClassBase 9 | from .constants import VISION_DATASET_STORAGE 10 | from vision_datasets import DatasetTypes, DatasetHub, Usages, DatasetManifest 11 | 12 | 13 | class Tasks: 14 | IC_MULTILABEL = DatasetTypes.IC_MULTILABEL 15 | IC_MULTICLASS = DatasetTypes.IC_MULTICLASS 16 | OBJECT_DETECTION = DatasetTypes.OD 17 | 18 | VALID_TYPES = [IC_MULTILABEL, IC_MULTICLASS, OBJECT_DETECTION] 19 | 20 | @staticmethod 21 | def is_valid(task): 22 | return task in Tasks.VALID_TYPES 23 | 24 | 25 | class Tracks: 26 | LINEAR_PROBING = 'linear_probing' 27 | TRANSFER_LEARNING = 'transfer_learning' 28 | ZERO_SHOT = 'zero_shot' 29 | 30 | VALID_TYPES = [LINEAR_PROBING, TRANSFER_LEARNING, ZERO_SHOT] 31 | 32 | @staticmethod 33 | def is_valid(task, track): 34 | if track not in Tracks.VALID_TYPES: 35 | return False 36 | 37 | if task in [Tasks.IC_MULTICLASS, Tasks.IC_MULTILABEL]: 38 | return True 39 | 40 | if task == Tasks.OBJECT_DETECTION: 41 | return track != Tracks.LINEAR_PROBING 42 | 43 | return False 44 | 45 | 46 | @dataclasses.dataclass(frozen=True) 47 | class PredictionSubmission(DataClassBase): 48 | dataset_name: str 49 | model_name: str 50 | created_by: str 51 | task: str 52 | track: str 53 | predictions: List 54 | 55 | def validate(self): 56 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 'resources' / 'datasets' / 'vision_datasets.json').read_text() 57 | hub = DatasetHub(vision_dataset_json) 58 | dataset_names = set([x['name'] for x in hub.list_data_version_and_types()]) 59 | 60 | self._check_value('dataset_name', lambda x: x and x in dataset_names) 61 | self._check_value('model_name', lambda x: x) 62 | self._check_value('created_by', lambda x: x) 63 | self._check_value('task', lambda x: Tasks.is_valid(x)) 64 | self._check_value('track', lambda x: 
Tracks.is_valid(self.task, x)) 65 | self._check_value('predictions', lambda x: x) 66 | dataset_manifest = hub.create_dataset_manifest(VISION_DATASET_STORAGE, None, self.dataset_name, usage=Usages.TEST_PURPOSE)[0] 67 | logging.info(f'Created test set manifest for {self.dataset_name}') 68 | for fold_idx, predictions in enumerate(self.predictions): 69 | PredictionSubmission.validate_predictions(dataset_manifest, predictions, fold_idx) 70 | 71 | @staticmethod 72 | def validate_predictions(dataset_manifest: DatasetManifest, predictions, fold_idx): 73 | assert predictions, f'fold {fold_idx}, empty predictions.' 74 | assert len(predictions) == len(dataset_manifest.images), f'fold {fold_idx}, Number of predictions does not match number of images.' 75 | 76 | if dataset_manifest.data_type in [DatasetTypes.IC_MULTICLASS, DatasetTypes.IC_MULTILABEL]: 77 | for i, probs in enumerate(predictions): 78 | if dataset_manifest.data_type == DatasetTypes.IC_MULTICLASS: 79 | sum_probs = sum(probs) 80 | assert math.isclose(sum_probs, 1.0, rel_tol=1e-3), f'fold {fold_idx}, Sum of predicted prob vector for image {i}: {sum_probs}, should be 1.0.' 81 | 82 | assert all([0.0 <= prob <= 1.0 for prob in probs]), f'fold {fold_idx}, Predicted prob for image {i} not in [0, 1]: {probs}' 83 | 84 | if dataset_manifest.data_type == DatasetTypes.OD: 85 | # [[[class_index, conf, L, T, R, B], [class_index, conf, L, T, R, B], ..., []], [...], ..., [...]] 86 | for i, img_wise_bboxes in enumerate(predictions): 87 | for bbox_pred in img_wise_bboxes: 88 | assert PredictionSubmission.is_valid_box(bbox_pred, len(dataset_manifest.labelmap)), f'fold {fold_idx}, Invalid predicted bbox for image {i}: {bbox_pred}' 89 | 90 | @staticmethod 91 | def is_valid_box(bbox_pred, num_classes): 92 | return len(bbox_pred) == 6 and (0 <= bbox_pred[0] < num_classes) and (0.0 <= bbox_pred[1] <= 1.0) and all([x >= 0 for x in bbox_pred[2:]]) and (bbox_pred[2] <= bbox_pred[4]) \ 93 | and (bbox_pred[3] <= bbox_pred[5]) 94 | 95 | 96 | @dataclasses.dataclass(frozen=True) 97 | class ModelInfoSubmission(DataClassBase): 98 | name: str 99 | author: str 100 | num_params_in_millions: int 101 | pretrained_data: str 102 | creation_time: str 103 | 104 | def validate(self): 105 | self._check_value('name', lambda x: x) 106 | self._check_value('author', lambda x: x) 107 | self._check_value('num_params_in_millions', lambda x: x > 0) 108 | self._check_value('pretrained_data', lambda x: x) 109 | self._check_value('creation_time', lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')) 110 | -------------------------------------------------------------------------------- /vision_benchmark/common/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def log_arg_env_config(args, config, output_dir): 8 | logging.info("=> collecting env info (might take some time)") 9 | logging.info("\n" + get_pretty_env_info()) 10 | logging.info(pprint.pformat(args)) 11 | logging.info(config) 12 | logging.info(f'=> saving logging info into: {output_dir}') 13 | 14 | 15 | def submit_predictions(prediction_list, submit_by, config, track, task): 16 | from vision_benchmark.commands.submit_predictions import submit_predictions_to_leaderboard, submit_model_to_leaderboard 17 | 18 | submission = { 19 | 'dataset_name': config.DATASET.DATASET, 20 | 'model_name': config.MODEL.NAME, 21 | 'track': track, 22 | 'task': task, 23 | 'created_by': submit_by, 24 | 
'predictions': [prediction_list] 25 | } 26 | 27 | logging.info('Submit model and predictions to leaderboard.') 28 | submit_predictions_to_leaderboard(submission) 29 | 30 | model_info = { 31 | "name": config.MODEL.NAME, 32 | "author": config.MODEL.AUTHOR, 33 | "num_params_in_millions": config.MODEL.NUM_PARAMS_IN_M, 34 | "pretrained_data": config.MODEL.PRETRAINED_DATA, 35 | "creation_time": config.MODEL.CREATION_TIME 36 | } 37 | 38 | submit_model_to_leaderboard(model_info) 39 | -------------------------------------------------------------------------------- /vision_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .default import _C as config 2 | from .default import update_config 3 | from .models import MODEL_SPECS 4 | 5 | __all__ = ['config', 'update_config', 'MODEL_SPECS'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/config/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from yacs.config import CfgNode as CN 6 | 7 | # high_resoluton_net related params for classification 8 | HIGH_RESOLUTION_NET = CN() 9 | HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] 10 | HIGH_RESOLUTION_NET.STEM_INPLANES = 64 11 | HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 12 | HIGH_RESOLUTION_NET.WITH_HEAD = True 13 | 14 | HIGH_RESOLUTION_NET.STAGE2 = CN() 15 | HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 16 | HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 17 | HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] 18 | HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] 19 | HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' 20 | HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'CAT' 21 | 22 | HIGH_RESOLUTION_NET.STAGE3 = CN() 23 | HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 24 | HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 25 | HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] 26 | HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] 27 | HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' 28 | HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'CAT' 29 | 30 | HIGH_RESOLUTION_NET.STAGE4 = CN() 31 | HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 32 | HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 33 | HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] 34 | HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] 35 | HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' 36 | HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'CAT' 37 | 38 | RESNEXT = CN() 39 | RESNEXT.NUM_LAYERS = 50 40 | RESNEXT.BASE_WIDTH = 4 41 | RESNEXT.CARDINALITY = 32 42 | RESNEXT.KERNEL_SIZE_STEM = 7 43 | 44 | RESNET = CN() 45 | RESNET.NUM_LAYERS = 50 46 | RESNET.KERNEL_SIZE_STEM = 7 47 | 48 | 49 | MODEL_SPECS = { 50 | 'cls_hrnet': HIGH_RESOLUTION_NET, 51 | } 52 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompts import class_map, template_map 2 | from .simple_tokenizer import SimpleTokenizer 3 | from .hfpt_tokenizer import HFPTTokenizer 4 | 5 | __all__ = ['class_map', 'template_map', 'SimpleTokenizer', 'HFPTTokenizer'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/datasets/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name=None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
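    # Usage sketch (assumes a GPT-2 vocab is available through transformers):
    #   tok = HFPTTokenizer(pt_name='gpt2')
    #   ids = tok(['a photo of a dog.', 'a photo of a cat.'])  # -> LongTensor of shape [2, 77]
    # GPT-style vocabs have no pad/sep token, so the eos token is reused for both.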
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqend = [] 64 | 65 | max_length = context_length 66 | 67 | if (self.added_cls_token > 0): 68 | seqstart = self.get_sot_token_list() 69 | max_length = max_length - 1 70 | 71 | if (self.added_sep_token > 0): 72 | seqend = self.get_eot_token_list() 73 | max_length = max_length - 1 74 | 75 | tokens = self.tokenizer( 76 | texts, padding=padding, 77 | truncation=True, 78 | max_length=max_length 79 | )['input_ids'] 80 | 81 | for i in range(len(tokens)): 82 | tokens[i] = seqstart + tokens[i] + seqend 83 | 84 | if (self.gpt_special_case): 85 | for i in range(len(tokens)): 86 | tokens[i][-1] = self.get_eot_token() 87 | 88 | result = torch.Tensor(tokens).type(torch.LongTensor) 89 | 90 | return result 91 | 92 | def get_vocab_size(self): 93 | return self.tokenizer.vocab_size 94 | 95 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 96 | return self.tokenize(texts, context_length) 97 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from typing import Union, List 6 | 7 | from .simple_tokenizer import SimpleTokenizer 8 | from .hfpt_tokenizer import HFPTTokenizer 9 | 10 | from .build import build_tokenizer 11 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/build.py: -------------------------------------------------------------------------------- 1 | from .hfpt_tokenizer import HFPTTokenizer 2 | from .simple_tokenizer import SimpleTokenizer 3 | 4 | 5 | def build_tokenizer(tokenizer_name): 6 | tokenizer = None 7 | if tokenizer_name == 'clip': 8 | tokenizer = SimpleTokenizer() 9 | elif 'hf_' in tokenizer_name: 10 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 11 | elif 'hfc_' in tokenizer_name: 12 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 13 | else: 14 | raise ValueError('Unknown tokenizer') 15 | 16 | return tokenizer 17 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name = None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = 
AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqtok = [] 64 | seqend = [] 65 | 66 | max_length = context_length 67 | 68 | if (self.added_cls_token > 0): 69 | seqstart = self.get_sot_token_list() 70 | max_length = max_length - 1 71 | 72 | if (self.added_sep_token > 0): 73 | seqend = self.get_eot_token_list() 74 | max_length = max_length - 1 75 | 76 | tokens = self.tokenizer( 77 | texts, padding=padding, 78 | truncation=True, 79 | max_length=max_length 80 | )['input_ids'] 81 | 82 | for i in range(len(tokens)): 83 | tokens[i] = seqstart + tokens[i] + seqend 84 | 85 | if (self.gpt_special_case): 86 | for i in range(len(tokens)): 87 | tokens[i][-1] = self.get_eot_token() 88 | 89 | #print(str(tokens)) 90 | 91 | result = torch.Tensor(tokens).type(torch.LongTensor) 92 | 93 | return result 94 | 95 | def get_vocab_size(self): 96 | return self.tokenizer.vocab_size 97 | 98 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 99 | return self.tokenize(texts, context_length) 100 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/prompt_engineering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def get_prompt_templates(): 6 | prompt_templates = [ 7 | '{}.', 8 | 'a photo of a {}.', 9 | 'a bad photo of a {}.', 10 | 'a photo of many {}.', 11 | 'a sculpture of a {}.', 12 | 'a photo of the hard to see {}.', 13 | 'a low resolution photo of the {}.', 14 | 'a rendering of a {}.', 15 | 'graffiti of a {}.', 16 | 'a bad photo of the {}.', 17 | 'a cropped photo of the {}.', 18 | 'a tattoo of a {}.', 19 | 'the embroidered {}.', 20 | 'a photo of a hard to see {}.', 21 | 'a bright photo of a {}.', 22 | 'a photo of a clean {}.', 23 | 'a photo of a dirty {}.', 24 | 'a dark photo of the {}.', 25 | 'a drawing of a {}.', 26 | 'a photo of my {}.', 27 | 'the plastic {}.', 28 | 'a 
photo of the cool {}.', 29 | 'a close-up photo of a {}.', 30 | 'a black and white photo of the {}.', 31 | 'a painting of the {}.', 32 | 'a painting of a {}.', 33 | 'a pixelated photo of the {}.', 34 | 'a sculpture of the {}.', 35 | 'a bright photo of the {}.', 36 | 'a cropped photo of a {}.', 37 | 'a plastic {}.', 38 | 'a photo of the dirty {}.', 39 | 'a jpeg corrupted photo of a {}.', 40 | 'a blurry photo of the {}.', 41 | 'a photo of the {}.', 42 | 'a good photo of the {}.', 43 | 'a rendering of the {}.', 44 | 'a {} in a video game.', 45 | 'a photo of one {}.', 46 | 'a doodle of a {}.', 47 | 'a close-up photo of the {}.', 48 | 'the origami {}.', 49 | 'the {} in a video game.', 50 | 'a sketch of a {}.', 51 | 'a doodle of the {}.', 52 | 'a origami {}.', 53 | 'a low resolution photo of a {}.', 54 | 'the toy {}.', 55 | 'a rendition of the {}.', 56 | 'a photo of the clean {}.', 57 | 'a photo of a large {}.', 58 | 'a rendition of a {}.', 59 | 'a photo of a nice {}.', 60 | 'a photo of a weird {}.', 61 | 'a blurry photo of a {}.', 62 | 'a cartoon {}.', 63 | 'art of a {}.', 64 | 'a sketch of the {}.', 65 | 'a embroidered {}.', 66 | 'a pixelated photo of a {}.', 67 | 'itap of the {}.', 68 | 'a jpeg corrupted photo of the {}.', 69 | 'a good photo of a {}.', 70 | 'a plushie {}.', 71 | 'a photo of the nice {}.', 72 | 'a photo of the small {}.', 73 | 'a photo of the weird {}.', 74 | 'the cartoon {}.', 75 | 'art of the {}.', 76 | 'a drawing of the {}.', 77 | 'a photo of the large {}.', 78 | 'a black and white photo of a {}.', 79 | 'the plushie {}.', 80 | 'a dark photo of a {}.', 81 | 'itap of a {}.', 82 | 'graffiti of the {}.', 83 | 'a toy {}.', 84 | 'itap of my {}.', 85 | 'a photo of a cool {}.', 86 | 'a photo of a small {}.', 87 | 'a tattoo of the {}.', 88 | ] 89 | return prompt_templates 90 | 91 | 92 | def prompt_engineering(classnames): 93 | prompt_templates = get_prompt_templates() 94 | temp_idx = np.random.randint(len(prompt_templates)) 95 | 96 | if isinstance(classnames, list): 97 | classname = random.choice(classnames) 98 | else: 99 | classname = classnames 100 | 101 | return prompt_templates[temp_idx].replace('{}', classname.replace(',', '').replace('+', ' ')) 102 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | @lru_cache() 13 | def default_bpe(): 14 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 15 | 16 | 17 | @lru_cache() 18 | def bytes_to_unicode(): 19 | """ 20 | Returns list of utf-8 byte and a corresponding list of unicode strings. 21 | The reversible bpe codes work on unicode strings. 22 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 23 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 24 | This is a signficant percentage of your normal, say, 32K bpe vocab. 25 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 26 | And avoids mapping to whitespace/control characters the bpe code barfs on. 
27 | """ 28 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 29 | cs = bs[:] 30 | n = 0 31 | for b in range(2**8): 32 | if b not in bs: 33 | bs.append(b) 34 | cs.append(2**8+n) 35 | n += 1 36 | cs = [chr(n) for n in cs] 37 | return dict(zip(bs, cs)) 38 | 39 | 40 | def get_pairs(word): 41 | """Return set of symbol pairs in a word. 42 | Word is represented as tuple of symbols (symbols being variable-length strings). 43 | """ 44 | pairs = set() 45 | prev_char = word[0] 46 | for char in word[1:]: 47 | pairs.add((prev_char, char)) 48 | prev_char = char 49 | return pairs 50 | 51 | 52 | def basic_clean(text): 53 | text = ftfy.fix_text(text) 54 | text = html.unescape(html.unescape(text)) 55 | return text.strip() 56 | 57 | 58 | def whitespace_clean(text): 59 | text = re.sub(r'\s+', ' ', text) 60 | text = text.strip() 61 | return text 62 | 63 | 64 | class SimpleTokenizer(object): 65 | def __init__(self, bpe_path: str = default_bpe()): 66 | self.byte_encoder = bytes_to_unicode() 67 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 68 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 69 | merges = merges[1:49152-256-2+1] 70 | merges = [tuple(merge.split()) for merge in merges] 71 | vocab = list(bytes_to_unicode().values()) 72 | vocab = vocab + [v+'' for v in vocab] 73 | for merge in merges: 74 | vocab.append(''.join(merge)) 75 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 76 | self.encoder = dict(zip(vocab, range(len(vocab)))) 77 | self.decoder = {v: k for k, v in self.encoder.items()} 78 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 79 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 80 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 81 | 82 | def bpe(self, token): 83 | if token in self.cache: 84 | return self.cache[token] 85 | word = tuple(token[:-1]) + ( token[-1] + '',) 86 | pairs = get_pairs(word) 87 | 88 | if not pairs: 89 | return token+'' 90 | 91 | while True: 92 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 93 | if bigram not in self.bpe_ranks: 94 | break 95 | first, second = bigram 96 | new_word = [] 97 | i = 0 98 | while i < len(word): 99 | try: 100 | j = word.index(first, i) 101 | new_word.extend(word[i:j]) 102 | i = j 103 | except: 104 | new_word.extend(word[i:]) 105 | break 106 | 107 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 108 | new_word.append(first+second) 109 | i += 2 110 | else: 111 | new_word.append(word[i]) 112 | i += 1 113 | new_word = tuple(new_word) 114 | word = new_word 115 | if len(word) == 1: 116 | break 117 | else: 118 | pairs = get_pairs(word) 119 | word = ' '.join(word) 120 | self.cache[token] = word 121 | return word 122 | 123 | def encode(self, text): 124 | bpe_tokens = [] 125 | text = whitespace_clean(basic_clean(text)).lower() 126 | for token in re.findall(self.pat, text): 127 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 128 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 129 | return bpe_tokens 130 | 131 | def decode(self, tokens): 132 | text = ''.join([self.decoder[token] for token in tokens]) 133 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 134 | return text 135 | 136 | def get_vocab_size(self): 137 | return 49408 138 | 139 | def 
get_eot_token(self): 140 | return self.encoder["<|endoftext|>"] 141 | 142 | def get_sot_token(self): 143 | return self.encoder["<|startoftext|>"] 144 | 145 | def check_added_tokens(self): 146 | return 0 147 | 148 | def get_tokenizer_obj(self): 149 | return None 150 | 151 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 152 | if isinstance(texts, str): 153 | texts = [texts] 154 | 155 | sot_token = self.encoder["<|startoftext|>"] 156 | eot_token = self.encoder["<|endoftext|>"] 157 | all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts] 158 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 159 | 160 | for i, tokens in enumerate(all_tokens): 161 | if len(tokens) > context_length: 162 | tokens = tokens[:context_length] 163 | # raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 164 | 165 | result[i, :len(tokens)] = torch.tensor(tokens) 166 | 167 | return result 168 | 169 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 170 | return self.tokenize(texts, context_length) 171 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | 13 | @lru_cache() 14 | def default_bpe(): 15 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 
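Illustrative example: get_pairs(('h', 'e', 'l', 'l', 'o')) returns
{('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.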
44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + (token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except Exception: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | 137 | def get_vocab_size(self): 138 | return 49408 139 | 140 | def get_eot_token(self): 141 | return self.encoder["<|endoftext|>"] 142 | 143 | def get_sot_token(self): 144 | return self.encoder["<|startoftext|>"] 145 | 146 | def check_added_tokens(self): 147 | return 0 148 | 149 | def get_tokenizer_obj(self): 150 | return None 151 | 152 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 153 | if isinstance(texts, str): 154 | texts = [texts] 155 | 156 | sot_token = self.encoder["<|startoftext|>"] 157 | eot_token = 
self.encoder["<|endoftext|>"] 158 | all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts] 159 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 160 | 161 | for i, tokens in enumerate(all_tokens): 162 | if len(tokens) > context_length: 163 | tokens = tokens[:context_length] 164 | # raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 165 | 166 | result[i, :len(tokens)] = torch.tensor(tokens) 167 | 168 | return result 169 | 170 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 171 | return self.tokenize(texts, context_length) 172 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature import extract_features, extract_text_features, construct_dataloader 2 | from .full_model_finetune import full_model_finetune 3 | from .clip_zeroshot_evaluator import clip_zeroshot_evaluator 4 | 5 | __all__ = ['extract_features', 'linear_classifier', 'lr_classifier', 'extract_text_features', 'clip_zeroshot_evaluator', 'construct_dataloader', 'full_model_finetune', 'linear_classifier_contrast'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/clip_zeroshot_evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLIP zeroshot evaluation 3 | """ 4 | import torch 5 | import torch.nn.functional as F 6 | from .metric import get_metric 7 | 8 | 9 | def clip_zeroshot_evaluator(image_features, text_features, image_labels, config): 10 | metric = get_metric(config.TEST.METRIC) 11 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | image_features = torch.from_numpy(image_features).to(device) 13 | text_features = torch.from_numpy(text_features).to(device) 14 | image_labels = torch.from_numpy(image_labels).to(device) 15 | 16 | # Normalize image_features 17 | image_features = F.normalize(image_features) 18 | 19 | # Compute logits 20 | logits = (100. 
* image_features @ text_features).softmax(dim=-1) 21 | result = metric(image_labels.squeeze().cpu().detach().numpy(), logits.cpu().detach().numpy()) 22 | return result, logits, metric.__name__ 23 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.utils.data 4 | from PIL import Image 5 | from torchvision import transforms 6 | 7 | 8 | class Voc2007Classification(torch.utils.data.Dataset): 9 | def __init__(self, data_root, image_set="train", transform=None): 10 | """ 11 | Pascal voc2007 training/validation data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 12 | test data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 13 | """ 14 | self.data_root = self._update_path(data_root, image_set) 15 | self.transform = transform 16 | self.labels = self._read_annotation(image_set) 17 | self.images = list(self.labels.keys()) 18 | 19 | @staticmethod 20 | def _update_path(data_root, image_set): 21 | if image_set == "train" or image_set == "val": 22 | data_root += "train/VOCdevkit/VOC2007" 23 | elif image_set == "test": 24 | data_root += "test/VOCdevkit 2/VOC2007" 25 | else: 26 | raise Exception("Incorrect image set!") 27 | return data_root 28 | 29 | def __getitem__(self, index): 30 | img_path = os.path.join(self.data_root, 'JPEGImages/' + self.images[index] + '.jpg') 31 | image = Image.open(img_path).convert("RGB") 32 | if self.transform is not None: 33 | image = self.transform(image) 34 | else: 35 | image = transforms.ToTensor()(image) 36 | label = self.labels[self.images[index]] 37 | label = torch.LongTensor(label) 38 | return image, label 39 | 40 | def __len__(self): 41 | return len(self.images) 42 | 43 | def _read_annotation(self, image_set="train"): 44 | """ 45 | Annotation interpolation, refer to: 46 | http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00093000000000000000 47 | """ 48 | object_categories = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 49 | 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 50 | 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 51 | annotation_folder = os.path.join(self.data_root, "ImageSets/Main/") 52 | files = [file_name for file_name in os.listdir(annotation_folder) if file_name.endswith("_" + image_set + ".txt")] 53 | labels_all = dict() 54 | for file_name in files: 55 | label_str = file_name.split("_")[0] 56 | label_int = object_categories.index(label_str) 57 | with open(annotation_folder + "/" + file_name, "r") as fread: 58 | for line in fread.readlines(): 59 | index = line[:6] 60 | if index not in labels_all.keys(): 61 | labels_all[index] = [0] * len(object_categories) 62 | flag = 1 63 | if line[7:9] and int(line[7:9]) != 1: 64 | flag = -1 65 | if flag == 1: 66 | labels_all[index][label_int] = 1 67 | return labels_all 68 | 69 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/metric.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sklearn.metrics import roc_auc_score 4 | import vision_evaluation.evaluators as v_eval 5 | 6 | 7 | def accuracy(y_label, y_pred): 8 | """ Compute Top1 accuracy 9 | Args: 10 | y_label: the ground truth labels. Shape (N,) 11 | y_pred: the prediction of a model. 
Shape (N,) 12 | """ 13 | evaluator = v_eval.TopKAccuracyEvaluator(1) 14 | evaluator.add_predictions(predictions=y_pred, targets=y_label) 15 | return evaluator.get_report()['accuracy_top1'] 16 | 17 | 18 | def map_11_points(y_label, y_pred_proba): 19 | evaluator = v_eval.MeanAveragePrecisionNPointsEvaluator(11) 20 | evaluator.add_predictions(predictions=y_pred_proba, targets=y_label) 21 | return evaluator.get_report()[evaluator._get_id()] 22 | 23 | 24 | def balanced_accuracy_score(y_label, y_pred): 25 | evaluator = v_eval.BalancedAccuracyScoreEvaluator() 26 | evaluator.add_predictions(y_pred, y_label) 27 | return evaluator.get_report()[evaluator._get_id()] 28 | 29 | 30 | def roc_auc(y_true, y_score): 31 | if y_score.shape[1] == 2: 32 | return roc_auc_score(y_true, y_score[:, 1]) 33 | return roc_auc_score(y_true, y_score) 34 | 35 | 36 | def get_metric(metric_name): 37 | if metric_name == "accuracy": 38 | return accuracy 39 | if metric_name == "mean-per-class": 40 | return balanced_accuracy_score 41 | if metric_name == "11point_mAP": 42 | return map_11_points 43 | if metric_name == "roc_auc": 44 | return roc_auc 45 | 46 | logging.error("Undefined metric.") 47 | -------------------------------------------------------------------------------- /vision_benchmark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cls_example 2 | from . import clip_example 3 | from . import clip_react 4 | from . import cls_swin 5 | from . import clip_swin 6 | from . import mae 7 | from . import mocov3 8 | from . import declip 9 | 10 | __all__ = ['cls_example', 'clip_example', 'clip_react', 'cls_swin', 'clip_swin', 'mae', 'mocov3', 'declip'] 11 | -------------------------------------------------------------------------------- /vision_benchmark/models/clip_example.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Example(nn.Module): 5 | def encode_image(): 6 | """ 7 | This method is called to extract image features for evaluation. 8 | """ 9 | pass 10 | 11 | def encode_text(): 12 | """ 13 | This method is called to extract text features for evaluation. 14 | """ 15 | pass 16 | 17 | 18 | def get_zeroshot_model(config, **kwargs): 19 | """ 20 | Specify your model here 21 | """ 22 | model = Example() 23 | return model 24 | -------------------------------------------------------------------------------- /vision_benchmark/models/cls_example.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Example(nn.Module): 5 | def forward_features(): 6 | """ 7 | This method is called to extract features for evaluation. 
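It is expected to map a batch of images to a feature tensor
(assumed here to have shape (batch_size, feature_dim)).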
8 | """ 9 | pass 10 | 11 | 12 | def get_cls_model(config, **kwargs): 13 | """ 14 | Specify your model here 15 | """ 16 | model = Example() 17 | return model 18 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | 4 | from .declip_model import declip as _declip 5 | from .declip_model import slip as _slip 6 | from .declip_model import filip as _filip 7 | 8 | def get_model(config): 9 | if config.MODEL.NAME in ['filip_vitb32', 'defilip_vitb32']: 10 | model = _filip.filip_vitb32(**config.MODEL.SPEC.DECLIP) 11 | elif config.MODEL.NAME == 'slip_vitb32': 12 | model = _slip.slip_vitb32(**config.MODEL.SPEC.DECLIP) 13 | else: 14 | model = _declip.declip_clip_vitb32(**config.MODEL.SPEC.DECLIP) 15 | 16 | model_file = config.TEST.MODEL_FILE 17 | logging.info(f'=> load model file: {model_file}') 18 | 19 | if model_file.startswith('http'): 20 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 21 | else: 22 | checkpoint = torch.load(model_file, map_location="cpu") 23 | 24 | # rename moco pre-trained keys 25 | state_dict = checkpoint['model'] 26 | for k in list(state_dict.keys()): 27 | if k.startswith('module.'): 28 | state_dict[k[len("module."):]] = state_dict[k] 29 | del state_dict[k] 30 | 31 | incompatible = model.load_state_dict(state_dict, strict=False) 32 | 33 | if incompatible.missing_keys: 34 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 35 | if incompatible.unexpected_keys: 36 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 37 | 38 | return model 39 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import ( # noqa: F401 2 | clip_vitb32 3 | ) 4 | 5 | from .declip import declip_vitb32 6 | 7 | from .filip import filip_vitb32 8 | 9 | from .slip import slip_vitb32 10 | 11 | from .defilip import defilip_vitb32 12 | 13 | 14 | 15 | def model_entry(config): 16 | return globals()[config['type']](**config['kwargs']) 17 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/clip.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from socket import IP_DEFAULT_MULTICAST_LOOP 3 | from typing import Tuple, Union, List 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | import numpy as np 8 | import os 9 | 10 | import timm 11 | from .image_encoder.visual_transformer import visual_transformer_B32, visual_transformer_B16 12 | # from .image_encoder.modified_resnet import modified_resnet_R50, modified_resnet_R101 13 | from .text_encoder.text_transformer import text_transformers 14 | 15 | 16 | BN = None 17 | 18 | __all__ = ['clip_res50', 'clip_vitb32'] 19 | 20 | class AllGather(torch.autograd.Function): 21 | 22 | @staticmethod 23 | def forward(ctx, tensor): 24 | ctx.rank = link.get_rank() 25 | ctx.world_size = link.get_world_size() 26 | 27 | # y = tensor.new(ctx.world_size, *tensor.size()) 28 | 29 | y = [tensor.new(*tensor.size()) for _ in range(ctx.world_size)] 30 | 31 | link.allgather(y, tensor) 32 | 33 | y = torch.cat(y, 0).view(-1, *tensor.size()) 34 | 35 | 
return y 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | in_grad = torch.zeros_like(grad_output) 40 | in_grad.copy_(grad_output) 41 | # sum grad for gathered tensor 42 | link.allreduce(in_grad) 43 | # split 44 | return in_grad[ctx.rank] 45 | 46 | 47 | 48 | class CLIP(nn.Module): 49 | def __init__(self,image_encode, text_encode, use_allgather): 50 | super().__init__() 51 | self.use_allgather = use_allgather 52 | self.visual =image_encode 53 | self.encode_text = text_encode 54 | self.logit_scale = nn.Parameter(torch.ones([1])) 55 | # self.logit_scale = nn.Parameter(torch.ones([])) 56 | nn.init.constant_(self.logit_scale, np.log(1/0.07)) 57 | #nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) 58 | 59 | def text_parameters(self): 60 | param = [self.logit_scale] 61 | if self.encode_text.text_encode_type == 'Transformer': 62 | param.append(self.encode_text.positional_embedding) 63 | elif self.encode_text.text_encode_type == 'Bert': 64 | # print('Bert', self.encode_text.text_transformer.cls.predictions, flush=True) 65 | # param.extend([self.encode_text.text_transformer.cls.predictions.decoder.weight, 66 | # self.encode_text.text_transformer.cls.predictions.bias]) 67 | param.extend([self.encode_text.text_transformer.cls.predictions.bias]) 68 | return param 69 | 70 | def text_modules(self): 71 | if self.encode_text.text_encode_type == 'Transformer': 72 | return [self.encode_text.transformer, self.encode_text.text_projection, self.encode_text.token_embedding, self.encode_text.ln_final] 73 | elif self.encode_text.text_encode_type == 'Bert': 74 | # print('Bert', self.encode_text.text_transformer, flush=True) 75 | return [self.encode_text.text_transformer.bert, self.encode_text.text_projection, 76 | self.encode_text.text_transformer.cls.predictions.transform] 77 | # self.encode_text.text_transformer.cls.predictions.decoder, # decoder: bias 78 | else: 79 | import ipdb 80 | ipdb.set_trace() 81 | return [self.encode_text.text_transformer, self.encode_text.text_projection] 82 | 83 | def visual_parameters(self): 84 | return [] 85 | 86 | def visual_modules(self): 87 | return [self.visual] 88 | 89 | @property 90 | def dtype(self): 91 | try: 92 | return self.visual.conv1.weight.dtype 93 | except: 94 | try: 95 | return self.visual.head.weight.dtype 96 | except: 97 | try: 98 | return self.visual.stem[0].weight.dtype 99 | except: 100 | return self.encode_text.text_projection.weight.dtype 101 | 102 | def encode_image(self, image): 103 | return self.visual(image.type(self.dtype)) 104 | 105 | def sample_captions(self, texts): 106 | return [text[0] for text in texts] 107 | 108 | def all_gather(self, input): 109 | output = AllGather.apply(input) 110 | output = output.view(-1, *(output.shape[2:])) 111 | return output 112 | 113 | def forward(self, input, all_gather=False): 114 | # input 115 | images = input['images'] 116 | texts = input['captions'] 117 | texts = self.sample_captions(texts) 118 | # text&image encode 119 | image_features = self.encode_image(images) 120 | text_features = self.encode_text(texts) 121 | 122 | 123 | # normalized features 124 | image_features = image_features / (image_features.norm(dim=-1, keepdim=True)) 125 | text_features = text_features / (text_features.norm(dim=-1, keepdim=True)+1e-10) 126 | 127 | # cosine similarity as logits 128 | logit_scale = self.logit_scale.exp() 129 | logit_scale.data = torch.clamp(logit_scale.data, max=100) 130 | 131 | if self.training and self.use_allgather or all_gather: 132 | gathered_image_features = 
self.all_gather(image_features) 133 | gathered_text_features = self.all_gather(text_features) 134 | 135 | logits_per_image = logit_scale * image_features @ gathered_text_features.t() 136 | logits_per_text = logit_scale * text_features @ gathered_image_features.t() 137 | else: 138 | logits_per_image = logit_scale * image_features @ text_features.t() 139 | logits_per_text = logit_scale * text_features @ image_features.t() 140 | 141 | return logits_per_image, logits_per_text 142 | 143 | 144 | def clip_res50(**kwargs): 145 | """ 146 | Constructs a clip_res50 model. 147 | """ 148 | image_encode = modified_resnet_R50(**kwargs['image_encode']) 149 | text_encode = text_transformers(**kwargs['text_encode']) 150 | model = CLIP(image_encode,text_encode,**kwargs['clip']) 151 | return model 152 | 153 | def clip_vitb32(**kwargs): 154 | """' 155 | Constructs a clip_ViT_B32 model. 156 | """ 157 | image_encode = visual_transformer_B32(**kwargs['image_encode']) 158 | text_encode = text_transformers(**kwargs['text_encode']) 159 | model = CLIP(image_encode,text_encode,**kwargs['clip']) 160 | return model 161 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/image_encoder/base_transformer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.utils.checkpoint import checkpoint_sequential 6 | 7 | global LAYER_NORM 8 | LAYER_NORM = True 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | if LAYER_NORM: 15 | ret = super().forward(x) 16 | else: 17 | ret = x 18 | return ret 19 | # orig_type = x.dtype 20 | # ret = super().forward(x.type(torch.float32)) 21 | # return ret.type(orig_type) 22 | 23 | 24 | class QuickGELU(nn.Module): 25 | def forward(self, x: torch.Tensor): 26 | return x * torch.sigmoid(1.702 * x) 27 | 28 | 29 | class ResidualAttentionBlock(nn.Module): 30 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, dropout: float = 0.): 31 | super().__init__() 32 | 33 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout) 34 | self.ln_1 = LayerNorm(d_model) 35 | self.mlp = nn.Sequential(OrderedDict([ 36 | ("c_fc", nn.Linear(d_model, d_model * 4)), 37 | ("gelu", QuickGELU()), 38 | # ("dropout_1", nn.Dropout(dropout)), 39 | ("c_proj", nn.Linear(d_model * 4, d_model)), 40 | # ("dropout_2", nn.Dropout(dropout)) 41 | ])) 42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | self.attn_mask = self.attn_mask.to( 47 | dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 48 | return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)[0] 49 | 50 | def forward(self, x: torch.Tensor): 51 | x = x + self.attention(self.ln_1(x)) 52 | x = x + self.mlp(self.ln_2(x)) 53 | return x 54 | 55 | 56 | class Transformer(nn.Module): 57 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, checkpoint: bool = False, dropout: float = 0., emb_dropout: float = 0.): 58 | super().__init__() 59 | self.width = width 60 | self.layers = layers 61 | self.checkpoint = checkpoint 62 | self.dropout = nn.Dropout(emb_dropout) 63 | self.resblocks = nn.Sequential( 64 | *[ResidualAttentionBlock(width, heads, attn_mask, dropout=dropout) for _ in range(layers)]) 65 | 66 | def checkpoint_fwd(self, 
layer, input, segments=2): 67 | """checkpoint forward""" 68 | # Make sure that the input to checkpoint have requires_grad=True, so that 69 | # the autograd can take care of the checkpointed part of model 70 | if not input.requires_grad: 71 | input = input.detach() 72 | input.requires_grad = True 73 | return checkpoint_sequential(layer, segments, input) 74 | 75 | def forward(self, x: torch.Tensor): 76 | x = self.dropout(x) 77 | if self.checkpoint: 78 | return self.checkpoint_fwd(self.resblocks, x, self.layers) 79 | return self.resblocks(x) 80 | 81 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/image_encoder/visual_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .base_transformer import Transformer, LayerNorm 5 | 6 | class VisualTransformer(nn.Module): 7 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, embed_dim: int, checkpoint: bool, dropout: float=0, emb_dropout: float=0): 8 | super().__init__() 9 | self.input_resolution = input_resolution 10 | output_dim = embed_dim 11 | self.output_dim = output_dim 12 | self.freeze_conv1 = True 13 | # self.freeze_conv1 = False 14 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, 15 | kernel_size=patch_size, stride=patch_size, bias=False) 16 | 17 | scale = width ** -0.5 18 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 19 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 20 | self.ln_pre = LayerNorm(width) 21 | 22 | self.transformer = Transformer(width, layers, heads, checkpoint=checkpoint, dropout=dropout, emb_dropout=emb_dropout) 23 | 24 | self.ln_post = LayerNorm(width) 25 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) 26 | self.initialize_parameters() 27 | 28 | def initialize_parameters(self): 29 | nn.init.normal_(self.positional_embedding, std=0.01) 30 | 31 | proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) 32 | attn_std = self.transformer.width ** -0.5 33 | fc_std = (2 * self.transformer.width) ** -0.5 34 | for block in self.transformer.resblocks: 35 | nn.init.normal_(block.attn.in_proj_weight, std=attn_std) 36 | nn.init.normal_(block.attn.out_proj.weight, std=proj_std) 37 | nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) 38 | nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) 39 | 40 | def train(self, mode=True): 41 | self.training = mode 42 | for module in self.children(): 43 | module.train(mode) 44 | 45 | if self.freeze_conv1: 46 | for layer in [self.conv1]: 47 | layer.eval() 48 | for param in layer.parameters(): 49 | param.requires_grad = False 50 | return self 51 | 52 | 53 | def forward(self, x: torch.Tensor, return_dense=False, return_feature=False): 54 | x = self.conv1(x) # shape = [*, width, grid, grid] 55 | # shape = [*, width, grid ** 2] 56 | x = x.reshape(x.shape[0], x.shape[1], -1) 57 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 58 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], 59 | dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 60 | x = x + self.positional_embedding.to(x.dtype) 61 | x = self.ln_pre(x) 62 | 63 | x = x.permute(1, 0, 2) # NLD -> LND 64 | x = self.transformer(x) 65 | x = x.permute(1, 0, 2) # LND -> NLD 66 | dense_feat = x[:, 
1:, :] 67 | x = self.ln_post(x[:, 0, :]) 68 | feature = x 69 | 70 | if self.proj is not None: 71 | x = x @ self.proj 72 | 73 | ret = [x] 74 | if return_dense: 75 | ret.append(dense_feat) 76 | if return_feature: 77 | ret.append(feature) 78 | if len(ret) == 1: 79 | return ret[0] 80 | return tuple(ret) 81 | # if return_dense: 82 | # return x, dense_feat 83 | 84 | # return x 85 | 86 | def visual_transformer_B32(**kwargs): 87 | vision_width = 768 88 | vision_layers = 12 89 | vision_heads = vision_width // 64 90 | 91 | default_kwargs = { 92 | # 'output_dim': 512, from config 93 | 'layers':vision_layers, 94 | 'heads': vision_heads, 95 | 'input_resolution': 224, 96 | 'patch_size': 32, 97 | 'width': vision_width, 98 | 'checkpoint': False 99 | } 100 | default_kwargs.update(**kwargs) 101 | model = VisualTransformer(**default_kwargs) 102 | return model 103 | 104 | def visual_transformer_B16(**kwargs): 105 | vision_width = 768 106 | vision_layers = 12 107 | vision_heads = vision_width // 64 108 | 109 | default_kwargs = { 110 | # 'output_dim': 512, from config 111 | 'layers':vision_layers, 112 | 'heads': vision_heads, 113 | 'input_resolution': 224, 114 | 'patch_size': 16, 115 | 'width': vision_width, 116 | 'checkpoint': False 117 | } 118 | default_kwargs.update(**kwargs) 119 | model = VisualTransformer(**default_kwargs) 120 | return model 121 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/text_encoder/base_transformer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.utils.checkpoint import checkpoint_sequential 6 | 7 | global LAYER_NORM 8 | LAYER_NORM = True 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | if LAYER_NORM: 15 | ret = super().forward(x) 16 | else: 17 | ret = x 18 | return ret 19 | # orig_type = x.dtype 20 | # ret = super().forward(x.type(torch.float32)) 21 | # return ret.type(orig_type) 22 | 23 | 24 | class QuickGELU(nn.Module): 25 | def forward(self, x: torch.Tensor): 26 | return x * torch.sigmoid(1.702 * x) 27 | 28 | 29 | class ResidualAttentionBlock(nn.Module): 30 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, dropout: float = 0.): 31 | super().__init__() 32 | 33 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout) 34 | self.ln_1 = LayerNorm(d_model) 35 | self.mlp = nn.Sequential(OrderedDict([ 36 | ("c_fc", nn.Linear(d_model, d_model * 4)), 37 | ("gelu", QuickGELU()), 38 | # ("dropout_1", nn.Dropout(dropout)), 39 | ("c_proj", nn.Linear(d_model * 4, d_model)), 40 | # ("dropout_2", nn.Dropout(dropout)) 41 | ])) 42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | self.attn_mask = self.attn_mask.to( 47 | dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 48 | return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)[0] 49 | 50 | def forward(self, x: torch.Tensor): 51 | x = x + self.attention(self.ln_1(x)) 52 | x = x + self.mlp(self.ln_2(x)) 53 | return x 54 | 55 | 56 | class Transformer(nn.Module): 57 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, checkpoint: bool = False, dropout: float = 0., emb_dropout: float = 0.): 58 | super().__init__() 59 | self.width = width 60 | self.layers = layers 61 | 
self.checkpoint = checkpoint 62 | self.dropout = nn.Dropout(emb_dropout) 63 | self.resblocks = nn.Sequential( 64 | *[ResidualAttentionBlock(width, heads, attn_mask, dropout=dropout) for _ in range(layers)]) 65 | 66 | def checkpoint_fwd(self, layer, input, segments=2): 67 | """checkpoint forward""" 68 | # Make sure that the input to checkpoint have requires_grad=True, so that 69 | # the autograd can take care of the checkpointed part of model 70 | if not input.requires_grad: 71 | input = input.detach() 72 | input.requires_grad = True 73 | return checkpoint_sequential(layer, segments, input) 74 | 75 | def forward(self, x: torch.Tensor): 76 | x = self.dropout(x) 77 | if self.checkpoint: 78 | return self.checkpoint_fwd(self.resblocks, x, self.layers) 79 | return self.resblocks(x) 80 | 81 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/__init__.py: -------------------------------------------------------------------------------- 1 | """The lightly.models.modules package provides reusable modules. 2 | 3 | This package contains reusable modules such as the NNmemoryBankModule which 4 | can be combined with any lightly model. 5 | 6 | """ 7 | 8 | # Copyright (c) 2021. Lightly AG and its affiliates. 9 | # All Rights Reserved 10 | 11 | from .nn_memory_bank import NNMemoryBankModule 12 | from .memory_bank_cuda import MemoryBankModule 13 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/memory_bank.py: -------------------------------------------------------------------------------- 1 | """ Memory Bank Wrapper """ 2 | 3 | # Copyright (c) 2020. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | import functools 8 | 9 | class MemoryBankModule(torch.nn.Module): 10 | """Memory bank implementation 11 | 12 | This is a parent class to all loss functions implemented by the lightly 13 | Python package. This way, any loss can be used with a memory bank if 14 | desired. 15 | 16 | Attributes: 17 | size: 18 | Number of keys the memory bank can store. If set to 0, 19 | memory bank is not used. 20 | 21 | Examples: 22 | >>> class MyLossFunction(MemoryBankModule): 23 | >>> 24 | >>> def __init__(self, memory_bank_size: int = 2 ** 16): 25 | >>> super(MyLossFunction, self).__init__(memory_bank_size) 26 | >>> 27 | >>> def forward(self, output: torch.Tensor, 28 | >>> labels: torch.Tensor = None): 29 | >>> 30 | >>> output, negatives = super( 31 | >>> MyLossFunction, self).forward(output) 32 | >>> 33 | >>> if negatives is not None: 34 | >>> # evaluate loss with negative samples 35 | >>> else: 36 | >>> # evaluate loss without negative samples 37 | 38 | """ 39 | 40 | def __init__(self, size: int = 2 ** 16): 41 | 42 | super(MemoryBankModule, self).__init__() 43 | 44 | if size < 0: 45 | msg = f'Illegal memory bank size {size}, must be non-negative.' 46 | raise ValueError(msg) 47 | 48 | self.size = size 49 | 50 | self.bank = None 51 | self.bank_ptr = None 52 | 53 | @torch.no_grad() 54 | def _init_memory_bank(self, dim: int): 55 | """Initialize the memory bank if it's empty 56 | 57 | Args: 58 | dim: 59 | The dimension of the which are stored in the bank. 
60 | 61 | """ 62 | # create memory bank 63 | # we could use register buffers like in the moco repo 64 | # https://github.com/facebookresearch/moco but we don't 65 | # want to pollute our checkpoints 66 | self.bank = torch.randn(dim, self.size) 67 | self.bank = torch.nn.functional.normalize(self.bank, dim=0) 68 | self.bank_ptr = torch.LongTensor([0]) 69 | 70 | @torch.no_grad() 71 | def _dequeue_and_enqueue(self, batch: torch.Tensor): 72 | """Dequeue the oldest batch and add the latest one 73 | 74 | Args: 75 | batch: 76 | The latest batch of keys to add to the memory bank. 77 | 78 | """ 79 | batch_size = batch.shape[0] 80 | ptr = int(self.bank_ptr) 81 | 82 | if ptr + batch_size >= self.size: 83 | self.bank[:, ptr:] = batch[:self.size - ptr].T.detach() 84 | self.bank_ptr[0] = 0 85 | else: 86 | self.bank[:, ptr:ptr + batch_size] = batch.T.detach() 87 | self.bank_ptr[0] = ptr + batch_size 88 | 89 | def forward(self, 90 | output: torch.Tensor, 91 | labels: torch.Tensor = None, 92 | update: bool = False): 93 | """Query memory bank for additional negative samples 94 | 95 | Args: 96 | output: 97 | The output of the model. 98 | labels: 99 | Should always be None, will be ignored. 100 | 101 | Returns: 102 | The output if the memory bank is of size 0, otherwise the output 103 | and the entries from the memory bank. 104 | 105 | """ 106 | 107 | # no memory bank, return the output 108 | if self.size == 0: 109 | return output, None 110 | 111 | _, dim = output.shape 112 | 113 | # initialize the memory bank if it is not already done 114 | if self.bank is None: 115 | self._init_memory_bank(dim) 116 | 117 | # query and update memory bank 118 | bank = self.bank.clone().detach() 119 | 120 | # only update memory bank if we later do backward pass (gradient) 121 | if update: 122 | self._dequeue_and_enqueue(output) 123 | 124 | return output, bank 125 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/memory_bank_cuda.py: -------------------------------------------------------------------------------- 1 | """ Memory Bank Wrapper """ 2 | 3 | # Copyright (c) 2020. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | import functools 8 | 9 | class MemoryBankModule(torch.nn.Module): 10 | """Memory bank implementation 11 | 12 | This is a parent class to all loss functions implemented by the lightly 13 | Python package. This way, any loss can be used with a memory bank if 14 | desired. 15 | 16 | Attributes: 17 | size: 18 | Number of keys the memory bank can store. If set to 0, 19 | memory bank is not used. 20 | 21 | Examples: 22 | >>> class MyLossFunction(MemoryBankModule): 23 | >>> 24 | >>> def __init__(self, memory_bank_size: int = 2 ** 16): 25 | >>> super(MyLossFunction, self).__init__(memory_bank_size) 26 | >>> 27 | >>> def forward(self, output: torch.Tensor, 28 | >>> labels: torch.Tensor = None): 29 | >>> 30 | >>> output, negatives = super( 31 | >>> MyLossFunction, self).forward(output) 32 | >>> 33 | >>> if negatives is not None: 34 | >>> # evaluate loss with negative samples 35 | >>> else: 36 | >>> # evaluate loss without negative samples 37 | 38 | """ 39 | 40 | def __init__(self, size: int = 2 ** 16): 41 | 42 | super(MemoryBankModule, self).__init__() 43 | 44 | if size < 0: 45 | msg = f'Illegal memory bank size {size}, must be non-negative.' 
46 | raise ValueError(msg) 47 | 48 | self.size = size 49 | 50 | self.bank = None 51 | self.bank_ptr = None 52 | 53 | @torch.no_grad() 54 | def _init_memory_bank(self, dim: int): 55 | """Initialize the memory bank if it's empty 56 | 57 | Args: 58 | dim: 59 | The dimension of the which are stored in the bank. 60 | 61 | """ 62 | # create memory bank 63 | # we could use register buffers like in the moco repo 64 | # https://github.com/facebookresearch/moco but we don't 65 | # want to pollute our checkpoints 66 | self.bank = torch.randn(dim, self.size).cuda().half() 67 | self.bank = self.bank / (self.bank.norm(dim=0, keepdim=True)+1e-10) 68 | self.bank_ptr = torch.LongTensor([0]) 69 | 70 | @torch.no_grad() 71 | def _dequeue_and_enqueue(self, batch: torch.Tensor): 72 | """Dequeue the oldest batch and add the latest one 73 | 74 | Args: 75 | batch: 76 | The latest batch of keys to add to the memory bank. 77 | 78 | """ 79 | batch_size = batch.shape[0] 80 | ptr = int(self.bank_ptr) 81 | 82 | while ptr + batch_size >= self.size: 83 | self.bank[:, ptr:] = batch[:self.size - ptr].T.detach() 84 | batch = batch[self.size - ptr:] 85 | self.bank_ptr[0] = 0 86 | 87 | batch_size = batch.shape[0] 88 | ptr = int(self.bank_ptr) 89 | 90 | if batch_size != 0: 91 | self.bank[:, ptr:ptr + batch_size] = batch.T.detach() 92 | self.bank_ptr[0] = ptr + batch_size 93 | 94 | def forward(self, 95 | output: torch.Tensor, 96 | labels: torch.Tensor = None, 97 | update: bool = False): 98 | """Query memory bank for additional negative samples 99 | 100 | Args: 101 | output: 102 | The output of the model. 103 | labels: 104 | Should always be None, will be ignored. 105 | 106 | Returns: 107 | The output if the memory bank is of size 0, otherwise the output 108 | and the entries from the memory bank. 109 | 110 | """ 111 | 112 | # no memory bank, return the output 113 | if self.size == 0: 114 | return output, None 115 | 116 | _, dim = output.shape 117 | 118 | # initialize the memory bank if it is not already done 119 | if self.bank is None: 120 | self._init_memory_bank(dim) 121 | 122 | # query and update memory bank 123 | bank = self.bank.clone().detach() 124 | 125 | # only update memory bank if we later do backward pass (gradient) 126 | if update: 127 | self._dequeue_and_enqueue(output) 128 | 129 | return output, bank 130 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/nn_memory_bank.py: -------------------------------------------------------------------------------- 1 | """ Nearest Neighbour Memory Bank Module """ 2 | 3 | # Copyright (c) 2021. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | from .memory_bank import MemoryBankModule 8 | 9 | 10 | class NNMemoryBankModule(MemoryBankModule): 11 | """Nearest Neighbour Memory Bank implementation 12 | 13 | This class implements a nearest neighbour memory bank as described in the 14 | NNCLR paper[0]. During the forward pass we return the nearest neighbour 15 | from the memory bank. 16 | 17 | [0] NNCLR, 2021, https://arxiv.org/abs/2104.14548 18 | 19 | Attributes: 20 | size: 21 | Number of keys the memory bank can store. If set to 0, 22 | memory bank is not used. 
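topk:
    Number of nearest neighbours returned per query; forward()
    returns a list of topk tensors rather than a single tensor.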
23 | 24 | Examples: 25 | >>> model = NNCLR(backbone) 26 | >>> criterion = NTXentLoss(temperature=0.1) 27 | >>> 28 | >>> nn_replacer = NNmemoryBankModule(size=2 ** 16) 29 | >>> 30 | >>> # forward pass 31 | >>> (z0, p0), (z1, p1) = model(x0, x1) 32 | >>> z0 = nn_replacer(z0.detach(), update=False) 33 | >>> z1 = nn_replacer(z1.detach(), update=True) 34 | >>> 35 | >>> loss = 0.5 * (criterion(z0, p1) + criterion(z1, p0)) 36 | 37 | """ 38 | def __init__(self, size: int = 2 ** 16, topk: int = 1): 39 | super(NNMemoryBankModule, self).__init__(size) 40 | self.topk = topk 41 | 42 | def forward(self, 43 | output: torch.Tensor, 44 | update: bool = False): 45 | """Returns nearest neighbour of output tensor from memory bank 46 | 47 | Args: 48 | output: The torch tensor for which you want the nearest neighbour 49 | update: If `True` updated the memory bank by adding output to it 50 | 51 | """ 52 | 53 | output, bank = super(NNMemoryBankModule, self).forward(output, update=update) 54 | bank = bank.to(output.device).t() 55 | 56 | output_normed = torch.nn.functional.normalize(output, dim=1) 57 | bank_normed = torch.nn.functional.normalize(bank, dim=1) 58 | 59 | similarity_matrix = torch.einsum("nd,md->nm", output_normed, bank_normed) 60 | # index_nearest_neighbours = torch.argmax(similarity_matrix, dim=1) 61 | # nearest_neighbours = torch.index_select(bank, dim=0, index=index_nearest_neighbours) 62 | _, index_nearest_neighbours = torch.topk(similarity_matrix, k=self.topk, dim=1) 63 | nearest_neighbours = [torch.index_select(bank, dim=0, index=index_nearest_neighbours[:,i]) for i in range(self.topk)] 64 | 65 | return nearest_neighbours 66 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/models/declip_model/utils/text_utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/mask_tokens.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Tuple, List 3 | 4 | 5 | def mask_tokens(inputs, special_tokens, mask_token, tokenizer_length, mlm_probability=0.15, special_tokens_mask=None) -> Tuple[torch.Tensor, torch.Tensor]: 6 | """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
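Returns a tuple (masked_inputs, labels); positions that were not selected
for masking carry the ignore label -100.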
""" 7 | labels = inputs.clone() 8 | # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) 9 | probability_matrix = torch.full(labels.shape, mlm_probability) 10 | if special_tokens_mask is None: 11 | special_tokens_mask = [1 if val in special_tokens else 0 for val in labels.tolist()] 12 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 13 | # if tokenizer._pad_token is not None: 14 | # padding_mask = labels.eq(tokenizer.pad_token_id) 15 | # probability_matrix.masked_fill_(padding_mask, value=0.0) 16 | masked_indices = torch.bernoulli(probability_matrix).bool() 17 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 18 | 19 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 20 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 21 | inputs[indices_replaced] = mask_token 22 | 23 | # 10% of the time, we replace masked input tokens with random word 24 | indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 25 | random_words = torch.randint(tokenizer_length, labels.shape, dtype=torch.long) 26 | inputs[indices_random] = random_words[indices_random] 27 | 28 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 29 | return inputs, labels 30 | 31 | 32 | def MaskTokens(tokens, mask_type, mask_token, special_tokens=None, tokenizer_length=None, sepcial_tokens_mask=None, special_tokens_mask=None): 33 | if mask_type == 'MLM': 34 | tokens, labels = mask_tokens(inputs=tokens, special_tokens=special_tokens, mask_token=mask_token, tokenizer_length=tokenizer_length, special_tokens_mask=special_tokens_mask) 35 | else: 36 | raise NotImplementedError(mask_type) 37 | return tokens, labels 38 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | # Change: Extend <|mask|> tokenizer-size+=1 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(os.path.join(os.path.dirname(os.path.abspath(__file__)), bpe_path)).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|mask|>']) 74 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 75 | self.encoder = dict(zip(vocab, range(len(vocab)))) 76 | self.decoder = {v: k for k, v in self.encoder.items()} 77 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 78 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 79 | self.cache['<|mask|>'] = '<|mask|>' 80 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 81 | 82 | def bpe(self, token): 83 | if token in self.cache: 84 | return self.cache[token] 85 | word = tuple(token[:-1]) + ( token[-1] + '',) 86 | pairs = get_pairs(word) 87 | 88 | if not pairs: 89 | return token+'' 90 | 91 | while True: 92 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 93 | if bigram not in self.bpe_ranks: 94 | break 95 | first, second = bigram 96 | new_word = [] 97 | i = 0 98 | while i < len(word): 99 | try: 100 | j = word.index(first, i) 101 | new_word.extend(word[i:j]) 102 | i = j 103 | except: 104 | new_word.extend(word[i:]) 105 | break 106 | 107 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 108 | new_word.append(first+second) 109 | i += 2 110 | else: 111 | new_word.append(word[i]) 112 | i += 1 113 | new_word = tuple(new_word) 114 | word = new_word 115 | if len(word) == 1: 116 | break 117 | else: 118 | pairs = get_pairs(word) 119 | word = ' '.join(word) 120 | self.cache[token] = word 121 | return word 122 | 123 | def encode(self, text): 124 | bpe_tokens = [] 125 | text = whitespace_clean(basic_clean(text)).lower() 126 | for token in re.findall(self.pat, text): 127 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 128 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 129 | return bpe_tokens 130 | 131 | def decode(self, tokens): 132 | text = ''.join([self.decoder[token] for token in tokens]) 133 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 134 | return text 135 | -------------------------------------------------------------------------------- /vision_benchmark/models/mae.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm 9 | # DeiT: https://github.com/facebookresearch/deit 10 | # -------------------------------------------------------- 11 | 12 | from functools import partial 13 | 14 | import torch 15 | import torch.nn as nn 16 | 17 | import timm.models.vision_transformer 18 | 19 | import logging 20 | 21 | 22 | class VisionTransformer(timm.models.vision_transformer.VisionTransformer): 23 | """ Vision Transformer with support for global average pooling 24 | """ 25 | def __init__(self, global_pool=False, **kwargs): 26 | super(VisionTransformer, self).__init__(**kwargs) 27 | 28 | self.global_pool = global_pool 29 | if self.global_pool: 30 | norm_layer = kwargs['norm_layer'] 31 | embed_dim = kwargs['embed_dim'] 32 | self.fc_norm = norm_layer(embed_dim) 33 | 34 | del self.norm # remove the original norm 35 | 36 | def forward_features(self, x): 37 | B = x.shape[0] 38 | x = self.patch_embed(x) 39 | 40 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 41 | x = torch.cat((cls_tokens, x), dim=1) 42 | x = x + self.pos_embed 43 | x = self.pos_drop(x) 44 | 45 | for blk in self.blocks: 46 | x = blk(x) 47 | 48 | if self.global_pool: 49 | x = x[:, 1:, :].mean(dim=1) # global pool without cls token 50 | outcome = self.fc_norm(x) 51 | else: 52 | x = self.norm(x) 53 | outcome = x[:, 0] 54 | 55 | return outcome 56 | 57 | 58 | def vit_base_patch16(**kwargs): 59 | model = VisionTransformer( 60 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 61 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 62 | return model 63 | 64 | 65 | def vit_large_patch16(**kwargs): 66 | model = VisionTransformer( 67 | patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, 68 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 69 | return model 70 | 71 | 72 | def vit_huge_patch14(**kwargs): 73 | model = VisionTransformer( 74 | patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, 75 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 76 | return model 77 | 78 | def get_model(config): 79 | mae_specs = config.MODEL.SPEC 80 | 81 | model = VisionTransformer( 82 | patch_size=mae_specs.PATCH_SIZE, embed_dim=mae_specs.EMBED_DIM, 83 | depth=mae_specs.DEPTH, num_heads=mae_specs.NUM_HEADS, mlp_ratio=mae_specs.MLP_RATIO, 84 | qkv_bias=mae_specs.QKV_BIAS, 85 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 86 | global_pool=mae_specs.GLOBAL_POOL) 87 | 88 | model_file = config.TEST.MODEL_FILE 89 | logging.info(f'=> load model file: {model_file}') 90 | 91 | if model_file.startswith('http'): 92 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 93 | else: 94 | checkpoint = torch.load(model_file, map_location="cpu") 95 | 96 | state_dict = checkpoint['model'] 97 | 98 | incompatible = model.load_state_dict(state_dict, strict=False) 99 | 100 | if incompatible.missing_keys: 101 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 102 | if incompatible.unexpected_keys: 103 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 104 | 105 | return model -------------------------------------------------------------------------------- 
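For orientation, a minimal, hypothetical usage sketch of the MAE backbone factory defined in vision_benchmark/models/mae.py above (not part of the repository). The import path mirrors the file location; the 224x224 input size, the num_classes=0 argument, and the expected 768-dimensional output are assumptions based on the ViT-B/16 defaults and on forward_features as written, and may vary with the installed timm version.

import torch

# Hypothetical sketch: build the ViT-B/16 backbone defined above and run a shape check.
# Assumes the vision_benchmark package is importable and a compatible timm version is installed.
from vision_benchmark.models.mae import vit_base_patch16

model = vit_base_patch16(num_classes=0, global_pool=True)  # mean-pooled patch features via fc_norm
model.eval()

with torch.no_grad():
    feats = model.forward_features(torch.randn(2, 3, 224, 224))  # (batch, channels, H, W)

print(feats.shape)  # expected: torch.Size([2, 768])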
/vision_benchmark/models/mocov3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from functools import partial, reduce 11 | from operator import mul 12 | 13 | from timm.models.vision_transformer import VisionTransformer, _cfg 14 | from timm.models.layers.helpers import to_2tuple 15 | from timm.models.layers import PatchEmbed 16 | 17 | import logging 18 | 19 | 20 | class VisionTransformerMoCo(VisionTransformer): 21 | def __init__(self, stop_grad_conv1=False, **kwargs): 22 | super().__init__(**kwargs) 23 | # Use fixed 2D sin-cos position embedding 24 | self.build_2d_sincos_position_embedding() 25 | 26 | # weight initialization 27 | for name, m in self.named_modules(): 28 | if isinstance(m, nn.Linear): 29 | if 'qkv' in name: 30 | # treat the weights of Q, K, V separately 31 | val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1])) 32 | nn.init.uniform_(m.weight, -val, val) 33 | else: 34 | nn.init.xavier_uniform_(m.weight) 35 | nn.init.zeros_(m.bias) 36 | nn.init.normal_(self.cls_token, std=1e-6) 37 | 38 | if isinstance(self.patch_embed, PatchEmbed): 39 | # xavier_uniform initialization 40 | val = math.sqrt(6. / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)) 41 | nn.init.uniform_(self.patch_embed.proj.weight, -val, val) 42 | nn.init.zeros_(self.patch_embed.proj.bias) 43 | 44 | if stop_grad_conv1: 45 | self.patch_embed.proj.weight.requires_grad = False 46 | self.patch_embed.proj.bias.requires_grad = False 47 | 48 | def build_2d_sincos_position_embedding(self, temperature=10000.): 49 | h, w = self.patch_embed.grid_size 50 | grid_w = torch.arange(w, dtype=torch.float32) 51 | grid_h = torch.arange(h, dtype=torch.float32) 52 | grid_w, grid_h = torch.meshgrid(grid_w, grid_h) 53 | assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' 54 | pos_dim = self.embed_dim // 4 55 | omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim 56 | omega = 1. / (temperature**omega) 57 | out_w = torch.einsum('m,d->md', [grid_w.flatten(), omega]) 58 | out_h = torch.einsum('m,d->md', [grid_h.flatten(), omega]) 59 | pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :] 60 | 61 | assert self.num_tokens == 1, 'Assuming one and only one token, [cls]' 62 | pe_token = torch.zeros([1, 1, self.embed_dim], dtype=torch.float32) 63 | self.pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1)) 64 | self.pos_embed.requires_grad = False 65 | 66 | 67 | class ConvStem(nn.Module): 68 | """ 69 | ConvStem, from Early Convolutions Help Transformers See Better, Tete et al. 
https://arxiv.org/abs/2106.14881 70 | """ 71 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): 72 | super().__init__() 73 | 74 | assert patch_size == 16, 'ConvStem only supports patch size of 16' 75 | assert embed_dim % 8 == 0, 'Embed dimension must be divisible by 8 for ConvStem' 76 | 77 | img_size = to_2tuple(img_size) 78 | patch_size = to_2tuple(patch_size) 79 | self.img_size = img_size 80 | self.patch_size = patch_size 81 | self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) 82 | self.num_patches = self.grid_size[0] * self.grid_size[1] 83 | self.flatten = flatten 84 | 85 | # build stem, similar to the design in https://arxiv.org/abs/2106.14881 86 | stem = [] 87 | input_dim, output_dim = 3, embed_dim // 8 88 | for l in range(4): 89 | stem.append(nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=2, padding=1, bias=False)) 90 | stem.append(nn.BatchNorm2d(output_dim)) 91 | stem.append(nn.ReLU(inplace=True)) 92 | input_dim = output_dim 93 | output_dim *= 2 94 | stem.append(nn.Conv2d(input_dim, embed_dim, kernel_size=1)) 95 | self.proj = nn.Sequential(*stem) 96 | 97 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 98 | 99 | def forward(self, x): 100 | B, C, H, W = x.shape 101 | assert H == self.img_size[0] and W == self.img_size[1], \ 102 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 103 | x = self.proj(x) 104 | if self.flatten: 105 | x = x.flatten(2).transpose(1, 2) # BCHW -> BNC 106 | x = self.norm(x) 107 | return x 108 | 109 | 110 | def vit_small(**kwargs): 111 | model = VisionTransformerMoCo( 112 | patch_size=16, embed_dim=384, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 113 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 114 | model.default_cfg = _cfg() 115 | return model 116 | 117 | def vit_base(**kwargs): 118 | model = VisionTransformerMoCo( 119 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 120 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 121 | model.default_cfg = _cfg() 122 | return model 123 | 124 | def vit_conv_small(**kwargs): 125 | # minus one ViT block 126 | model = VisionTransformerMoCo( 127 | patch_size=16, embed_dim=384, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, 128 | norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) 129 | model.default_cfg = _cfg() 130 | return model 131 | 132 | def vit_conv_base(**kwargs): 133 | # minus one ViT block 134 | model = VisionTransformerMoCo( 135 | patch_size=16, embed_dim=768, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, 136 | norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) 137 | model.default_cfg = _cfg() 138 | return model 139 | 140 | 141 | def get_model(config): 142 | mae_specs = config.MODEL.SPEC 143 | 144 | model = VisionTransformerMoCo( 145 | patch_size=mae_specs.PATCH_SIZE, embed_dim=mae_specs.EMBED_DIM, 146 | depth=mae_specs.DEPTH, num_heads=mae_specs.NUM_HEADS, mlp_ratio=mae_specs.MLP_RATIO, 147 | qkv_bias=mae_specs.QKV_BIAS, 148 | norm_layer=partial(nn.LayerNorm, eps=1e-6)) 149 | 150 | model_file = config.TEST.MODEL_FILE 151 | logging.info(f'=> load model file: {model_file}') 152 | 153 | if model_file.startswith('http'): 154 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 155 | else: 156 | checkpoint = torch.load(model_file, map_location="cpu") 157 | 158 | # rename moco pre-trained keys 159 | 
state_dict = checkpoint['state_dict'] 160 | for k in list(state_dict.keys()): 161 | if k.startswith('module.base_encoder'): 162 | state_dict[k[len("module.base_encoder."):]] = state_dict[k] 163 | del state_dict[k] 164 | elif k.startswith('module.'): 165 | state_dict[k[len("module."):]] = state_dict[k] 166 | del state_dict[k] 167 | 168 | incompatible = model.load_state_dict(state_dict, strict=False) 169 | 170 | if incompatible.missing_keys: 171 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 172 | if incompatible.unexpected_keys: 173 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 174 | 175 | return model 176 | -------------------------------------------------------------------------------- /vision_benchmark/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_optimizer 2 | 3 | __all__ = ['build_optimizer'] 4 | -------------------------------------------------------------------------------- /vision_benchmark/optim/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from timm.optim import create_optimizer 9 | 10 | def _is_depthwise(m): 11 | return ( 12 | isinstance(m, nn.Conv2d) 13 | and m.groups == m.in_channels 14 | and m.groups == m.out_channels 15 | ) 16 | 17 | 18 | def _set_wd(cfg, model): 19 | without_decay_list = cfg.TRAIN.WITHOUT_WD_LIST 20 | without_decay_depthwise = [] 21 | without_decay_norm = [] 22 | for m in model.modules(): 23 | if _is_depthwise(m) and 'depthwise' in without_decay_list: 24 | without_decay_depthwise.append(m.weight) 25 | elif isinstance(m, nn.BatchNorm2d) and 'bn' in without_decay_list: 26 | without_decay_norm.append(m.weight) 27 | without_decay_norm.append(m.bias) 28 | elif isinstance(m, nn.GroupNorm) and 'gn' in without_decay_list: 29 | without_decay_norm.append(m.weight) 30 | without_decay_norm.append(m.bias) 31 | elif isinstance(m, nn.LayerNorm) and 'ln' in without_decay_list: 32 | without_decay_norm.append(m.weight) 33 | without_decay_norm.append(m.bias) 34 | 35 | with_decay = [] 36 | without_decay = [] 37 | 38 | skip = {} 39 | if hasattr(model, 'no_weight_decay'): 40 | skip = model.no_weight_decay() 41 | 42 | for n, p in model.named_parameters(): 43 | ever_set = False 44 | 45 | if p.requires_grad is False: 46 | continue 47 | 48 | if n in skip: 49 | print('=> set {} wd to 0'.format(n)) 50 | without_decay.append(p) 51 | continue 52 | 53 | for pp in without_decay_depthwise: 54 | if p is pp: 55 | if cfg.VERBOSE: 56 | print('=> set depthwise({}) wd to 0'.format(n)) 57 | without_decay.append(p) 58 | ever_set = True 59 | break 60 | 61 | for pp in without_decay_norm: 62 | if p is pp: 63 | if cfg.VERBOSE: 64 | print('=> set norm({}) wd to 0'.format(n)) 65 | without_decay.append(p) 66 | ever_set = True 67 | break 68 | 69 | if ( 70 | (not ever_set) 71 | and 'bias' in without_decay_list 72 | and n.endswith('.bias') 73 | ): 74 | if cfg.VERBOSE: 75 | print('=> set bias({}) wd to 0'.format(n)) 76 | without_decay.append(p) 77 | elif not ever_set: 78 | with_decay.append(p) 79 | 80 | # assert (len(with_decay) + len(without_decay) == len(list(model.parameters()))) 81 | params = [ 82 | {'params': with_decay}, 83 | {'params': without_decay, 'weight_decay': 0.} 84 | ] 85 | return params 86 | 87 | 88 | 
def build_optimizer(cfg, model): 89 | if cfg.TRAIN.OPTIMIZER == 'timm': 90 | args = cfg.TRAIN.OPTIMIZER_ARGS 91 | 92 | print(f'=> usage timm optimizer args: {cfg.TRAIN.OPTIMIZER_ARGS}') 93 | optimizer = create_optimizer(args, model) 94 | 95 | return optimizer 96 | 97 | optimizer = None 98 | params = _set_wd(cfg, model) 99 | if cfg.TRAIN.OPTIMIZER == 'sgd': 100 | if cfg.TRAIN.TWO_LR: 101 | 102 | trunk_parameters = [] 103 | head_parameters = [] 104 | for name, param in model.named_parameters(): 105 | if 'backbone' in name: 106 | trunk_parameters.append(param) 107 | else: 108 | head_parameters.append(param) 109 | 110 | optimizer = optim.SGD( 111 | [{'params': trunk_parameters}, 112 | {'params': head_parameters, 'lr': cfg.TRAIN.LR }], 113 | lr=cfg.TRAIN.LR * 0.1, 114 | momentum=cfg.TRAIN.MOMENTUM, 115 | weight_decay=cfg.TRAIN.WD, 116 | nesterov=cfg.TRAIN.NESTEROV 117 | ) 118 | 119 | else: 120 | optimizer = optim.SGD( 121 | params, 122 | # filter(lambda p: p.requires_grad, model.parameters()), 123 | lr=cfg.TRAIN.LR, 124 | momentum=cfg.TRAIN.MOMENTUM, 125 | weight_decay=cfg.TRAIN.WD, 126 | nesterov=cfg.TRAIN.NESTEROV 127 | ) 128 | elif cfg.TRAIN.OPTIMIZER == 'adam': 129 | 130 | if cfg.TRAIN.TWO_LR: 131 | 132 | trunk_parameters = [] 133 | head_parameters = [] 134 | for name, param in model.named_parameters(): 135 | if 'backbone' in name: 136 | trunk_parameters.append(param) 137 | else: 138 | head_parameters.append(param) 139 | 140 | optimizer = optim.Adam( 141 | [{'params': trunk_parameters}, 142 | {'params': head_parameters, 'lr': cfg.TRAIN.LR}], 143 | lr=cfg.TRAIN.LR * 0.1, 144 | weight_decay=cfg.TRAIN.WD, 145 | ) 146 | else: 147 | optimizer = optim.Adam( 148 | params, 149 | # filter(lambda p: p.requires_grad, model.parameters()), 150 | lr=cfg.TRAIN.LR, 151 | weight_decay=cfg.TRAIN.WD, 152 | ) 153 | elif cfg.TRAIN.OPTIMIZER == 'adamW': 154 | optimizer = optim.AdamW( 155 | params, 156 | lr=cfg.TRAIN.LR, 157 | weight_decay=cfg.TRAIN.WD, 158 | ) 159 | elif cfg.TRAIN.OPTIMIZER == 'rmsprop': 160 | optimizer = optim.RMSprop( 161 | params, 162 | # filter(lambda p: p.requires_grad, model.parameters()), 163 | lr=cfg.TRAIN.LR, 164 | momentum=cfg.TRAIN.MOMENTUM, 165 | weight_decay=cfg.TRAIN.WD, 166 | alpha=cfg.TRAIN.RMSPROP_ALPHA, 167 | centered=cfg.TRAIN.RMSPROP_CENTERED 168 | ) 169 | 170 | return optimizer 171 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'caltech-101' 5 | ROOT: '../DATASET/caltech101-tf/' 6 | NUM_CLASSES: 102 7 | TEST: 8 | METRIC: 'mean-per-class' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/cifar10.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-10' 5 | ROOT: '../../DATASET/cifar10/' 6 | NUM_CLASSES: 10 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/cifar100.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-100' 5 | ROOT: '../DATASET/cifar100/' 6 | NUM_CLASSES: 100 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- 
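As a companion to the optimizer factory in vision_benchmark/optim/build.py above, here is a minimal, hypothetical sketch of the TRAIN fields that build_optimizer reads on the plain 'sgd' path; it is not part of the repository. The SimpleNamespace stand-in and the concrete hyperparameter values are assumptions for illustration only, and the real toolkit presumably builds its config object via vision_benchmark/config instead.

from types import SimpleNamespace

import torch.nn as nn

from vision_benchmark.optim import build_optimizer

# Hypothetical config: only the fields that build_optimizer/_set_wd read for the 'sgd' branch.
cfg = SimpleNamespace(
    VERBOSE=True,
    TRAIN=SimpleNamespace(
        OPTIMIZER='sgd',                        # 'timm', 'sgd', 'adam', 'adamW', or 'rmsprop'
        LR=0.01, MOMENTUM=0.9, WD=1e-4, NESTEROV=False,
        TWO_LR=False,                           # True would put backbone and head on separate learning rates
        WITHOUT_WD_LIST=['bn', 'ln', 'bias'],   # parameter groups excluded from weight decay
    ),
)

model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16), nn.Linear(16, 2))
optimizer = build_optimizer(cfg, model)  # SGD; LayerNorm weights/biases land in the weight_decay=0 group
print(optimizer)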
/vision_benchmark/resources/datasets/country211.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'country211' 4 | ROOT: '../DATASET/country211/' 5 | NUM_CLASSES: 211 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'dtd' 5 | ROOT: '../DATASET/dtd-v1/' 6 | NUM_CLASSES: 47 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/eurosat-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'eurosat_clip' 4 | ROOT: '../DATASET/eurosat_clip/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/fer2013.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'fer-2013' 5 | ROOT: '../DATASET/fer2013-v1/' 6 | NUM_CLASSES: 7 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/fgvc-aircraft-2013b.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'fgvc-aircraft-2013b-variants102' 6 | ROOT: '../DATASET/fgvc-aircraft-2013b-variants102/' 7 | NUM_CLASSES: 100 8 | TEST: 9 | METRIC: 'mean-per-class' 10 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/flower102.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-flower-102' 4 | ROOT: '../DATASET/flower102/' 5 | NUM_CLASSES: 102 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'food-101' 6 | ROOT: '../DATASET/food101/' 7 | NUM_CLASSES: 101 8 | TEST: 9 | METRIC: 'accuracy' 10 | DEBUG: 11 | DEBUG: false 12 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/gtsrb.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'gtsrb' 4 | ROOT: '../DATASET/gtsrb/' 5 | NUM_CLASSES: 43 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/hateful-memes.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'hateful-memes' 3 | ROOT: '../DATASET/hateful_memes/' 4 | NUM_CLASSES: 2 5 | TEST: 6 | METRIC: 'roc_auc' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/imagenet-1k.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'imagenet-1k' 3 | 
ROOT: '../DATASET/imagenet-1k/' 4 | TEST: 5 | METRIC: 'accuracy' 6 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/kitti-distance.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'kitti-distance' 4 | CENTER_CROP: false 5 | ROOT: '../DATASET/kitti_distance_20210923/' 6 | NUM_CLASSES: 4 7 | TEST: 8 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/mnist.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'mnist' 4 | ROOT: '../DATASET/mnist/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/oxford-iiit-pets.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-iiit-pets' 4 | ROOT: '../DATASET/pet37/' 5 | NUM_CLASSES: 37 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/patchcamelyon.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'patch-camelyon' 5 | ROOT: '../DATASET/patchcamelyon/' 6 | NUM_CLASSES: 2 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/rendered-sst2.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'rendered-sst2' 4 | ROOT: '../DATASET/rendered-sst2/' 5 | NUM_CLASSES: 2 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/resisc45-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'resisc45_clip' 4 | ROOT: '../DATASET/resisc45_clip/' 5 | NUM_CLASSES: 45 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/stanfordcar.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'stanford-cars' 5 | ROOT: '../DATASET/stanfordcars/' 6 | NUM_CLASSES: 196 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/voc2007classification.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'voc-2007-classification' 4 | ROOT: '../DATASET/voc2007/' 5 | NUM_CLASSES: 20 6 | TEST: 7 | METRIC: '11point_mAP' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", 
"object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "automobile", "def_wiki": "A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. A car or motorcar.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "deer", "def_wiki": "A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", "path_wn": ["deer", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "distinguished from Bovidae by the male's having solid deciduous antlers"}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "frog", "def_wiki": "A small tailless amphibian of the order Anura that typically hops.", "path_wn": ["frog", "amphibian", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "any of various tailless stout-bodied amphibians with long hind limbs for leaping; semiaquatic and terrestrial species"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "ship", "def_wiki": "A water-borne vessel generally larger than a boat.", "path_wn": ["ship", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vessel that carries passengers or freight"}, 
{"classname": "truck", "def_wiki": "A small wheel or roller, specifically the wheel of a gun carriage.", "path_wn": ["truck", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an automotive vehicle suitable for hauling"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/eurosat_clip_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "forest", "def_wiki": "A dense uncultivated tract of trees and undergrowth, larger than woods.", "path_wn": ["forest", "vegetation", "collection", "group", "abstraction", "entity"], "def_wn": "the trees and other plants in a large densely wooded area"}, {"classname": "brushland or shrubland", "def_wiki": "Land that is covered mostly with shrubs.", "path_wn": "", "def_wn": ""}, {"classname": "highway or road", "def_wiki": "A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard: a country road is the same as a country lane.", "path_wn": "", "def_wn": ""}, {"classname": "industrial buildings or commercial buildings", "def_wiki": "The act or process by which something is built; construction.", "path_wn": "", "def_wn": ""}, {"classname": "pasture land", "def_wiki": "land used for grazing animals", "path_wn": "", "def_wn": ""}, {"classname": "permanent crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "residential buildings or homes or apartments", "def_wiki": "A complete domicile occupying only part of a building, especially one for rent; a flat.", "path_wn": "", "def_wn": ""}, {"classname": "river", "def_wiki": "A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", "path_wn": ["river", "stream", "body_of_water", "thing", "physical_entity", "entity"], "def_wn": "a large natural stream of water (larger than a creek)"}, {"classname": "lake or sea", "def_wiki": "A large body of salt water.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/fer-2013_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "def_wiki": "Displaying or feeling anger.", "path_wn": ["angry"], "def_wn": "feeling or showing anger"}, {"classname": "disgusted", "def_wiki": "Filled with disgust.", "path_wn": ["disgust", "dislike", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "fill with distaste"}, {"classname": "fearful", "def_wiki": "Frightening.", "path_wn": ["fearful"], "def_wn": "experiencing or showing fear"}, {"classname": "happy", "def_wiki": "Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", "path_wn": ["happy"], "def_wn": "enjoying or showing or marked by joy or pleasure"}, {"classname": "neutral", "def_wiki": "Not taking sides in a 
conflict such as war; nonaligned.", "path_wn": ["neutral", "person", "causal_agent", "physical_entity", "entity"], "def_wn": "one who does not side with any party in a war or dispute"}, {"classname": "sad", "def_wiki": "Emotionally negative.", "path_wn": ["sad"], "def_wn": "experiencing or showing sorrow or unhappiness; ; - Christina Rossetti"}, {"classname": "surprised", "def_wiki": "Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", "path_wn": ["surprise", "astonishment", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "cause to be surprised"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/hateful-memes_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": ["meme", "acculturation", "content", "cognition", "psychological_feature", "abstraction", "entity"], "def_wn": "a cultural unit (an idea or value or pattern of behavior) that is passed from one person to another by non-genetic means (as by imitation)"}, {"classname": "hatespeech meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/kitti-distance_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "def_wiki": "The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car nearby.", "def_wiki": "adjacent, near, close by", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car in the distance.", "def_wiki": "far away; a long distance away", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with no car.", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/mnist_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "def_wiki": "0.", "path_wn": ["nothing", "relative_quantity", "measure", "abstraction", "entity"], "def_wn": "a mathematical element that when added to another number yields the same number"}, {"classname": "1", "def_wiki": "The number one (1).", "path_wn": ["one", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the smallest whole number or a numeral representing this number"}, {"classname": "2", "def_wiki": "A particle used for marking the following verb as an infinitive.", "path_wn": ["two", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one 
or a numeral representing this number"}, {"classname": "3", "def_wiki": null, "path_wn": ["three", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one and one"}, {"classname": "4", "def_wiki": "Because, as, since.", "path_wn": ["four", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of three and one"}, {"classname": "5", "def_wiki": null, "path_wn": ["five", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of four and one"}, {"classname": "6", "def_wiki": "MI6; the agency or a particular agent.", "path_wn": ["six", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of five and one"}, {"classname": "7", "def_wiki": null, "path_wn": ["seven", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of six and one"}, {"classname": "8", "def_wiki": "To ingest; to be ingested.", "path_wn": ["eight", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of seven and one"}, {"classname": "9", "def_wiki": null, "path_wn": ["nine", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of eight and one"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/patch-camelyon_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "def_wiki": "Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue filled with lymphocytes and macrophages that collect and destroy bacteria, viruses and foreign matter from lymph. 
When the body is fighting an infection, these lymphocytes multiply rapidly and produce a characteristic swelling of the lymph nodes.", "path_wn": "", "def_wn": ""}, {"classname": "lymph node containing metastatic tumor tissue", "def_wiki": "Thin, woven, gauze-like fabric.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/rendered-sst2_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "def_wiki": "Not positive nor neutral.", "path_wn": ["negative", "denial", "speech_act", "act", "event", "psychological_feature", "abstraction", "entity"], "def_wn": "a reply of denial"}, {"classname": "positive", "def_wiki": "Not negative or neutral.", "path_wn": ["positive", "adjective", "modifier", "content_word", "word", "language_unit", "part", "relation", "abstraction", "entity"], "def_wn": "the primary form of an adjective or adverb; denotes a quality without qualification, comparison, or relation to increase or diminution"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/voc-2007-classification_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "aeroplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "bicycle", "def_wiki": "A vehicle that has two wheels, one behind the other, a steering handle, and a saddle seat or seats and is usually propelled by the action of a rider\u2019s feet upon pedals.", "path_wn": ["bicycle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a wheeled vehicle that has two wheels and is moved by foot pedals"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "boat", "def_wiki": "A craft used for transportation of goods, fishing, racing, recreational cruising, or military use on or in the water, propelled by oars or outboard motor or inboard motor or by wind.", "path_wn": ["boat", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a small vessel for travel on water"}, {"classname": "bottle", "def_wiki": "A container, typically made of glass or plastic and having a tapered neck, used primarily for holding liquids.", "path_wn": ["bottle", "vessel", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a glass or plastic vessel used for storing drinks or other liquids; typically cylindrical without handles and with a narrow neck that can be plugged or capped"}, {"classname": "bus", "def_wiki": "A motor vehicle for transporting 
large numbers of people along roads.", "path_wn": ["bus", "public_transport", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vehicle carrying many passengers; used for public transport"}, {"classname": "car", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "chair", "def_wiki": "An item of furniture used to sit on or in, comprising a seat, legs or wheels, back, and sometimes arm rests, for use by one person. Compare stool, couch, sofa, settee, loveseat and bench.", "path_wn": ["chair", "seat", "furniture", "furnishing", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a seat for one person, with a support for the back"}, {"classname": "cow", "def_wiki": "An adult female of the species Bos taurus, especially one that has calved.", "path_wn": ["cow", "cattle", "bovine", "bovid", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "female of domestic cattle:"}, {"classname": "diningtable", "def_wiki": null, "path_wn": "", "def_wn": ""}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "motorbike", "def_wiki": "A motorcycle.", "path_wn": ["minibike", "motorcycle", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "small motorcycle with a low frame and small wheels and elevated handlebars"}, {"classname": "person", "def_wiki": "An individual; usually a human being. 
[from 13th c.]\n\nEach person is unique, both mentally and physically.", "path_wn": ["person", "causal_agent", "physical_entity", "entity"], "def_wn": "a human being"}, {"classname": "pottedplant", "def_wiki": null, "path_wn": "", "def_wn": ""}, {"classname": "sheep", "def_wiki": "A woolly ruminant of the genus Ovis.", "path_wn": ["sheep", "bovid", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "woolly usually horned ruminant mammal related to the goat"}, {"classname": "sofa", "def_wiki": "A raised area of a building's floor, usually covered with carpeting, used for sitting.", "path_wn": ["sofa", "seat", "furniture", "furnishing", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an upholstered seat for more than one person"}, {"classname": "train", "def_wiki": "Elongated portion.", "path_wn": ["train", "public_transport", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "public transport provided by a line of railway cars coupled together and drawn by a locomotive"}, {"classname": "tvmonitor", "def_wiki": null, "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_cifar-10.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "gpt3": [" A machine that is used for flying through the air.", " A vehicle that flies through the air under its own power.", " A machine that is used for flying through the air.", " Any of various heavier-than-air craft that travel by means of aerodynamic lift, as opposed to lighter-than-air craft such as balloons.", " A heavier-than-air craft that derives its lift from aerodynamic forces and that depends on its engine for propulsion."]}, {"classname": "automobile", "gpt3": [" A wheeled vehicle that runs on land and is propelled by an engine.", " A motor vehicle.", " A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. 
A car or motorcar.", " A motor vehicle with four wheels; usually propelled by an internal-combustion engine.", " A motor vehicle with four wheels; typically propelled by an internal combustion engine."]}, {"classname": "bird", "gpt3": [" A warm-blooded vertebrate with wings and feathers and a beak, and able to fly.", " Any of numerous warm-blooded egg-laying vertebrates of the class Aves, having a body covered with feathers and forelimbs modified into wings.", " A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", " A warm-blooded vertebrate with wings, feathers, a beak, and usually the power of flight.", " A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs."]}, {"classname": "cat", "gpt3": [" Any of numerous carnivorous mammals of the family Felidae, having thick soft fur, a short snout, and usually sharp retractile claws.", " A small domesticated carnivorous mammal, Felis catus, having thick soft fur, a short snout, and retractile claws.", " Any of numerous carnivorous mammals of the family Felidae, having thick soft fur, a short tail, and usually sharp retractile claws.", " A small domesticated carnivorous mammal with soft fur, a short snout, and retractile claws.", " A small domesticated carnivorous mammal with soft fur, a short snout, and retractile claws."]}, {"classname": "deer", "gpt3": [" Any of various ruminants of the family Cervidae, such as the red deer or elk, having antlers in the male.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla."]}, {"classname": "dog", "gpt3": [" A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A member of the genus \"Canis\" (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds; has a long association with humans; and is widely kept as a pet or for work"]}, {"classname": "frog", "gpt3": [" Any of numerous tailless stout-bodied amphibians of the order Anura, having a short broad head, short limbs, and webbed toes.", " Any of numerous tailless stout-bodied amphibians with long hind legs for leaping, short forelegs for hopping, and a long, tapering, usually pointed head.", " Any of numerous tailless stout-bodied amphibians with long hind legs adapted for leaping, 
found in a wide variety of habitats.", " A small tailless amphibian of the order Anura that typically hops.", " A small tailless amphibian of the order Anura that typically hops."]}, {"classname": "horse", "gpt3": [" A hoofed mammal of the family Equidae, related to the ass, having a thick, shaggy coat, a long head, and a long, slender, muscular neck.", " A hoofed mammal of the genus \"Equus\", domesticated since prehistoric times.", " A hoofed mammal of the genus Equus, domesticated since prehistoric times.", " A large hoofed mammal, Equus caballus, domesticated since prehistoric times, having a thick, shaggy coat, a heavy body, and a large head with a short, pointed muzzle.", " A hoofed mammal of the family Equidae, related to the zebra."]}, {"classname": "ship", "gpt3": [" A water-borne vessel generally larger than a boat.", " A water-borne vessel generally larger than a boat.", " A large vessel for transporting goods or passengers, typically ocean-going.", " A large vessel for transporting goods or passengers.", " A large vessel for transporting goods or passengers, typically ocean-going."]}, {"classname": "truck", "gpt3": [" A motor vehicle with a large carrying capacity, typically having a boxlike body and a relatively small passenger compartment.", " A motor vehicle with a large carrying capacity, typically having a long wheelbase and an enclosed body.", " A motor vehicle with a large carrying capacity, typically having a boxlike body and two rear wheels.", " A motor vehicle with a large carrying capacity, typically having a boxlike body and two rear wheels.", " A motor vehicle with a large carrying space, typically having a boxlike body and a separate cab."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "gpt3": [" arable land", " arable land", " land used for growing crops that are harvested once a year", " Land that is used to grow crops for one year.", " arable land"]}, {"classname": "forest", "gpt3": [" a large area of land covered with trees", " land covered with trees", " a tract of land covered with trees and underbrush", " A large area of land covered with trees.", " A tract of land covered with trees and undergrowth, larger than woods."]}, {"classname": "brushland or shrubland", "gpt3": [" A land area covered with low-growing woody plants, such as bushes, small trees, and shrubs.", " Land that is covered mostly with shrubs.", " land covered with bushes, shrubs, and small trees.", " land covered with low, scrubby vegetation", " land covered with low, scrubby vegetation, especially thorny bushes."]}, {"classname": "highway or road", "gpt3": [" A way or course for the passage of vehicles, persons, and merchandise, usually including paved or graveled surface, curbs, and usually sidewalks.", " a way (usually public) for the transportation of people or goods", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard", " A way or path for the travel of people or vehicles."]}, {"classname": "industrial buildings or commercial buildings", "gpt3": [" The act or process by which something is built; construction.", " A building or group of buildings where goods are produced, processed or manufactured, or where services are provided.", " A building or group of buildings where goods are manufactured, processed or repaired, or goods and services are sold.", " The act or process by which something is built; construction.", " Buildings used for manufacturing or storing and selling goods."]}, {"classname": "pasture land", "gpt3": [" grassland used for grazing livestock", " Land used for grazing.", " land used for grazing animals", " Land used for grazing.", " Land used for grazing."]}, {"classname": "permanent crop land", "gpt3": [" land used for growing crops that are not harvested annually", " arable land", " Land that is used for growing crops that are not intended for sale or for feeding livestock.", " Land used for growing crops that are not intended for harvest in the current year.", " land used for growing crops that are not harvested annually, such as trees, shrubs, and vines."]}, {"classname": "residential buildings or homes or apartments", "gpt3": [" A complete domicile occupying only part of a building, especially one for rent; a flat.", " A building or group of buildings containing a number of homes.", " The act or process by which something is built; construction.", " buildings where people live", " A complete domicile occupying only part of a building, especially one for rent; a flat."]}, {"classname": "river", "gpt3": [" A large natural stream of water flowing in a channel to the sea, a lake, or another river.", " a natural stream of water larger than a creek", " A natural stream of water of considerable volume flowing in a definite course from higher to lower ground.", " A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", " A large natural stream of water (larger than a creek) flowing in a channel on the surface of the earth."]}, {"classname": "lake or sea", "gpt3": [" A large body of salt water.", " a large body of water surrounded by land", " A large body of water surrounded by land.", " A large body of water surrounded by land.", " A large body of water surrounded by land."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_fer-2013.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "gpt3": [" Feeling or showing anger.", " Displaying or feeling anger.", " Feeling or showing anger; wrathful; irate.", " Displaying or feeling anger.", " A feeling of displeasure and antagonism aroused by a wrong; wrath; ire; rage."]}, {"classname": "disgusted", "gpt3": [" Having a feeling of loathing or revulsion; repulsed.", " Feeling or displaying disgust.", " Affected with a feeling of loathing or repugnance; having a feeling of extreme repugnance or aversion.", " Filled with disgust.", " Feeling or showing a strong 
dislike of something."]}, {"classname": "fearful", "gpt3": [" Causing fear.", " Causing fear.", " Frightening.", " Frightening.", " Causing fear or dread; frightening."]}, {"classname": "happy", "gpt3": [" Emotionally positive.", " Feeling or showing pleasure and satisfaction.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous."]}, {"classname": "neutral", "gpt3": [" Not showing any emotion.", " Having no particular feeling or interest; not caring one way or the other; indifferent.", " Not taking sides in a conflict such as war; nonaligned.", " Neither positive nor negative.", " Neither good nor bad."]}, {"classname": "sad", "gpt3": [" Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; gloomy; dismal; dreary; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; melancholy; dismal; gloomy; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; d"]}, {"classname": "surprised", "gpt3": [" Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Astonished; astounded; amazed; dumbfounded; flabbergasted; flummoxed; flabbergasted; thunderstruck; dumbstruck; thunderstruck; dumbstruck; thunderstruck; dumbstruck;", " Surprised.", " Surprised."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_hateful-memes.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}, {"classname": "hatespeech meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " 
Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_kitti-distance.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "gpt3": [" The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", " beside, next to, by", " a wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project."]}, {"classname": "a photo i took with a car nearby.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " near; close", " Nearby; close by; close at hand; close to hand; close by; close to; close on; close to at hand; close to hand; close by; close to; close on; close to at hand; close to hand;"]}, {"classname": "a photo i took with a car in the distance.", "gpt3": [" far away; a long distance away", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " far away; a long distance away", " distant, far away, far off", " far away; a long distance away"]}, {"classname": "a photo i took with no car.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car; no cars", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_mnist.tsv: -------------------------------------------------------------------------------- 1 | 
[{"classname": "0", "gpt3": [" 0.", " The number zero (0).", " A particle used for marking the following verb as an infinitive.", " 0.", " To be in a state of confusion."]}, {"classname": "1", "gpt3": [" 1.", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The act of ingesting.", " The number one (1)."]}, {"classname": "2", "gpt3": [" A particle used for marking the following verb as an infinitive.", " The number two (2).", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The first person to visit the moon."]}, {"classname": "3", "gpt3": [" 0.", " The CIA.", " Because, as, since.", " A particle used for marking the following verb as an infinitive.", " Because, as, since."]}, {"classname": "4", "gpt3": [" The number four (4).", " The first of the four basic operations of arithmetic, that is, the operation of finding the remainder when one number is divided by another.", " Because, as, since.", " A type of small, flat, round cake.", " To be in a state of disrepair."]}, {"classname": "5", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " A kind of animal."]}, {"classname": "6", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be able to.", " The first person to be killed in the novel.", " The number six (6).", " Because, as, since."]}, {"classname": "7", "gpt3": [" To be in a state of disrepair.", " To be ingested.", " To be in a state of confusion.", " To be in a state of being.", " Because, as, since."]}, {"classname": "8", "gpt3": [" To ingest; to be ingested.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " Because, as, since."]}, {"classname": "9", "gpt3": [" The number nine (9).", " A particle used for marking the preceding noun as a subject.", " The number nine (9).", " To be in a state of being.", " The first person to do something."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_patch-camelyon.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "gpt3": [" A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infection by trapping ger", " Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue", " Thin, woven, gauze-like fabric.", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. 
They help the body fight infections by trapping ger"]}, {"classname": "lymph node containing metastatic tumor tissue", "gpt3": [" Thin, woven, gauze-like fabric.", " Thin, woven, gauze-like fabric.", " A small, oval, soft, elastic body of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of", " Thin, woven, gauze-like fabric.", " A small, oval, solid organ of the lymphatic system, distributed along the lymphatic vessels, that acts as a filter for bacteria, viruses, and foreign matter."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_rendered-sst2.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "gpt3": [" Not positive or neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral."]}, {"classname": "positive", "gpt3": [" Not negative or neutral.", " Not negative or neutral.", " Not negative nor neutral.", " Not negative nor neutral.", " Not negative or neutral."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/model/clip_example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_example 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' 16 | # Following configuration is needed for CLIP model. 17 | SPEC: 18 | TEXT: 19 | TOKENIZER: clip 20 | STYLE: clip 21 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/clip_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_swin 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K_YFCC15M' 15 | CREATION_TIME: '2021-10-27' 16 | # Following configuration is needed for CLIP model. 17 | PRETRAINED: '' 18 | PRETRAINED_LAYERS: ['*'] 19 | SPEC: 20 | EMBED_DIM: 512 21 | GATHER_TENSORS: True 22 | TEXT: 23 | TOKENIZER: clip 24 | CONTEXT_LENGTH: 77 25 | WIDTH: 512 26 | HEADS: 8 27 | LAYERS: 12 28 | VISION: 29 | PATCH_SIZE: 4 30 | IN_CHANS: 3 31 | EMBED_DIM: 96 32 | DEPTHS: [2, 2, 6, 2] 33 | NUM_HEADS: [3, 6, 12, 24] 34 | WINDOW_SIZE: 7 35 | MLP_RATIO: 4. 36 | QKV_BIAS: True 37 | APE: False 38 | PATCH_NORM: True 39 | DROP_RATE: 0.0 40 | DROP_PATH_RATE: 0.0 41 | 42 | KNOWLEDGE: 43 | WORDNET: 44 | USE_HIERARCHY: False # False 45 | USE_DEFINITION: False # True 46 | 47 | # DATASET: 48 | # DATASET: 'imagenet' 49 | # ROOT: ../../data/zeroshot/classification/imagenet 50 | OUTPUT_DIR: /home/chunyl/azure_mount/chunyleu_output/cvinwild/ic_benchmark/debug/swin_tiny/unicl_imagenet21k 51 | # ../../output/hcl_exp/hcl_yfcc15m_half_imagenet22k_half/wordnet_h_true_d_false 52 | TEST: 53 | MODEL_FILE: '/home/chunyl/azure_mount/chunyleu_output/ckpts/benchmark/swin_tiny/unicl_imagenet21k/model_state_dict.pt' 54 | BATCH_SIZE_PER_GPU: 128 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 
63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 69 | 70 | # hcl_imagenet_21k_wiki 71 | # hcl_imagenet21k 72 | # hcl_yfcc15m_half_imagenet21k_half_multitask 73 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_half_imagenet22k_half/model_state_dict.pt' 74 | 75 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k_multitask/model_state_dict.pt' 76 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k/model_state_dict.pt' 77 | 78 | # hcl_imagenet22k hcl_yfcc15m hcl_yfcc15m_half_imagenet21k_half hcl_yfcc15m_half_imagenet22k_half hcl_yfcc15m_imagenet21k hcl_yfcc15m_imagenet22k hcl_yfcc15m_imagenet22k_multitask 79 | # hcl_imagenet1k 80 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/deit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/DEIT_BASE_PATCH16_224/' 3 | 4 | MODEL: 5 | NAME: deit_base_patch16_224 6 | NUM_PARAMS_IN_M: 86.5 7 | AUTHOR: 'timm' 8 | PRETRAINED_DATA: 'ImageNet1K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | 13 | TEST: 14 | BATCH_SIZE_PER_GPU: 128 15 | MODEL_FILE: '' 16 | 17 | TRAIN: 18 | BATCH_SIZE_PER_GPU: 64 19 | BEGIN_EPOCH: 0 20 | END_EPOCH: 10 21 | EXTRA_FINAL_TRAIN_EPOCH: 40 22 | OPTIMIZER: sgd 23 | WD: 0. 24 | MOMENTUM: 0.9 25 | NESTEROV: false 26 | SHUFFLE: true 27 | LR_SCHEDULER: 28 | METHOD: 'WarmupCosine' 29 | WARMUP_EPOCH: 5 30 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: cls_example 12 | NUM_PARAMS_IN_M: 11 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' -------------------------------------------------------------------------------- /vision_benchmark/resources/model/mae_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mae_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 
30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/mocov3_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mocov3_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/moco-v3/vit-b-300ep/vit-b-300ep.pth.tar' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitG14_OpenCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 1280 17 | USE_QUICK_GELU: False 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 14 21 | WIDTH: 1664 22 | LAYERS: 48 23 | HEADS: 16 24 | MLP_RATIO: 4.9231 25 | USE_RCP_BLOCK: true 26 | TEXT: 27 | TOKENIZER: clip 28 | STYLE: clip 29 | CONTEXT_LENGTH: 77 30 | WIDTH: 1280 31 | HEADS: 20 32 | LAYERS: 32 33 | USE_RCP_BLOCK: False 34 | RCP_BLOCK: 35 | MODE: gated_attn 36 | GUMBEL_SAMPLE: False 37 | USE_LAST_K: 12 38 | 39 | TEST: 40 | BATCH_SIZE_PER_GPU: 128 41 | MODEL_FILE: 'hf:react-vl/react-in1k:openclip-vit-bigG-14-gated-image-laion2b.pt' 42 | 43 | TRAIN: 44 | BATCH_SIZE_PER_GPU: 64 45 | BEGIN_EPOCH: 0 46 | END_EPOCH: 10 47 | EXTRA_FINAL_TRAIN_EPOCH: 40 48 | OPTIMIZER: sgd 49 | WD: 0. 50 | MOMENTUM: 0.9 51 | NESTEROV: false 52 | SHUFFLE: true 53 | LR_SCHEDULER: 54 | METHOD: 'WarmupCosine' 55 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model.
15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 768 21 | LAYERS: 12 22 | USE_RCP_BLOCK: true 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | USE_RCP_BLOCK: false 32 | RCP_BLOCK: 33 | MODE: gated_attn 34 | GUMBEL_SAMPLE: False 35 | USE_LAST_K: 6 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-base-16-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0. 48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 768 21 | LAYERS: 12 22 | USE_RCP_BLOCK: true 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | USE_RCP_BLOCK: false 32 | RCP_BLOCK: 33 | MODE: gated_attn 34 | GUMBEL_SAMPLE: False 35 | USE_LAST_K: 6 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-base-32-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0. 48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitl14_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 768 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 14 20 | WIDTH: 1024 21 | LAYERS: 24 22 | USE_RCP_BLOCK: True 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | WIDTH: 768 28 | HEADS: 12 29 | LAYERS: 12 30 | USE_RCP_BLOCK: False 31 | RCP_BLOCK: 32 | MODE: gated_attn 33 | USE_LAST_K: 6 34 | USE_FFN: True 35 | WIDTH: 1024 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-large-14-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0.
48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/VIT_BASE_PATCH16_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch16_224 10 | NUM_PARAMS_IN_M: 86.5 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vit_base_patch32_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VIT_BASE_PATCH32_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch32_224 10 | NUM_PARAMS_IN_M: 88.2 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/16' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_DeCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 3072 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 3072 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 3072 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0.
59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_DeCLIP_YFCC15M.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_yfcc_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 512 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_YFCC15M_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_FILIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'filip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'FILIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model.
15 | SPEC: 16 | EMBED_DIM: 768 17 | DENSE_EVAL: true 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 32 21 | WIDTH: 384 22 | LAYERS: 12 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | SKIP_TOKENIZE: true 32 | DECLIP: 33 | image_encode: 34 | embed_dim: 768 35 | text_encode: 36 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 37 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 38 | text_model_utils: 39 | random: False 40 | freeze: False 41 | embed_dim: 768 42 | clip: 43 | mask_rate: 0.5 44 | patch_number: 14 45 | use_allgather: False 46 | text_mask_type: MLM 47 | return_nn_bank: False 48 | return_dense: True 49 | feature_dim: 768 50 | select_topk: True 51 | 52 | TEST: 53 | BATCH_SIZE_PER_GPU: 128 54 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/FILIP_YFCC15M_vitb32.pth.tar' 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_SLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'slip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'SLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | clip: 42 | use_allgather: False 43 | return_sim: True 44 | feature_dim: 768 45 | sim_dim: 256 46 | 47 | TEST: 48 | BATCH_SIZE_PER_GPU: 128 49 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/SLIP_YFCC15M_vitb32.pth.tar' 50 | 51 | TRAIN: 52 | BATCH_SIZE_PER_GPU: 64 53 | BEGIN_EPOCH: 0 54 | END_EPOCH: 10 55 | EXTRA_FINAL_TRAIN_EPOCH: 40 56 | OPTIMIZER: sgd 57 | WD: 0. 58 | MOMENTUM: 0.9 59 | NESTEROV: false 60 | SHUFFLE: true 61 | LR_SCHEDULER: 62 | METHOD: 'WarmupCosine' 63 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .comm import comm 2 | from .utils import create_logger 3 | 4 | __all__ = ['comm', 'create_logger'] 5 | -------------------------------------------------------------------------------- /vision_benchmark/utils/comm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains primitives for multi-gpu communication. 3 | This is useful when doing distributed training.
4 | """ 5 | 6 | import pickle 7 | 8 | import torch 9 | import torch.distributed as dist 10 | 11 | 12 | class Comm(object): 13 | def __init__(self): 14 | self.local_rank = 0 15 | 16 | @property 17 | def world_size(self): 18 | if not dist.is_available(): 19 | return 1 20 | if not dist.is_initialized(): 21 | return 1 22 | return dist.get_world_size() 23 | 24 | @property 25 | def rank(self): 26 | if not dist.is_available(): 27 | return 0 28 | if not dist.is_initialized(): 29 | return 0 30 | return dist.get_rank() 31 | 32 | @property 33 | def local_rank(self): 34 | if not dist.is_available(): 35 | return 0 36 | if not dist.is_initialized(): 37 | return 0 38 | return self._local_rank 39 | 40 | @local_rank.setter 41 | def local_rank(self, value): 42 | if not dist.is_available(): 43 | self._local_rank = 0 44 | if not dist.is_initialized(): 45 | self._local_rank = 0 46 | self._local_rank = value 47 | 48 | @property 49 | def head(self): 50 | return 'Rank[{}/{}]'.format(self.rank, self.world_size) 51 | 52 | def is_main_process(self): 53 | return self.rank == 0 54 | 55 | def synchronize(self): 56 | """ 57 | Helper function to synchronize (barrier) among all processes when 58 | using distributed training 59 | """ 60 | if self.world_size == 1: 61 | return 62 | dist.barrier() 63 | 64 | 65 | comm = Comm() 66 | 67 | 68 | def all_gather(data): 69 | """ 70 | Run all_gather on arbitrary picklable data (not necessarily tensors) 71 | Args: 72 | data: any picklable object 73 | Returns: 74 | list[data]: list of data gathered from each rank 75 | """ 76 | world_size = comm.world_size 77 | if world_size == 1: 78 | return [data] 79 | 80 | # serialized to a Tensor 81 | buffer = pickle.dumps(data) 82 | storage = torch.ByteStorage.from_buffer(buffer) 83 | tensor = torch.ByteTensor(storage).to("cuda") 84 | 85 | # obtain Tensor size of each rank 86 | local_size = torch.LongTensor([tensor.numel()]).to("cuda") 87 | size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] 88 | dist.all_gather(size_list, local_size) 89 | size_list = [int(size.item()) for size in size_list] 90 | max_size = max(size_list) 91 | 92 | # receiving Tensor from all ranks 93 | # we pad the tensor because torch all_gather does not support 94 | # gathering tensors of different shapes 95 | tensor_list = [] 96 | for _ in size_list: 97 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 98 | if local_size != max_size: 99 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 100 | tensor = torch.cat((tensor, padding), dim=0) 101 | dist.all_gather(tensor_list, tensor) 102 | 103 | data_list = [] 104 | for size, tensor in zip(size_list, tensor_list): 105 | buffer = tensor.cpu().numpy().tobytes()[:size] 106 | data_list.append(pickle.loads(buffer)) 107 | 108 | return data_list 109 | 110 | 111 | def reduce_dict(input_dict, average=True): 112 | """ 113 | Args: 114 | input_dict (dict): all the values will be reduced 115 | average (bool): whether to do average or sum 116 | Reduce the values in the dictionary from all processes so that process with rank 117 | 0 has the averaged results. Returns a dict with the same fields as 118 | input_dict, after reduction. 
119 | """ 120 | world_size = comm.world_size 121 | if world_size < 2: 122 | return input_dict 123 | with torch.no_grad(): 124 | names = [] 125 | values = [] 126 | # sort the keys so that they are consistent across processes 127 | for k in sorted(input_dict.keys()): 128 | names.append(k) 129 | values.append(input_dict[k]) 130 | values = torch.stack(values, dim=0) 131 | dist.reduce(values, dst=0) 132 | if dist.get_rank() == 0 and average: 133 | # only main process gets accumulated, so only divide by 134 | # world_size in this case 135 | values /= world_size 136 | reduced_dict = {k: v for k, v in zip(names, values)} 137 | return reduced_dict 138 | 139 | 140 | def gather_tensors(tensor): 141 | """ 142 | Performs all_gather operation on the provided tensors. 143 | *** Warning ***: torch.distributed.all_gather has no gradient. 144 | """ 145 | tensors_gather = [ 146 | torch.ones_like(tensor) 147 | for _ in range(comm.world_size) 148 | ] 149 | 150 | dist.all_gather(tensors_gather, tensor, async_op=False) 151 | # need to do this to restore propagation of the gradients 152 | tensors_gather[comm.rank] = tensor 153 | output = torch.cat(tensors_gather, dim=0) 154 | return output 155 | -------------------------------------------------------------------------------- /vision_benchmark/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pathlib import Path 6 | 7 | import os 8 | import logging 9 | import time 10 | 11 | from .comm import comm 12 | 13 | 14 | def setup_logger(final_output_dir, rank, phase): 15 | time_str = time.strftime('%Y-%m-%d-%H-%M') 16 | log_file = f'{phase}_{time_str}_rank{rank}.txt' 17 | final_log_file = os.path.join(final_output_dir, log_file) 18 | head = "%(asctime)-15s:[P:%(process)d]:" + comm.head + ' %(message)s' 19 | logging.basicConfig( 20 | filename=str(final_log_file), format=head 21 | ) 22 | logger = logging.getLogger() 23 | logger.setLevel(logging.INFO) 24 | console = logging.StreamHandler() 25 | console.setFormatter( 26 | logging.Formatter(head) 27 | ) 28 | logging.getLogger('').addHandler(console) 29 | 30 | 31 | def create_logger(cfg, phase='train'): 32 | root_output_dir = Path(cfg.OUTPUT_DIR) 33 | dataset = cfg.DATASET.DATASET 34 | cfg_name = cfg.NAME 35 | 36 | final_output_dir = root_output_dir / dataset / cfg_name 37 | 38 | print('=> creating {} ...'.format(root_output_dir)) 39 | root_output_dir.mkdir(parents=True, exist_ok=True) 40 | print('=> creating {} ...'.format(final_output_dir)) 41 | final_output_dir.mkdir(parents=True, exist_ok=True) 42 | 43 | print('=> setup logger ...') 44 | setup_logger(final_output_dir, cfg.RANK, phase) 45 | 46 | return str(final_output_dir) 47 | 48 | --------------------------------------------------------------------------------
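
The GPT3_*.tsv knowledge files above each store their class descriptions as a single JSON array on one line (a list of {"classname": ..., "gpt3": [...]} records) rather than as tab-separated rows. Below is a minimal, illustrative sketch of how such a file could be loaded into a classname-to-definitions mapping; it is not the benchmark's own loader, and the helper name and example path are placeholders chosen for this sketch.

import json
from pathlib import Path


def load_gpt3_knowledge(tsv_path):
    """Map each classname to its list of GPT-3 generated definitions."""
    # Despite the .tsv extension, the whole file is one JSON array.
    records = json.loads(Path(tsv_path).read_text(encoding="utf-8").strip())
    # The raw definitions carry a leading space, so strip them.
    return {r["classname"]: [d.strip() for d in r["gpt3"]] for r in records}


if __name__ == "__main__":
    # Example path following the repository layout shown above.
    knowledge = load_gpt3_knowledge(
        "vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv"
    )
    for classname, definitions in knowledge.items():
        print(f"{classname}: {definitions[0]}")

Such a mapping could, for example, be used to append a generated definition to each class name when composing zero-shot text prompts.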