├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── run.sh ├── run_gpt3.sh ├── run_multi.sh ├── setup.py ├── submission_file_readme.md ├── tox.ini └── vision_benchmark ├── __init__.py ├── commands ├── __init__.py ├── extract_gpt3_knowledge.py ├── finetune.py ├── linear_probe.py ├── prepare_submit.py └── zeroshot.py ├── common ├── __init__.py ├── constants.py ├── data_class_base.py ├── prediction_submission.py └── utils.py ├── config ├── __init__.py ├── default.py └── models.py ├── datasets ├── __init__.py ├── bpe_simple_vocab_16e6.txt.gz ├── hfpt_tokenizer.py ├── languages │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── build.py │ ├── hfpt_tokenizer.py │ ├── prompt_engineering.py │ └── simple_tokenizer.py ├── prompts.py └── simple_tokenizer.py ├── evaluation ├── __init__.py ├── clip_zeroshot_evaluator.py ├── dataset.py ├── feature.py ├── full_model_finetune.py └── metric.py ├── models ├── __init__.py ├── clip_example.py ├── clip_react.py ├── clip_swin.py ├── cls_example.py ├── cls_swin.py ├── declip.py ├── declip_model │ ├── __init__.py │ ├── clip.py │ ├── declip.py │ ├── defilip.py │ ├── filip.py │ ├── image_encoder │ │ ├── base_transformer.py │ │ └── visual_transformer.py │ ├── slip.py │ ├── text_encoder │ │ ├── base_transformer.py │ │ └── text_transformer.py │ └── utils │ │ ├── nnclr_modules │ │ ├── __init__.py │ │ ├── memory_bank.py │ │ ├── memory_bank_cuda.py │ │ └── nn_memory_bank.py │ │ └── text_utils │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── mask_tokens.py │ │ └── simple_tokenizer.py ├── mae.py └── mocov3.py ├── optim ├── __init__.py └── build.py ├── resources ├── datasets │ ├── caltech101.yaml │ ├── cifar10.yaml │ ├── cifar100.yaml │ ├── country211.yaml │ ├── dtd.yaml │ ├── eurosat-clip.yaml │ ├── fer2013.yaml │ ├── fgvc-aircraft-2013b.yaml │ ├── flower102.yaml │ ├── food101.yaml │ ├── gtsrb.yaml │ ├── hateful-memes.yaml │ ├── imagenet-1k.yaml │ ├── kitti-distance.yaml │ ├── mnist.yaml │ ├── oxford-iiit-pets.yaml │ ├── patchcamelyon.yaml │ ├── rendered-sst2.yaml │ ├── resisc45-clip.yaml │ ├── stanfordcar.yaml │ ├── vision_datasets.json │ └── voc2007classification.yaml ├── knowledge │ ├── external │ │ ├── caltech-101_knowledge.tsv │ │ ├── cifar-100_knowledge.tsv │ │ ├── cifar-10_knowledge.tsv │ │ ├── country211_knowledge.tsv │ │ ├── dtd_knowledge.tsv │ │ ├── eurosat_clip_knowledge.tsv │ │ ├── fer-2013_knowledge.tsv │ │ ├── fgvc-aircraft-2013b-variants102_knowledge.tsv │ │ ├── food-101_knowledge.tsv │ │ ├── gtsrb_knowledge.tsv │ │ ├── hateful-memes_knowledge.tsv │ │ ├── imagenet-1k_knowledge.tsv │ │ ├── kitti-distance_knowledge.tsv │ │ ├── mnist_knowledge.tsv │ │ ├── oxford-flower-102_knowledge.tsv │ │ ├── oxford-iiit-pets_knowledge.tsv │ │ ├── patch-camelyon_knowledge.tsv │ │ ├── rendered-sst2_knowledge.tsv │ │ ├── resisc45_clip_knowledge.tsv │ │ ├── stanford-cars_knowledge.tsv │ │ └── voc-2007-classification_knowledge.tsv │ └── gpt3 │ │ ├── GPT3_caltech-101.tsv │ │ ├── GPT3_cifar-10.tsv │ │ ├── GPT3_cifar-100.tsv │ │ ├── GPT3_country211.tsv │ │ ├── GPT3_dtd.tsv │ │ ├── GPT3_eurosat_clip.tsv │ │ ├── GPT3_fer-2013.tsv │ │ ├── GPT3_fgvc-aircraft-2013b-variants102.tsv │ │ ├── GPT3_food-101.tsv │ │ ├── GPT3_gtsrb.tsv │ │ ├── GPT3_hateful-memes.tsv │ │ ├── GPT3_imagenet-1k.tsv │ │ ├── GPT3_kitti-distance.tsv │ │ ├── GPT3_mnist.tsv │ │ ├── GPT3_oxford-flower-102.tsv │ │ ├── GPT3_oxford-iiit-pets.tsv │ │ ├── GPT3_patch-camelyon.tsv │ │ ├── GPT3_rendered-sst2.tsv │ │ ├── GPT3_resisc45_clip.tsv │ │ ├── GPT3_stanford-cars.tsv │ │ └── 
GPT3_voc-2007-classification.tsv └── model │ ├── clip_example.yaml │ ├── clip_swin_tiny.yaml │ ├── deit_base_patch16_224.yaml │ ├── example.yaml │ ├── mae_vitb16.yaml │ ├── mocov3_vitb16.yaml │ ├── react_vitG14_OpenCLIP.yaml │ ├── react_vitb16_CLIP.yaml │ ├── react_vitb32_CLIP.yaml │ ├── react_vitl14_CLIP.yaml │ ├── vit_base_patch16_224.yaml │ ├── vit_base_patch32_224.yaml │ ├── vitb16_CLIP.yaml │ ├── vitb32_CLIP.yaml │ ├── vitb32_DeCLIP.yaml │ ├── vitb32_DeCLIP_YFCC15M.yaml │ ├── vitb32_FILIP.yaml │ └── vitb32_SLIP.yaml └── utils ├── __init__.py ├── comm.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | .idea 3 | *.iml 4 | out 5 | gen 6 | 7 | ### Vim template 8 | [._]*.s[a-w][a-z] 9 | [._]s[a-w][a-z] 10 | *.un~ 11 | Session.vim 12 | .netrwhist 13 | *~ 14 | 15 | ### IPythonNotebook template 16 | # Temporary data 17 | .ipynb_checkpoints/ 18 | 19 | ### Python template 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | env/ 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | #lib/ 38 | #lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *,cover 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # *.ipynb 80 | *.params 81 | .vscode/ 82 | *.code-workspace/ 83 | 84 | lib/pycocotools/_mask.c 85 | lib/nms/cpu_nms.c 86 | 87 | OUTPUT 88 | OUTPUT/* 89 | models/* 90 | DATASET 91 | DATASET/* 92 | # external/ 93 | 94 | outputs 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Computer-Vision-in-the-Wild 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | yacs~=0.1.8 2 | scikit-learn 3 | timm~=0.4.12 4 | numpy~=1.21.0 5 | sharedmem 6 | git+https://github.com/openai/CLIP.git 7 | git+https://github.com/haotian-liu/CLIP_vlp.git 8 | torch~=1.7.0 9 | PyYAML~=5.4.1 10 | Pillow~=9.0.1 11 | torchvision~=0.8.0 12 | vision-evaluation==0.2.9 13 | vision-datasets==0.2.17 14 | tqdm~=4.62.3 15 | transformers~=4.11.3 16 | protobuf~=3.20.1 17 | ftfy~=6.1.1 18 | nltk~=3.7 19 | openai # to call gpt3 for knowledge extraction 20 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | ############## Configuration section begins ################## 2 | 3 | # Model Config: [vitb32_CLIP, vitb16_CLIP, mae_vitb16, mocov3_vitb16, vit_base_patch16_224, vit_base_patch32_224, deit_base_patch16_224] 4 | model_cfg=vitb32_CLIP 5 | 6 | # Mode: [linear_probe, finetune, zeroshot] 7 | mode=zeroshot 8 | 9 | # Use FP32 [default: True] 10 | use_fp32=True 11 | 12 | # Dataset: [caltech101] 13 | dataset=caltech101 14 | 15 | # Model checkpoint 16 | model_ckpt=. 17 | 18 | # output directory 19 | output_dir=./outputs 20 | 21 | ############ Configurations for hyperparameter tuning begin ############ 22 | # set to True to disable the automatic hyperparameter tuning 23 | # and set the learning rate and weight accordingly below 24 | # This option is only effective for linear probe and finetuning. 25 | 26 | disable_hyperparameter_tuning=False 27 | learning_rate=0.1 28 | l2_weight_decay=1e-6 29 | 30 | ############ Configurations for hyperparameter tuning end ############ 31 | 32 | ############ Configurations for linear_probe/finetune begin ############ 33 | 34 | # Random seed: [0,1,2] 35 | random_seed=0 36 | 37 | # Shots: {5, 20, 50} for few shot, and -1 for full-shot 38 | num_shots=5 39 | 40 | # Whether to init the linear head with the text encoder 41 | init_head_with_text_encoder=True 42 | 43 | # whether to merge the encoder and the linear head 44 | merge_encoder_and_proj=False 45 | 46 | ############ Configurations for linear_probe/finetune end ############ 47 | 48 | ############ Configurations for adding knowledge begin ############ 49 | # Please change the knowledge source accordingly. 50 | 51 | use_wordnet_hierachy=False 52 | use_wordnet_definition=False 53 | use_wiktionary_definition=False 54 | use_gpt3=False 55 | use_gpt3_count=0 56 | 57 | ############ Configurations for adding knowledge end ############ 58 | 59 | ############## Configuration section ends ################## 60 | 61 | 62 | # Launching the job...... 
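# Illustration (not executed): with the default settings above, the zeroshot branch
# below expands to roughly:
#   python commands/zeroshot.py --ds resources/datasets/caltech101.yaml \
#     --model resources/model/vitb32_CLIP.yaml MODEL.CLIP_FP32 True \
#     DATASET.ROOT ./outputs/datasets OUTPUT_DIR ./outputs/vitb32_CLIP/log ... TEST.MODEL_FILE .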
63 | 64 | cd vision_benchmark 65 | 66 | if [[ "$mode" = "linear_probe" ]]; then 67 | python commands/linear_probe.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.FREEZE_IMAGE_BACKBONE True TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 68 | elif [[ "$mode" = "finetune" ]]; then 69 | python commands/finetune.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 70 | 71 | elif [[ "$mode" = "zeroshot" ]]; then 72 | python commands/zeroshot.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml MODEL.CLIP_FP32 $use_fp32 DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 73 | else 74 | echo Unknown mode! Please check and set mode to one of {linear_probe, finetune, zeroshot}. 
75 | exit -1 76 | fi; -------------------------------------------------------------------------------- /run_gpt3.sh: -------------------------------------------------------------------------------- 1 | 2 | output_dir=/home/chunyl/project/OUTPUT_DIR/GPT3 # the path that the generated gpt3 knowledge is saved 3 | apikey=XXXX # Please use your GPT3 API key 4 | 5 | ds='cifar10' 6 | # ['eurosat-clip','country211','kitti-distance','oxford-iiit-pets','ping-attack-on-titan-plus','ping-whiskey-plus','rendered-sst2','resisc45-clip','voc2007classification','caltech101','cifar10','cifar100','dtd','fer2013','fgvc-aircraft-2013b','flower102','food101','gtsrb','hateful-memes','mnist','patchcamelyon','stanfordcar'] 7 | 8 | 9 | cd vision_benchmark 10 | 11 | 12 | python commands/extract_gpt3_knowledge.py --ds resources/datasets/$ds.yaml --apikey $apikey --n_shot 3 --n_ensemble 5 \ 13 | --target local DATASET.ROOT $output_dir/datasets/ds OUTPUT_DIR $output_dir/log 14 | 15 | 16 | 17 | # pip install openai 18 | # pip install nltk, spacy 19 | # python -m spacy download en -------------------------------------------------------------------------------- /run_multi.sh: -------------------------------------------------------------------------------- 1 | ############## Configuration section begins ################## 2 | 3 | # Model Config: [vitb32_CLIP, vitb16_CLIP, mae_vitb16, mocov3_vitb16, vit_base_patch16_224, vit_base_patch32_224, deit_base_patch16_224] 4 | model_cfg=vitb32_CLIP 5 | 6 | # Mode: [linear_probe, finetune, zeroshot] 7 | mode=zeroshot 8 | 9 | # Use FP32 [default: True] 10 | use_fp32=True 11 | 12 | # Dataset: [caltech101] 13 | dataset=$DATASET 14 | 15 | # Model checkpoint 16 | model_ckpt=. 17 | 18 | # output directory 19 | output_dir=$OUTPUT_DIR 20 | 21 | ############ Configurations for hyperparameter tuning begin ############ 22 | # set to True to disable the automatic hyperparameter tuning 23 | # and set the learning rate and weight accordingly below 24 | # This option is only effective for linear probe and finetuning. 25 | 26 | disable_hyperparameter_tuning=False 27 | learning_rate=0.1 28 | l2_weight_decay=1e-6 29 | 30 | ############ Configurations for hyperparameter tuning end ############ 31 | 32 | ############ Configurations for linear_probe/finetune begin ############ 33 | 34 | # Random seed: [0,1,2] 35 | random_seed=0 36 | 37 | # Shots: {5, 20, 50} for few shot, and -1 for full-shot 38 | num_shots=5 39 | 40 | # Whether to init the linear head with the text encoder 41 | init_head_with_text_encoder=True 42 | 43 | # whether to merge the encoder and the linear head 44 | merge_encoder_and_proj=False 45 | 46 | ############ Configurations for linear_probe/finetune end ############ 47 | 48 | ############ Configurations for adding knowledge begin ############ 49 | # Please change the knowledge source accordingly. 50 | 51 | use_wordnet_hierachy=False 52 | use_wordnet_definition=False 53 | use_wiktionary_definition=False 54 | use_gpt3=False 55 | use_gpt3_count=0 56 | 57 | ############ Configurations for adding knowledge end ############ 58 | 59 | ############## Configuration section ends ################## 60 | 61 | 62 | # Launching the job...... 
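# Usage sketch (assumed): unlike run.sh, this script reads the dataset and output
# directory from the environment ($DATASET, $OUTPUT_DIR), so an outer sweep could be:
#   for ds in caltech101 cifar10 cifar100; do
#     DATASET=$ds OUTPUT_DIR=./outputs bash run_multi.sh
#   done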
63 | 64 | cd vision_benchmark 65 | 66 | if [[ "$mode" = "linear_probe" ]]; then 67 | python commands/linear_probe.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.FREEZE_IMAGE_BACKBONE True TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 68 | elif [[ "$mode" = "finetune" ]]; then 69 | python commands/finetune.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml --no-tuning $disable_hyperparameter_tuning --lr $learning_rate --l2 $l2_weight_decay MODEL.CLIP_FP32 $use_fp32 DATASET.NUM_SAMPLES_PER_CLASS $num_shots DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log DATASET.RANDOM_SEED_SAMPLING $random_seed TRAIN.INIT_HEAD_WITH_TEXT_ENCODER $init_head_with_text_encoder TRAIN.MERGE_ENCODER_AND_HEAD_PROJ $merge_encoder_and_proj KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 70 | 71 | elif [[ "$mode" = "zeroshot" ]]; then 72 | python commands/zeroshot.py --ds resources/datasets/$dataset.yaml --model resources/model/$model_cfg.yaml MODEL.CLIP_FP32 $use_fp32 DATASET.ROOT $output_dir/datasets OUTPUT_DIR $output_dir/$model_cfg/log KNOWLEDGE.WORDNET.USE_HIERARCHY $use_wordnet_hierachy KNOWLEDGE.WORDNET.USE_DEFINITION $use_wordnet_definition KNOWLEDGE.WIKITIONARY.USE_DEFINITION $use_wiktionary_definition KNOWLEDGE.GPT3.USE_GPT3 $use_gpt3 KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS $use_gpt3_count TEST.MODEL_FILE $model_ckpt 73 | else 74 | echo Unknown mode! Please check and set mode to one of {linear_probe, finetune, zeroshot}. 
75 | exit -1 76 | fi; -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | VERSION = '0.0.1' 4 | 5 | setuptools.setup(name='vision_benchmark', 6 | author='chunyl', 7 | author_email='chunyl@microsoft.com', 8 | version=VERSION, 9 | python_requires='>=3.6', 10 | packages=setuptools.find_packages(exclude=['test', 'test.*']), 11 | package_data={'': ['resources/*']}, 12 | install_requires=[ 13 | 'yacs~=0.1.8', 14 | 'scikit-learn', 15 | 'timm>=0.3.4', 16 | 'numpy>=1.18.0', 17 | 'sharedmem', 18 | 'torch>=1.7.0', 19 | 'PyYAML~=5.4.1', 20 | 'Pillow', 21 | 'torchvision>=0.8.0', 22 | 'vision-datasets>=0.2.0', 23 | 'vision-evaluation>=0.2.2', 24 | 'tqdm~=4.62.3', 25 | 'transformers~=4.11.3' 26 | ], 27 | entry_points={ 28 | 'console_scripts': [ 29 | 'vb_linear_probe=vision_benchmark.commands.linear_probe:main', 30 | 'vb_zero_shot_eval=vision_benchmark.commands.zeroshot_eval:main', 31 | 'vb_eval=vision_benchmark.commands.eval:main', 32 | 'vb_submit_to_leaderboard=vision_benchmark.commands.submit_predictions:main', 33 | 'vb_image_caption_eval=vision_benchmark.commands.image_caption_eval:main', 34 | ] 35 | }) 36 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git,build,dist,venv,.idea 3 | max-line-length = 200 4 | 5 | [pytest] 6 | junit_family = xunit2 -------------------------------------------------------------------------------- /vision_benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/commands/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/commands/finetune.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Probe with sklearn Logistic Regression or linear model. 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import logging 10 | 11 | import numpy as np 12 | import os 13 | import random 14 | 15 | from vision_datasets import DatasetTypes 16 | from vision_benchmark.common.constants import get_dataset_hub 17 | from vision_benchmark.utils import comm, create_logger 18 | from vision_benchmark.evaluation import construct_dataloader, full_model_finetune 19 | from vision_benchmark.config import config, update_config 20 | # These 2 lines are a walk-around for "Too many open files error". 
Refer: https://github.com/pytorch/pytorch/issues/11201 21 | import torch.multiprocessing 22 | from vision_benchmark.common.utils import log_arg_env_config, submit_predictions 23 | 24 | torch.multiprocessing.set_sharing_strategy('file_system') 25 | 26 | MULTILABEL_DATASETS = {"chestx-ray8"} 27 | 28 | 29 | def add_finetuning_args(parser): 30 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 31 | parser.add_argument('--model', required=True, help='Evaluation model configure file name', type=str) 32 | parser.add_argument('--submit-predictions', help='submit predictions and model info to leaderboard.', default=False, action='store_true') 33 | parser.add_argument('--submit-by', help='Person who submits the results.', type=str) 34 | parser.add_argument('--no-tuning', help='No hyperparameter-tuning.', default=False, type=lambda x:x.lower()=="true") 35 | parser.add_argument('--l2', help='(Inverse) L2 regularization strength. This option is only useful when option --no-tuning is True.', default=0.316, type=float) 36 | parser.add_argument('--lr', help='Test with a specific learning rate. This option is only useful when option --no-tuning is True.', default=0.001, type=float) 37 | parser.add_argument('--run', help='Run id', default=1, type=int) 38 | parser.add_argument('--fix_seed', help='Fix the random seed. [-1] not fixing the seeds', default=0, type=int) 39 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, action='store_true') 40 | 41 | parser.add_argument('opts', 42 | help="Modify config options using the command-line", 43 | default=None, 44 | nargs=argparse.REMAINDER) 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser(description='Test a classification model, with finetuning.') 49 | add_finetuning_args(parser) 50 | args = parser.parse_args() 51 | 52 | args.cfg = args.ds 53 | update_config(config, args) 54 | args.cfg = args.model 55 | update_config(config, args) 56 | config.defrost() 57 | config.NAME = '' 58 | config.freeze() 59 | 60 | if args.submit_predictions: 61 | assert args.submit_by 62 | 63 | if args.fix_seed != -1: 64 | random.seed(args.fix_seed) 65 | np.random.seed(args.fix_seed) 66 | torch.manual_seed(args.fix_seed) 67 | torch.cuda.manual_seed_all(args.fix_seed) 68 | 69 | n_samples = str(config.DATASET.NUM_SAMPLES_PER_CLASS) if config.DATASET.NUM_SAMPLES_PER_CLASS > 0 else 'full' 70 | exp_name = 'finetuning_' + n_samples 71 | if config.TRAIN.TWO_LR: exp_name += '_two_lr' 72 | final_output_dir = create_logger(config, exp_name) 73 | 74 | if config.DATASET.NUM_SAMPLES_PER_CLASS == 1: 75 | config.defrost() 76 | config.DATASET.NUM_SAMPLES_PER_CLASS = 2 77 | config.DATASET.MERGE_TRAIN_VAL_FINAL_RUN = False 78 | config.freeze() 79 | 80 | if comm.is_main_process(): 81 | log_arg_env_config(args, config, final_output_dir) 82 | 83 | if config.DATASET.DATASET == 'patch-camelyon' and config.DATASET.NUM_SAMPLES_PER_CLASS == -1: 84 | # deal with patch camelyon large dataset (search using 10000-shot subset, final run with the full dataset) 85 | logging.info(f'Detecting large dataset with {config.DATASET.NUM_SAMPLES_PER_CLASS}-shot.') 86 | config.defrost() 87 | config.DATASET.NUM_SAMPLES_PER_CLASS = 10000 88 | config.freeze() 89 | logging.info(f'Used the subset ({config.DATASET.NUM_SAMPLES_PER_CLASS}-shot) to train the model.') 90 | 91 | logging.info(f'{config.DATASET.DATASET} is a dataset.') 92 | train_dataloader, val_dataloader, test_dataloader = construct_dataloader(config) 93 | 94 
| # Run full model finetuning 95 | logging.info('Finetuning with full model. This may take several minutes to hours depending on the size of your data.') 96 | best_acc, model_info = full_model_finetune(train_dataloader, val_dataloader, test_dataloader, args.no_tuning, args.lr, args.l2, config) 97 | 98 | test_predictions = model_info['best_logits'] 99 | 100 | if args.save_predictions: 101 | import json 102 | 103 | # a hack to control the json dump float accuracy 104 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 105 | def json_prec_dump(data, prec=6): 106 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 107 | 108 | results_dict = { 109 | 'model_name': config.MODEL.NAME, 110 | 'dataset_name': config.DATASET.DATASET, 111 | 'num_trainable_params': model_info.get('n_trainable_params', None), 112 | 'num_params': model_info.get('n_params', None), 113 | 'num_visual_params': model_info.get('n_visual_params', None), 114 | 'num_backbone_params': model_info.get('n_backbone_params', None), 115 | 'n_shot': config.DATASET.NUM_SAMPLES_PER_CLASS, 116 | 'rnd_seeds': [config.DATASET.RANDOM_SEED_SAMPLING], 117 | 'predictions': [test_predictions.tolist()], 118 | } 119 | json_string = json_prec_dump(results_dict) 120 | 121 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 122 | os.makedirs(prediction_folder, exist_ok=True) 123 | with open(os.path.join(prediction_folder, f'seed{config.DATASET.RANDOM_SEED_SAMPLING}_{config.DATASET.DATASET}.json' ) , 'w') as outfile: 124 | outfile.write(json_string) 125 | 126 | if __name__ == '__main__': 127 | main() 128 | -------------------------------------------------------------------------------- /vision_benchmark/commands/linear_probe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Probe with sklearn Logistic Regression or linear model. 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import logging 10 | 11 | import numpy as np 12 | import random 13 | 14 | import os 15 | 16 | from vision_benchmark.utils import comm, create_logger 17 | from vision_benchmark.evaluation import construct_dataloader, full_model_finetune 18 | from vision_benchmark.config import config, update_config 19 | # These 2 lines are a walk-around for "Too many open files error". Refer: https://github.com/pytorch/pytorch/issues/11201 20 | import torch.multiprocessing 21 | from vision_benchmark.common.utils import log_arg_env_config, submit_predictions 22 | 23 | torch.multiprocessing.set_sharing_strategy('file_system') 24 | 25 | MULTILABEL_DATASETS = {"chestx-ray8"} 26 | 27 | 28 | def add_linear_probing_args(parser): 29 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 30 | parser.add_argument('--model', required=True, help='Evaluation model configure file name', type=str) 31 | parser.add_argument('--submit-predictions', help='submit predictions and model info to leaderboard.', default=False, action='store_true') 32 | parser.add_argument('--submit-by', help='Person who submits the results.', type=str) 33 | 34 | parser.add_argument('--no-tuning', help='No hyperparameter-tuning.', default=False, type=lambda x:x.lower()=="true") 35 | parser.add_argument('--l2', help='(Inverse) L2 regularization strength. 
This option is only useful when option --no-tuning is True.', default=0.316, type=float) 36 | parser.add_argument('--lr', help='Test with a specific learning rate. This option is only useful when option --no-tuning is True.', default=0.001, type=float) 37 | parser.add_argument('--run', help='Run id', default=1, type=int) 38 | parser.add_argument('--fix_seed', help='Fix the random seed. [-1] not fixing the seeds', default=0, type=int) 39 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, action='store_true') 40 | 41 | parser.add_argument('opts', 42 | help="Modify config options using the command-line", 43 | default=None, 44 | nargs=argparse.REMAINDER) 45 | 46 | def main(): 47 | parser = argparse.ArgumentParser(description='Test a classification model, with linear probing.') 48 | add_linear_probing_args(parser) 49 | args = parser.parse_args() 50 | 51 | args.cfg = args.ds 52 | update_config(config, args) 53 | args.cfg = args.model 54 | update_config(config, args) 55 | config.defrost() 56 | config.NAME = '' 57 | config.freeze() 58 | 59 | if args.submit_predictions: 60 | assert args.submit_by 61 | 62 | if args.fix_seed != -1: 63 | random.seed(args.fix_seed) 64 | np.random.seed(args.fix_seed) 65 | torch.manual_seed(args.fix_seed) 66 | torch.cuda.manual_seed_all(args.fix_seed) 67 | 68 | n_samples = str(config.DATASET.NUM_SAMPLES_PER_CLASS) if config.DATASET.NUM_SAMPLES_PER_CLASS >= 0 else 'full' 69 | exp_name = 'linear_probe_' + n_samples 70 | 71 | if config.DATASET.NUM_SAMPLES_PER_CLASS == 1: 72 | config.defrost() 73 | config.DATASET.NUM_SAMPLES_PER_CLASS = 2 74 | config.DATASET.MERGE_TRAIN_VAL_FINAL_RUN = False 75 | config.freeze() 76 | 77 | # Follow MAE's design choice: not using global pool in linear probe 78 | if config.MODEL.NAME.startswith('mae_'): 79 | config.defrost() 80 | config.MODEL.SPEC.GLOBAL_POOL = False 81 | config.freeze() 82 | 83 | final_output_dir = create_logger(config, exp_name) 84 | if comm.is_main_process(): 85 | log_arg_env_config(args, config, final_output_dir) 86 | 87 | if config.DATASET.DATASET == 'patch-camelyon' and config.DATASET.NUM_SAMPLES_PER_CLASS == -1: 88 | # deal with patch camelyon large dataset (search using 10000-shot subset, final run with the full dataset) 89 | logging.info(f'Detecting large dataset with {config.DATASET.NUM_SAMPLES_PER_CLASS}-shot.') 90 | config.defrost() 91 | config.DATASET.NUM_SAMPLES_PER_CLASS = 10000 92 | config.freeze() 93 | logging.info(f'Used the subset ({config.DATASET.NUM_SAMPLES_PER_CLASS}-shot) to train the model.') 94 | 95 | # Run linear probe 96 | train_dataloader, val_dataloader, test_dataloader = construct_dataloader(config) 97 | 98 | best_acc, model_info = full_model_finetune(train_dataloader, val_dataloader, test_dataloader, args.no_tuning, args.lr, args.l2, config) 99 | test_predictions = model_info['best_logits'] 100 | 101 | if args.save_predictions: 102 | import json 103 | 104 | # a hack to control the json dump float accuracy 105 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
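        # Illustration: json_prec_dump({'p': 0.123456789}) == '{"p": 0.123457}',
        # i.e. floats are re-serialized with `prec` decimal places to keep the prediction files small.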
106 | def json_prec_dump(data, prec=6): 107 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 108 | 109 | results_dict = { 110 | 'model_name': config.MODEL.NAME, 111 | 'dataset_name': config.DATASET.DATASET, 112 | 'num_trainable_params': model_info.get('n_trainable_params', None), 113 | 'num_params': model_info.get('n_params', None), 114 | 'num_visual_params': model_info.get('n_visual_params', None), 115 | 'num_backbone_params': model_info.get('n_backbone_params', None), 116 | 'n_shot': config.DATASET.NUM_SAMPLES_PER_CLASS, 117 | 'rnd_seeds': [config.DATASET.RANDOM_SEED_SAMPLING], 118 | 'predictions': [test_predictions.tolist()], 119 | } 120 | json_string = json_prec_dump(results_dict) 121 | 122 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 123 | os.makedirs(prediction_folder, exist_ok=True) 124 | with open(os.path.join(prediction_folder, f'seed{config.DATASET.RANDOM_SEED_SAMPLING}_{config.DATASET.DATASET}.json' ) , 'w') as outfile: 125 | outfile.write(json_string) 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /vision_benchmark/commands/prepare_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | submit predictions to leaderboard service 3 | """ 4 | import argparse 5 | from collections import defaultdict 6 | import json 7 | import logging 8 | import pathlib 9 | import zipfile 10 | import itertools 11 | import numpy as np 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Submit predictions to leaderboard service.') 15 | parser.add_argument('--combine_path', required=True, help='Prediction json file path.', type=pathlib.Path) 16 | parser.add_argument('--combine_name', default='all_predictions', required=False, help='Output file name.', type=str) 17 | args = parser.parse_args() 18 | 19 | return args 20 | 21 | 22 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
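# Usage sketch (paths hypothetical): combine the per-seed prediction JSONs written by
# linear_probe.py / finetune.py / zeroshot.py into a single submission archive:
#   python commands/prepare_submit.py --combine_path ./outputs/vitb32_CLIP/log/predictions/linear_probe_5 --combine_name all_predictions
# This writes <combine_path>/all_predictions.zip containing all_predictions.json.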
23 | def json_prec_dump(data, prec=6): 24 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 25 | 26 | 27 | def main(): 28 | logging.basicConfig(level=logging.INFO) 29 | args = parse_args() 30 | 31 | all_predictions = defaultdict(list) 32 | for prediction_file in args.combine_path.iterdir(): 33 | if prediction_file.suffix != '.json': 34 | print(f'Ignoring file {prediction_file.name} by suffix.') 35 | continue 36 | prediction_data = json.loads(prediction_file.read_text()) 37 | all_predictions[prediction_data['dataset_name']].append(prediction_data) 38 | 39 | all_combine_predictions = [] 40 | 41 | KNOWN_AVERAGE_KEYS = ['num_trainable_params'] 42 | KNOWN_MERGE_KEYS = ['rnd_seeds', 'predictions'] 43 | KNOWN_DIFF_KEYS = KNOWN_AVERAGE_KEYS + KNOWN_MERGE_KEYS 44 | 45 | for ds, prediction_data in all_predictions.items(): 46 | prediction_keys = list(prediction_data[0]) 47 | combined_dict = dict() 48 | for key in prediction_keys: 49 | values = [x[key] for x in prediction_data] 50 | if key not in KNOWN_DIFF_KEYS: 51 | assert all(x == values[0] for x in values) 52 | values = values[0] 53 | else: 54 | if key in KNOWN_MERGE_KEYS: 55 | values = list(itertools.chain.from_iterable(values)) 56 | elif key in KNOWN_AVERAGE_KEYS: 57 | values = np.asarray(values).mean() 58 | else: 59 | assert False 60 | combined_dict[key] = values 61 | all_combine_predictions.append(combined_dict) 62 | 63 | all_predictions = {"data": all_combine_predictions} 64 | all_predictions = json_prec_dump(all_predictions) 65 | save_path = args.combine_path / f'{args.combine_name}.zip' 66 | zf = zipfile.ZipFile(save_path, "w", zipfile.ZIP_DEFLATED) 67 | zf.writestr('all_predictions.json', all_predictions) 68 | zf.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /vision_benchmark/commands/zeroshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zero shot evaluation. 
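Image features are compared against text embeddings of the class prompts,
optionally enriched with WordNet / Wiktionary / GPT-3 knowledge.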
3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import os 9 | import argparse 10 | import logging 11 | 12 | import numpy as np 13 | 14 | from vision_benchmark.common.utils import log_arg_env_config 15 | from vision_benchmark.utils import comm, create_logger 16 | from vision_benchmark.datasets import SimpleTokenizer, HFPTTokenizer 17 | from vision_benchmark.evaluation import extract_features, extract_text_features, clip_zeroshot_evaluator 18 | from vision_benchmark.config import config, update_config 19 | 20 | 21 | def add_zero_shot_args(parser): 22 | parser.add_argument('--ds', required=False, help='Evaluation dataset configure file name.', type=str) 23 | parser.add_argument('--model', required=True, help='Clip model configure file name', type=str) 24 | parser.add_argument('--text_feature_only', help='consider text feature or not.', default=False, action='store_true') 25 | parser.add_argument('--save-predictions', help='save predictions logits for analysis.', default=True, type=lambda x: (str(x).lower() == 'true')) 26 | parser.add_argument('opts', 27 | help="Modify config options using the command-line", 28 | default=None, 29 | nargs=argparse.REMAINDER) 30 | 31 | def load_or_extract_features(args, cfg): 32 | if cfg.MODEL.SPEC.TEXT.TOKENIZER == 'clip': 33 | tokenizer = SimpleTokenizer() 34 | elif 'hf_' in cfg.MODEL.SPEC.TEXT.TOKENIZER: 35 | tokenizer = HFPTTokenizer(pt_name=cfg.MODEL.SPEC.TEXT.TOKENIZER[3:]) 36 | else: 37 | tokenizer = None 38 | 39 | # Load or extract image features. 40 | feature_file = os.path.join(cfg.DATASET.ROOT, 'zeroshot_features_' + cfg.MODEL.NAME.replace('/', '') + f'_wiki_{cfg.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}' + f'_gpt3_{cfg.KNOWLEDGE.GPT3.USE_GPT3}' + '.npy') 41 | logging.info(f'feature_file: {feature_file}') 42 | if os.path.exists(feature_file): 43 | logging.info('Loading features from existing files.') 44 | with open(feature_file, 'rb') as fread: 45 | image_features = np.load(fread) 46 | text_features = np.load(fread) 47 | image_labels = np.load(fread) 48 | else: 49 | image_features, image_labels = extract_features(cfg, test_split_only=True) 50 | text_features = extract_text_features(cfg, tokenizer, args) 51 | logging.info(f'Test size is {image_features.shape[0]}.') 52 | 53 | return image_features, text_features, image_labels 54 | 55 | def load_or_extract_text_features(args, cfg): 56 | if cfg.MODEL.SPEC.TEXT.TOKENIZER == 'clip': 57 | tokenizer = SimpleTokenizer() 58 | elif 'hf_' in cfg.MODEL.SPEC.TEXT.TOKENIZER: 59 | tokenizer = HFPTTokenizer(pt_name=cfg.MODEL.SPEC.TEXT.TOKENIZER[3:]) 60 | else: 61 | tokenizer = None 62 | 63 | # Load or extract image features. 
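    # Note: this branch caches text features only; the file name below encodes the model
    # name and the wiki/GPT-3 knowledge flags so features from different settings do not collide.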
64 | feature_file = os.path.join(cfg.DATASET.ROOT, 'zeroshot_text_features_' + cfg.MODEL.NAME.replace('/', '') + f'_wiki_{cfg.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}' + f'_gpt3_{cfg.KNOWLEDGE.GPT3.USE_GPT3}' + '.npy') 65 | logging.info(f'feature_file: {feature_file}') 66 | if os.path.exists(feature_file): 67 | logging.info('Loading features from existing files.') 68 | with open(feature_file, 'rb') as fread: 69 | text_features = np.load(fread) 70 | else: 71 | wiki_dict, gpt3_dict = extract_text_features(cfg, tokenizer, args) 72 | logging.info(f'Test size is {len(wiki_dict)}.') 73 | 74 | return wiki_dict, gpt3_dict 75 | 76 | def main(): 77 | parser = argparse.ArgumentParser(description='Zero-shot evaluation script.') 78 | add_zero_shot_args(parser) 79 | args = parser.parse_args() 80 | 81 | args.cfg = args.ds 82 | update_config(config, args) 83 | args.cfg = args.model 84 | update_config(config, args) 85 | config.defrost() 86 | config.NAME = "" 87 | config.freeze() 88 | 89 | exp_name = 'zeroshot_eval_' + f'wiki_{config.KNOWLEDGE.WIKITIONARY.USE_DEFINITION}_wnh_{config.KNOWLEDGE.WORDNET.USE_HIERARCHY}_wnd_{config.KNOWLEDGE.WORDNET.USE_DEFINITION}_gpt3_{config.KNOWLEDGE.GPT3.USE_GPT3}' 90 | exp_name += f'agg_{config.KNOWLEDGE.AGGREGATION.MEHTOD}_gpt3count_{config.KNOWLEDGE.AGGREGATION.NUM_GPT3_ITEMS}' 91 | final_output_dir = create_logger(config, exp_name) 92 | 93 | if comm.is_main_process(): 94 | log_arg_env_config(args, config, final_output_dir) 95 | 96 | if args.text_feature_only: 97 | wiki_dict, gpt3_dict = load_or_extract_text_features(args, config) 98 | 99 | else: 100 | image_features, text_features, image_labels = load_or_extract_features(args, config) 101 | result, test_predictions, metric = clip_zeroshot_evaluator(image_features, text_features, image_labels, config) 102 | msg = f'=> TEST: {metric} {100 * result:.3f}% ' 103 | logging.info(msg) 104 | 105 | if args.save_predictions: 106 | import json 107 | 108 | # a hack to control the json dump float accuracy 109 | # if you find the accuracy is not enough, pleae consider increasing `prec`. 
110 | def json_prec_dump(data, prec=6): 111 | return json.dumps(json.loads(json.dumps(data), parse_float=lambda x: round(float(x), prec))) 112 | 113 | results_dict = { 114 | 'model_name': f'CLIP-{config.MODEL.NAME}', 115 | 'dataset_name': config.DATASET.DATASET, 116 | 'num_trainable_params': 0, 117 | 'num_params': config.MODEL.STATS.get('n_params', None), 118 | 'num_visual_params': config.MODEL.STATS.get('n_visual_params', None), 119 | 'num_backbone_params': config.MODEL.STATS.get('n_backbone_params', None), 120 | 'n_shot': 0, 121 | 'rnd_seeds': [0], 122 | 'predictions': [test_predictions.cpu().data.numpy().tolist()], 123 | } 124 | json_string = json_prec_dump(results_dict) 125 | 126 | prediction_folder = os.path.join(config.OUTPUT_DIR, 'predictions', exp_name) 127 | os.makedirs(prediction_folder, exist_ok=True) 128 | with open(os.path.join(prediction_folder, f'{config.DATASET.DATASET}.json' ) , 'w') as outfile: 129 | outfile.write(json_string) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /vision_benchmark/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/common/__init__.py -------------------------------------------------------------------------------- /vision_benchmark/common/constants.py: -------------------------------------------------------------------------------- 1 | from vision_datasets import DatasetHub 2 | import pathlib 3 | 4 | VISION_DATASET_STORAGE = 'https://cvinthewildeus.blob.core.windows.net/datasets?sp=r&st=2023-08-28T01:41:20Z&se=3023-08-28T09:41:20Z&sv=2022-11-02&sr=c&sig=Msoq5dIl%2Fve6F01edGr8jgcZUt7rtsuJ896xvstSNfM%3D' 5 | 6 | 7 | def get_dataset_hub(): 8 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 'resources' / 'datasets' / 'vision_datasets.json').read_text() 9 | hub = DatasetHub(vision_dataset_json) 10 | 11 | return hub 12 | -------------------------------------------------------------------------------- /vision_benchmark/common/data_class_base.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import dataclasses 3 | 4 | 5 | class DataClassBase: 6 | def __post_init__(self): 7 | self.validate() 8 | 9 | @classmethod 10 | def from_dict(cls, data_content): 11 | c = {} 12 | for field in dataclasses.fields(cls): 13 | d_type = DataClassBase._get_dataclass_type(field.type) 14 | if field.name in data_content: 15 | c[field.name] = d_type.from_dict(data_content[field.name]) if d_type else data_content[field.name] 16 | 17 | assert len(data_content) == len(c), f"{data_content.keys()} vs {c.keys()}" 18 | return cls(**c) 19 | 20 | def to_dict(self, skip_default=True): 21 | result = {} 22 | for f in dataclasses.fields(self): 23 | value = getattr(self, f.name) 24 | if dataclasses.is_dataclass(value): 25 | value = value.to_dict() 26 | elif isinstance(value, (list, tuple)): 27 | value = type(value)(v.to_dict() if dataclasses.is_dataclass(v) else v for v in value) 28 | if not skip_default or value != f.default: 29 | result[f.name] = value 30 | return result 31 | 32 | def validate(self): 33 | # Check the field types. 
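        # Illustration: a plain annotation such as `dataset_name: str` is checked directly
        # with isinstance, while Optional[...] annotations are unpacked via __args__
        # (e.g. Optional[str] -> (str, NoneType)) before the isinstance check below.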
34 | for field in dataclasses.fields(self): 35 | if hasattr(field.type, '__origin__') and field.type.__origin__ in (tuple, collections.abc.Sequence): 36 | expected_types = field.type.__origin__ 37 | elif hasattr(field.type, '__args__'): 38 | # Optional[].__args__ is (, NoneType) 39 | expected_types = field.type.__args__ 40 | else: 41 | expected_types = field.type 42 | 43 | if not isinstance(self.__dict__[field.name], expected_types): 44 | raise TypeError(f"Unexpected field type for {field.name}: Expected: {expected_types}. Actual: {type(self.__dict__[field.name])}") 45 | 46 | def _raise_value_error(self, config_name, msg=None): 47 | error_msg = f"Invalid {config_name}: {getattr(self, config_name)}." 48 | if msg: 49 | error_msg += ' ' + msg 50 | 51 | raise ValueError(error_msg) 52 | 53 | def _check_value(self, value_name, checker): 54 | value = getattr(self, value_name) 55 | if not checker(value): 56 | raise ValueError(f"Invalid {value_name}: {value}.") 57 | 58 | def _get_dataclass_type(field_type): 59 | """Returns dataclass type if the given type is dataclass or Optional[dataclass].""" 60 | if dataclasses.is_dataclass(field_type): 61 | return field_type 62 | if hasattr(field_type, '__args__'): 63 | args = field_type.__args__ 64 | if len(args) == 2 and type(None) in args: 65 | return next((t for t in args if dataclasses.is_dataclass(t)), None) 66 | return None 67 | -------------------------------------------------------------------------------- /vision_benchmark/common/prediction_submission.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | import logging 4 | import math 5 | import pathlib 6 | from typing import List 7 | 8 | from .data_class_base import DataClassBase 9 | from .constants import VISION_DATASET_STORAGE 10 | from vision_datasets import DatasetTypes, DatasetHub, Usages, DatasetManifest 11 | 12 | 13 | class Tasks: 14 | IC_MULTILABEL = DatasetTypes.IC_MULTILABEL 15 | IC_MULTICLASS = DatasetTypes.IC_MULTICLASS 16 | OBJECT_DETECTION = DatasetTypes.OD 17 | 18 | VALID_TYPES = [IC_MULTILABEL, IC_MULTICLASS, OBJECT_DETECTION] 19 | 20 | @staticmethod 21 | def is_valid(task): 22 | return task in Tasks.VALID_TYPES 23 | 24 | 25 | class Tracks: 26 | LINEAR_PROBING = 'linear_probing' 27 | TRANSFER_LEARNING = 'transfer_learning' 28 | ZERO_SHOT = 'zero_shot' 29 | 30 | VALID_TYPES = [LINEAR_PROBING, TRANSFER_LEARNING, ZERO_SHOT] 31 | 32 | @staticmethod 33 | def is_valid(task, track): 34 | if track not in Tracks.VALID_TYPES: 35 | return False 36 | 37 | if task in [Tasks.IC_MULTICLASS, Tasks.IC_MULTILABEL]: 38 | return True 39 | 40 | if task == Tasks.OBJECT_DETECTION: 41 | return track != Tracks.LINEAR_PROBING 42 | 43 | return False 44 | 45 | 46 | @dataclasses.dataclass(frozen=True) 47 | class PredictionSubmission(DataClassBase): 48 | dataset_name: str 49 | model_name: str 50 | created_by: str 51 | task: str 52 | track: str 53 | predictions: List 54 | 55 | def validate(self): 56 | vision_dataset_json = (pathlib.Path(__file__).resolve().parents[1] / 'resources' / 'datasets' / 'vision_datasets.json').read_text() 57 | hub = DatasetHub(vision_dataset_json) 58 | dataset_names = set([x['name'] for x in hub.list_data_version_and_types()]) 59 | 60 | self._check_value('dataset_name', lambda x: x and x in dataset_names) 61 | self._check_value('model_name', lambda x: x) 62 | self._check_value('created_by', lambda x: x) 63 | self._check_value('task', lambda x: Tasks.is_valid(x)) 64 | self._check_value('track', lambda x: 
Tracks.is_valid(self.task, x)) 65 | self._check_value('predictions', lambda x: x) 66 | dataset_manifest = hub.create_dataset_manifest(VISION_DATASET_STORAGE, None, self.dataset_name, usage=Usages.TEST_PURPOSE)[0] 67 | logging.info(f'Created test set manifest for {self.dataset_name}') 68 | for fold_idx, predictions in enumerate(self.predictions): 69 | PredictionSubmission.validate_predictions(dataset_manifest, predictions, fold_idx) 70 | 71 | @staticmethod 72 | def validate_predictions(dataset_manifest: DatasetManifest, predictions, fold_idx): 73 | assert predictions, f'fold {fold_idx}, empty predictions.' 74 | assert len(predictions) == len(dataset_manifest.images), f'fold {fold_idx}, Number of predictions does not match number of images.' 75 | 76 | if dataset_manifest.data_type in [DatasetTypes.IC_MULTICLASS, DatasetTypes.IC_MULTILABEL]: 77 | for i, probs in enumerate(predictions): 78 | if dataset_manifest.data_type == DatasetTypes.IC_MULTICLASS: 79 | sum_probs = sum(probs) 80 | assert math.isclose(sum_probs, 1.0, rel_tol=1e-3), f'fold {fold_idx}, Sum of predicted prob vector for image {i}: {sum_probs}, should be 1.0.' 81 | 82 | assert all([0.0 <= prob <= 1.0 for prob in probs]), f'fold {fold_idx}, Predicted prob for image {i} not in [0, 1]: {probs}' 83 | 84 | if dataset_manifest.data_type == DatasetTypes.OD: 85 | # [[[class_index, conf, L, T, R, B], [class_index, conf, L, T, R, B], ..., []], [...], ..., [...]] 86 | for i, img_wise_bboxes in enumerate(predictions): 87 | for bbox_pred in img_wise_bboxes: 88 | assert PredictionSubmission.is_valid_box(bbox_pred, len(dataset_manifest.labelmap)), f'fold {fold_idx}, Invalid predicted bbox for image {i}: {bbox_pred}' 89 | 90 | @staticmethod 91 | def is_valid_box(bbox_pred, num_classes): 92 | return len(bbox_pred) == 6 and (0 <= bbox_pred[0] < num_classes) and (0.0 <= bbox_pred[1] <= 1.0) and all([x >= 0 for x in bbox_pred[2:]]) and (bbox_pred[2] <= bbox_pred[4]) \ 93 | and (bbox_pred[3] <= bbox_pred[5]) 94 | 95 | 96 | @dataclasses.dataclass(frozen=True) 97 | class ModelInfoSubmission(DataClassBase): 98 | name: str 99 | author: str 100 | num_params_in_millions: int 101 | pretrained_data: str 102 | creation_time: str 103 | 104 | def validate(self): 105 | self._check_value('name', lambda x: x) 106 | self._check_value('author', lambda x: x) 107 | self._check_value('num_params_in_millions', lambda x: x > 0) 108 | self._check_value('pretrained_data', lambda x: x) 109 | self._check_value('creation_time', lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')) 110 | -------------------------------------------------------------------------------- /vision_benchmark/common/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def log_arg_env_config(args, config, output_dir): 8 | logging.info("=> collecting env info (might take some time)") 9 | logging.info("\n" + get_pretty_env_info()) 10 | logging.info(pprint.pformat(args)) 11 | logging.info(config) 12 | logging.info(f'=> saving logging info into: {output_dir}') 13 | 14 | 15 | def submit_predictions(prediction_list, submit_by, config, track, task): 16 | from vision_benchmark.commands.submit_predictions import submit_predictions_to_leaderboard, submit_model_to_leaderboard 17 | 18 | submission = { 19 | 'dataset_name': config.DATASET.DATASET, 20 | 'model_name': config.MODEL.NAME, 21 | 'track': track, 22 | 'task': task, 23 | 'created_by': submit_by, 24 | 
'predictions': [prediction_list] 25 | } 26 | 27 | logging.info('Submit model and predictions to leaderboard.') 28 | submit_predictions_to_leaderboard(submission) 29 | 30 | model_info = { 31 | "name": config.MODEL.NAME, 32 | "author": config.MODEL.AUTHOR, 33 | "num_params_in_millions": config.MODEL.NUM_PARAMS_IN_M, 34 | "pretrained_data": config.MODEL.PRETRAINED_DATA, 35 | "creation_time": config.MODEL.CREATION_TIME 36 | } 37 | 38 | submit_model_to_leaderboard(model_info) 39 | -------------------------------------------------------------------------------- /vision_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .default import _C as config 2 | from .default import update_config 3 | from .models import MODEL_SPECS 4 | 5 | __all__ = ['config', 'update_config', 'MODEL_SPECS'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/config/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from yacs.config import CfgNode as CN 6 | 7 | # high_resoluton_net related params for classification 8 | HIGH_RESOLUTION_NET = CN() 9 | HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] 10 | HIGH_RESOLUTION_NET.STEM_INPLANES = 64 11 | HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 12 | HIGH_RESOLUTION_NET.WITH_HEAD = True 13 | 14 | HIGH_RESOLUTION_NET.STAGE2 = CN() 15 | HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 16 | HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 17 | HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] 18 | HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] 19 | HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' 20 | HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'CAT' 21 | 22 | HIGH_RESOLUTION_NET.STAGE3 = CN() 23 | HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 24 | HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 25 | HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] 26 | HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] 27 | HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' 28 | HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'CAT' 29 | 30 | HIGH_RESOLUTION_NET.STAGE4 = CN() 31 | HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 32 | HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 33 | HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] 34 | HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] 35 | HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' 36 | HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'CAT' 37 | 38 | RESNEXT = CN() 39 | RESNEXT.NUM_LAYERS = 50 40 | RESNEXT.BASE_WIDTH = 4 41 | RESNEXT.CARDINALITY = 32 42 | RESNEXT.KERNEL_SIZE_STEM = 7 43 | 44 | RESNET = CN() 45 | RESNET.NUM_LAYERS = 50 46 | RESNET.KERNEL_SIZE_STEM = 7 47 | 48 | 49 | MODEL_SPECS = { 50 | 'cls_hrnet': HIGH_RESOLUTION_NET, 51 | } 52 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompts import class_map, template_map 2 | from .simple_tokenizer import SimpleTokenizer 3 | from .hfpt_tokenizer import HFPTTokenizer 4 | 5 | __all__ = ['class_map', 'template_map', 'SimpleTokenizer', 'HFPTTokenizer'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/datasets/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/datasets/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name=None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 
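    # Usage sketch (assumes a GPT-2 vocab is available through transformers):
    #   tok = HFPTTokenizer(pt_name='gpt2')
    #   ids = tok(['a photo of a dog.', 'a photo of a cat.'])  # -> LongTensor of shape [2, 77]
    # GPT-style vocabs have no pad/sep token, so the eos token is reused for both.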
53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqend = [] 64 | 65 | max_length = context_length 66 | 67 | if (self.added_cls_token > 0): 68 | seqstart = self.get_sot_token_list() 69 | max_length = max_length - 1 70 | 71 | if (self.added_sep_token > 0): 72 | seqend = self.get_eot_token_list() 73 | max_length = max_length - 1 74 | 75 | tokens = self.tokenizer( 76 | texts, padding=padding, 77 | truncation=True, 78 | max_length=max_length 79 | )['input_ids'] 80 | 81 | for i in range(len(tokens)): 82 | tokens[i] = seqstart + tokens[i] + seqend 83 | 84 | if (self.gpt_special_case): 85 | for i in range(len(tokens)): 86 | tokens[i][-1] = self.get_eot_token() 87 | 88 | result = torch.Tensor(tokens).type(torch.LongTensor) 89 | 90 | return result 91 | 92 | def get_vocab_size(self): 93 | return self.tokenizer.vocab_size 94 | 95 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 96 | return self.tokenize(texts, context_length) 97 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from typing import Union, List 6 | 7 | from .simple_tokenizer import SimpleTokenizer 8 | from .hfpt_tokenizer import HFPTTokenizer 9 | 10 | from .build import build_tokenizer 11 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/datasets/languages/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/build.py: -------------------------------------------------------------------------------- 1 | from .hfpt_tokenizer import HFPTTokenizer 2 | from .simple_tokenizer import SimpleTokenizer 3 | 4 | 5 | def build_tokenizer(tokenizer_name): 6 | tokenizer = None 7 | if tokenizer_name == 'clip': 8 | tokenizer = SimpleTokenizer() 9 | elif 'hf_' in tokenizer_name: 10 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 11 | elif 'hfc_' in tokenizer_name: 12 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 13 | else: 14 | raise ValueError('Unknown tokenizer') 15 | 16 | return tokenizer 17 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name = None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 17 | self.tokenizer = 
AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqtok = [] 64 | seqend = [] 65 | 66 | max_length = context_length 67 | 68 | if (self.added_cls_token > 0): 69 | seqstart = self.get_sot_token_list() 70 | max_length = max_length - 1 71 | 72 | if (self.added_sep_token > 0): 73 | seqend = self.get_eot_token_list() 74 | max_length = max_length - 1 75 | 76 | tokens = self.tokenizer( 77 | texts, padding=padding, 78 | truncation=True, 79 | max_length=max_length 80 | )['input_ids'] 81 | 82 | for i in range(len(tokens)): 83 | tokens[i] = seqstart + tokens[i] + seqend 84 | 85 | if (self.gpt_special_case): 86 | for i in range(len(tokens)): 87 | tokens[i][-1] = self.get_eot_token() 88 | 89 | #print(str(tokens)) 90 | 91 | result = torch.Tensor(tokens).type(torch.LongTensor) 92 | 93 | return result 94 | 95 | def get_vocab_size(self): 96 | return self.tokenizer.vocab_size 97 | 98 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 99 | return self.tokenize(texts, context_length) 100 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/prompt_engineering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def get_prompt_templates(): 6 | prompt_templates = [ 7 | '{}.', 8 | 'a photo of a {}.', 9 | 'a bad photo of a {}.', 10 | 'a photo of many {}.', 11 | 'a sculpture of a {}.', 12 | 'a photo of the hard to see {}.', 13 | 'a low resolution photo of the {}.', 14 | 'a rendering of a {}.', 15 | 'graffiti of a {}.', 16 | 'a bad photo of the {}.', 17 | 'a cropped photo of the {}.', 18 | 'a tattoo of a {}.', 19 | 'the embroidered {}.', 20 | 'a photo of a hard to see {}.', 21 | 'a bright photo of a {}.', 22 | 'a photo of a clean {}.', 23 | 'a photo of a dirty {}.', 24 | 'a dark photo of the {}.', 25 | 'a drawing of a {}.', 26 | 'a photo of my {}.', 27 | 'the plastic {}.', 28 | 'a 
photo of the cool {}.', 29 | 'a close-up photo of a {}.', 30 | 'a black and white photo of the {}.', 31 | 'a painting of the {}.', 32 | 'a painting of a {}.', 33 | 'a pixelated photo of the {}.', 34 | 'a sculpture of the {}.', 35 | 'a bright photo of the {}.', 36 | 'a cropped photo of a {}.', 37 | 'a plastic {}.', 38 | 'a photo of the dirty {}.', 39 | 'a jpeg corrupted photo of a {}.', 40 | 'a blurry photo of the {}.', 41 | 'a photo of the {}.', 42 | 'a good photo of the {}.', 43 | 'a rendering of the {}.', 44 | 'a {} in a video game.', 45 | 'a photo of one {}.', 46 | 'a doodle of a {}.', 47 | 'a close-up photo of the {}.', 48 | 'the origami {}.', 49 | 'the {} in a video game.', 50 | 'a sketch of a {}.', 51 | 'a doodle of the {}.', 52 | 'a origami {}.', 53 | 'a low resolution photo of a {}.', 54 | 'the toy {}.', 55 | 'a rendition of the {}.', 56 | 'a photo of the clean {}.', 57 | 'a photo of a large {}.', 58 | 'a rendition of a {}.', 59 | 'a photo of a nice {}.', 60 | 'a photo of a weird {}.', 61 | 'a blurry photo of a {}.', 62 | 'a cartoon {}.', 63 | 'art of a {}.', 64 | 'a sketch of the {}.', 65 | 'a embroidered {}.', 66 | 'a pixelated photo of a {}.', 67 | 'itap of the {}.', 68 | 'a jpeg corrupted photo of the {}.', 69 | 'a good photo of a {}.', 70 | 'a plushie {}.', 71 | 'a photo of the nice {}.', 72 | 'a photo of the small {}.', 73 | 'a photo of the weird {}.', 74 | 'the cartoon {}.', 75 | 'art of the {}.', 76 | 'a drawing of the {}.', 77 | 'a photo of the large {}.', 78 | 'a black and white photo of a {}.', 79 | 'the plushie {}.', 80 | 'a dark photo of a {}.', 81 | 'itap of a {}.', 82 | 'graffiti of the {}.', 83 | 'a toy {}.', 84 | 'itap of my {}.', 85 | 'a photo of a cool {}.', 86 | 'a photo of a small {}.', 87 | 'a tattoo of the {}.', 88 | ] 89 | return prompt_templates 90 | 91 | 92 | def prompt_engineering(classnames): 93 | prompt_templates = get_prompt_templates() 94 | temp_idx = np.random.randint(len(prompt_templates)) 95 | 96 | if isinstance(classnames, list): 97 | classname = random.choice(classnames) 98 | else: 99 | classname = classnames 100 | 101 | return prompt_templates[temp_idx].replace('{}', classname.replace(',', '').replace('+', ' ')) 102 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/languages/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | @lru_cache() 13 | def default_bpe(): 14 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 15 | 16 | 17 | @lru_cache() 18 | def bytes_to_unicode(): 19 | """ 20 | Returns list of utf-8 byte and a corresponding list of unicode strings. 21 | The reversible bpe codes work on unicode strings. 22 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 23 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 24 | This is a signficant percentage of your normal, say, 32K bpe vocab. 25 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 26 | And avoids mapping to whitespace/control characters the bpe code barfs on. 
27 | """ 28 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 29 | cs = bs[:] 30 | n = 0 31 | for b in range(2**8): 32 | if b not in bs: 33 | bs.append(b) 34 | cs.append(2**8+n) 35 | n += 1 36 | cs = [chr(n) for n in cs] 37 | return dict(zip(bs, cs)) 38 | 39 | 40 | def get_pairs(word): 41 | """Return set of symbol pairs in a word. 42 | Word is represented as tuple of symbols (symbols being variable-length strings). 43 | """ 44 | pairs = set() 45 | prev_char = word[0] 46 | for char in word[1:]: 47 | pairs.add((prev_char, char)) 48 | prev_char = char 49 | return pairs 50 | 51 | 52 | def basic_clean(text): 53 | text = ftfy.fix_text(text) 54 | text = html.unescape(html.unescape(text)) 55 | return text.strip() 56 | 57 | 58 | def whitespace_clean(text): 59 | text = re.sub(r'\s+', ' ', text) 60 | text = text.strip() 61 | return text 62 | 63 | 64 | class SimpleTokenizer(object): 65 | def __init__(self, bpe_path: str = default_bpe()): 66 | self.byte_encoder = bytes_to_unicode() 67 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 68 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 69 | merges = merges[1:49152-256-2+1] 70 | merges = [tuple(merge.split()) for merge in merges] 71 | vocab = list(bytes_to_unicode().values()) 72 | vocab = vocab + [v+'' for v in vocab] 73 | for merge in merges: 74 | vocab.append(''.join(merge)) 75 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 76 | self.encoder = dict(zip(vocab, range(len(vocab)))) 77 | self.decoder = {v: k for k, v in self.encoder.items()} 78 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 79 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 80 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 81 | 82 | def bpe(self, token): 83 | if token in self.cache: 84 | return self.cache[token] 85 | word = tuple(token[:-1]) + ( token[-1] + '',) 86 | pairs = get_pairs(word) 87 | 88 | if not pairs: 89 | return token+'' 90 | 91 | while True: 92 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 93 | if bigram not in self.bpe_ranks: 94 | break 95 | first, second = bigram 96 | new_word = [] 97 | i = 0 98 | while i < len(word): 99 | try: 100 | j = word.index(first, i) 101 | new_word.extend(word[i:j]) 102 | i = j 103 | except: 104 | new_word.extend(word[i:]) 105 | break 106 | 107 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 108 | new_word.append(first+second) 109 | i += 2 110 | else: 111 | new_word.append(word[i]) 112 | i += 1 113 | new_word = tuple(new_word) 114 | word = new_word 115 | if len(word) == 1: 116 | break 117 | else: 118 | pairs = get_pairs(word) 119 | word = ' '.join(word) 120 | self.cache[token] = word 121 | return word 122 | 123 | def encode(self, text): 124 | bpe_tokens = [] 125 | text = whitespace_clean(basic_clean(text)).lower() 126 | for token in re.findall(self.pat, text): 127 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 128 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 129 | return bpe_tokens 130 | 131 | def decode(self, tokens): 132 | text = ''.join([self.decoder[token] for token in tokens]) 133 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 134 | return text 135 | 136 | def get_vocab_size(self): 137 | return 49408 138 | 139 | def 
get_eot_token(self): 140 | return self.encoder["<|endoftext|>"] 141 | 142 | def get_sot_token(self): 143 | return self.encoder["<|startoftext|>"] 144 | 145 | def check_added_tokens(self): 146 | return 0 147 | 148 | def get_tokenizer_obj(self): 149 | return None 150 | 151 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 152 | if isinstance(texts, str): 153 | texts = [texts] 154 | 155 | sot_token = self.encoder["<|startoftext|>"] 156 | eot_token = self.encoder["<|endoftext|>"] 157 | all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts] 158 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 159 | 160 | for i, tokens in enumerate(all_tokens): 161 | if len(tokens) > context_length: 162 | tokens = tokens[:context_length] 163 | # raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 164 | 165 | result[i, :len(tokens)] = torch.tensor(tokens) 166 | 167 | return result 168 | 169 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 170 | return self.tokenize(texts, context_length) 171 | -------------------------------------------------------------------------------- /vision_benchmark/datasets/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | 13 | @lru_cache() 14 | def default_bpe(): 15 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 
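Illustrative example: get_pairs(('h', 'e', 'l', 'l', 'o')) returns
{('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.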
44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + (token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except Exception: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | 137 | def get_vocab_size(self): 138 | return 49408 139 | 140 | def get_eot_token(self): 141 | return self.encoder["<|endoftext|>"] 142 | 143 | def get_sot_token(self): 144 | return self.encoder["<|startoftext|>"] 145 | 146 | def check_added_tokens(self): 147 | return 0 148 | 149 | def get_tokenizer_obj(self): 150 | return None 151 | 152 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 153 | if isinstance(texts, str): 154 | texts = [texts] 155 | 156 | sot_token = self.encoder["<|startoftext|>"] 157 | eot_token = 
self.encoder["<|endoftext|>"] 158 | all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts] 159 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 160 | 161 | for i, tokens in enumerate(all_tokens): 162 | if len(tokens) > context_length: 163 | tokens = tokens[:context_length] 164 | # raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 165 | 166 | result[i, :len(tokens)] = torch.tensor(tokens) 167 | 168 | return result 169 | 170 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 171 | return self.tokenize(texts, context_length) 172 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature import extract_features, extract_text_features, construct_dataloader 2 | from .full_model_finetune import full_model_finetune 3 | from .clip_zeroshot_evaluator import clip_zeroshot_evaluator 4 | 5 | __all__ = ['extract_features', 'linear_classifier', 'lr_classifier', 'extract_text_features', 'clip_zeroshot_evaluator', 'construct_dataloader', 'full_model_finetune', 'linear_classifier_contrast'] 6 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/clip_zeroshot_evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLIP zeroshot evaluation 3 | """ 4 | import torch 5 | import torch.nn.functional as F 6 | from .metric import get_metric 7 | 8 | 9 | def clip_zeroshot_evaluator(image_features, text_features, image_labels, config): 10 | metric = get_metric(config.TEST.METRIC) 11 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | image_features = torch.from_numpy(image_features).to(device) 13 | text_features = torch.from_numpy(text_features).to(device) 14 | image_labels = torch.from_numpy(image_labels).to(device) 15 | 16 | # Normalize image_features 17 | image_features = F.normalize(image_features) 18 | 19 | # Compute logits 20 | logits = (100. 
* image_features @ text_features).softmax(dim=-1) 21 | result = metric(image_labels.squeeze().cpu().detach().numpy(), logits.cpu().detach().numpy()) 22 | return result, logits, metric.__name__ 23 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.utils.data 4 | from PIL import Image 5 | from torchvision import transforms 6 | 7 | 8 | class Voc2007Classification(torch.utils.data.Dataset): 9 | def __init__(self, data_root, image_set="train", transform=None): 10 | """ 11 | Pascal voc2007 training/validation data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 12 | test data: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 13 | """ 14 | self.data_root = self._update_path(data_root, image_set) 15 | self.transform = transform 16 | self.labels = self._read_annotation(image_set) 17 | self.images = list(self.labels.keys()) 18 | 19 | @staticmethod 20 | def _update_path(data_root, image_set): 21 | if image_set == "train" or image_set == "val": 22 | data_root += "train/VOCdevkit/VOC2007" 23 | elif image_set == "test": 24 | data_root += "test/VOCdevkit 2/VOC2007" 25 | else: 26 | raise Exception("Incorrect image set!") 27 | return data_root 28 | 29 | def __getitem__(self, index): 30 | img_path = os.path.join(self.data_root, 'JPEGImages/' + self.images[index] + '.jpg') 31 | image = Image.open(img_path).convert("RGB") 32 | if self.transform is not None: 33 | image = self.transform(image) 34 | else: 35 | image = transforms.ToTensor()(image) 36 | label = self.labels[self.images[index]] 37 | label = torch.LongTensor(label) 38 | return image, label 39 | 40 | def __len__(self): 41 | return len(self.images) 42 | 43 | def _read_annotation(self, image_set="train"): 44 | """ 45 | Annotation interpolation, refer to: 46 | http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00093000000000000000 47 | """ 48 | object_categories = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 49 | 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 50 | 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 51 | annotation_folder = os.path.join(self.data_root, "ImageSets/Main/") 52 | files = [file_name for file_name in os.listdir(annotation_folder) if file_name.endswith("_" + image_set + ".txt")] 53 | labels_all = dict() 54 | for file_name in files: 55 | label_str = file_name.split("_")[0] 56 | label_int = object_categories.index(label_str) 57 | with open(annotation_folder + "/" + file_name, "r") as fread: 58 | for line in fread.readlines(): 59 | index = line[:6] 60 | if index not in labels_all.keys(): 61 | labels_all[index] = [0] * len(object_categories) 62 | flag = 1 63 | if line[7:9] and int(line[7:9]) != 1: 64 | flag = -1 65 | if flag == 1: 66 | labels_all[index][label_int] = 1 67 | return labels_all 68 | 69 | -------------------------------------------------------------------------------- /vision_benchmark/evaluation/metric.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sklearn.metrics import roc_auc_score 4 | import vision_evaluation.evaluators as v_eval 5 | 6 | 7 | def accuracy(y_label, y_pred): 8 | """ Compute Top1 accuracy 9 | Args: 10 | y_label: the ground truth labels. Shape (N,) 11 | y_pred: the prediction of a model. 
Shape (N,) 12 | """ 13 | evaluator = v_eval.TopKAccuracyEvaluator(1) 14 | evaluator.add_predictions(predictions=y_pred, targets=y_label) 15 | return evaluator.get_report()['accuracy_top1'] 16 | 17 | 18 | def map_11_points(y_label, y_pred_proba): 19 | evaluator = v_eval.MeanAveragePrecisionNPointsEvaluator(11) 20 | evaluator.add_predictions(predictions=y_pred_proba, targets=y_label) 21 | return evaluator.get_report()[evaluator._get_id()] 22 | 23 | 24 | def balanced_accuracy_score(y_label, y_pred): 25 | evaluator = v_eval.BalancedAccuracyScoreEvaluator() 26 | evaluator.add_predictions(y_pred, y_label) 27 | return evaluator.get_report()[evaluator._get_id()] 28 | 29 | 30 | def roc_auc(y_true, y_score): 31 | if y_score.shape[1] == 2: 32 | return roc_auc_score(y_true, y_score[:, 1]) 33 | return roc_auc_score(y_true, y_score) 34 | 35 | 36 | def get_metric(metric_name): 37 | if metric_name == "accuracy": 38 | return accuracy 39 | if metric_name == "mean-per-class": 40 | return balanced_accuracy_score 41 | if metric_name == "11point_mAP": 42 | return map_11_points 43 | if metric_name == "roc_auc": 44 | return roc_auc 45 | 46 | logging.error("Undefined metric.") 47 | -------------------------------------------------------------------------------- /vision_benchmark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cls_example 2 | from . import clip_example 3 | from . import clip_react 4 | from . import cls_swin 5 | from . import clip_swin 6 | from . import mae 7 | from . import mocov3 8 | from . import declip 9 | 10 | __all__ = ['cls_example', 'clip_example', 'clip_react', 'cls_swin', 'clip_swin', 'mae', 'mocov3', 'declip'] 11 | -------------------------------------------------------------------------------- /vision_benchmark/models/clip_example.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Example(nn.Module): 5 | def encode_image(): 6 | """ 7 | This method is called to extract image features for evaluation. 8 | """ 9 | pass 10 | 11 | def encode_text(): 12 | """ 13 | This method is called to extract text features for evaluation. 14 | """ 15 | pass 16 | 17 | 18 | def get_zeroshot_model(config, **kwargs): 19 | """ 20 | Specify your model here 21 | """ 22 | model = Example() 23 | return model 24 | -------------------------------------------------------------------------------- /vision_benchmark/models/cls_example.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Example(nn.Module): 5 | def forward_features(): 6 | """ 7 | This method is called to extract features for evaluation. 
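It is expected to map a batch of images to a feature tensor
(assumed here to have shape (batch_size, feature_dim)).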
8 | """ 9 | pass 10 | 11 | 12 | def get_cls_model(config, **kwargs): 13 | """ 14 | Specify your model here 15 | """ 16 | model = Example() 17 | return model 18 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | 4 | from .declip_model import declip as _declip 5 | from .declip_model import slip as _slip 6 | from .declip_model import filip as _filip 7 | 8 | def get_model(config): 9 | if config.MODEL.NAME in ['filip_vitb32', 'defilip_vitb32']: 10 | model = _filip.filip_vitb32(**config.MODEL.SPEC.DECLIP) 11 | elif config.MODEL.NAME == 'slip_vitb32': 12 | model = _slip.slip_vitb32(**config.MODEL.SPEC.DECLIP) 13 | else: 14 | model = _declip.declip_clip_vitb32(**config.MODEL.SPEC.DECLIP) 15 | 16 | model_file = config.TEST.MODEL_FILE 17 | logging.info(f'=> load model file: {model_file}') 18 | 19 | if model_file.startswith('http'): 20 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 21 | else: 22 | checkpoint = torch.load(model_file, map_location="cpu") 23 | 24 | # rename moco pre-trained keys 25 | state_dict = checkpoint['model'] 26 | for k in list(state_dict.keys()): 27 | if k.startswith('module.'): 28 | state_dict[k[len("module."):]] = state_dict[k] 29 | del state_dict[k] 30 | 31 | incompatible = model.load_state_dict(state_dict, strict=False) 32 | 33 | if incompatible.missing_keys: 34 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 35 | if incompatible.unexpected_keys: 36 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 37 | 38 | return model 39 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import ( # noqa: F401 2 | clip_vitb32 3 | ) 4 | 5 | from .declip import declip_vitb32 6 | 7 | from .filip import filip_vitb32 8 | 9 | from .slip import slip_vitb32 10 | 11 | from .defilip import defilip_vitb32 12 | 13 | 14 | 15 | def model_entry(config): 16 | return globals()[config['type']](**config['kwargs']) 17 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/clip.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from socket import IP_DEFAULT_MULTICAST_LOOP 3 | from typing import Tuple, Union, List 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | import numpy as np 8 | import os 9 | 10 | import timm 11 | from .image_encoder.visual_transformer import visual_transformer_B32, visual_transformer_B16 12 | # from .image_encoder.modified_resnet import modified_resnet_R50, modified_resnet_R101 13 | from .text_encoder.text_transformer import text_transformers 14 | 15 | 16 | BN = None 17 | 18 | __all__ = ['clip_res50', 'clip_vitb32'] 19 | 20 | class AllGather(torch.autograd.Function): 21 | 22 | @staticmethod 23 | def forward(ctx, tensor): 24 | ctx.rank = link.get_rank() 25 | ctx.world_size = link.get_world_size() 26 | 27 | # y = tensor.new(ctx.world_size, *tensor.size()) 28 | 29 | y = [tensor.new(*tensor.size()) for _ in range(ctx.world_size)] 30 | 31 | link.allgather(y, tensor) 32 | 33 | y = torch.cat(y, 0).view(-1, *tensor.size()) 34 | 35 | 
return y 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | in_grad = torch.zeros_like(grad_output) 40 | in_grad.copy_(grad_output) 41 | # sum grad for gathered tensor 42 | link.allreduce(in_grad) 43 | # split 44 | return in_grad[ctx.rank] 45 | 46 | 47 | 48 | class CLIP(nn.Module): 49 | def __init__(self,image_encode, text_encode, use_allgather): 50 | super().__init__() 51 | self.use_allgather = use_allgather 52 | self.visual =image_encode 53 | self.encode_text = text_encode 54 | self.logit_scale = nn.Parameter(torch.ones([1])) 55 | # self.logit_scale = nn.Parameter(torch.ones([])) 56 | nn.init.constant_(self.logit_scale, np.log(1/0.07)) 57 | #nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) 58 | 59 | def text_parameters(self): 60 | param = [self.logit_scale] 61 | if self.encode_text.text_encode_type == 'Transformer': 62 | param.append(self.encode_text.positional_embedding) 63 | elif self.encode_text.text_encode_type == 'Bert': 64 | # print('Bert', self.encode_text.text_transformer.cls.predictions, flush=True) 65 | # param.extend([self.encode_text.text_transformer.cls.predictions.decoder.weight, 66 | # self.encode_text.text_transformer.cls.predictions.bias]) 67 | param.extend([self.encode_text.text_transformer.cls.predictions.bias]) 68 | return param 69 | 70 | def text_modules(self): 71 | if self.encode_text.text_encode_type == 'Transformer': 72 | return [self.encode_text.transformer, self.encode_text.text_projection, self.encode_text.token_embedding, self.encode_text.ln_final] 73 | elif self.encode_text.text_encode_type == 'Bert': 74 | # print('Bert', self.encode_text.text_transformer, flush=True) 75 | return [self.encode_text.text_transformer.bert, self.encode_text.text_projection, 76 | self.encode_text.text_transformer.cls.predictions.transform] 77 | # self.encode_text.text_transformer.cls.predictions.decoder, # decoder: bias 78 | else: 79 | import ipdb 80 | ipdb.set_trace() 81 | return [self.encode_text.text_transformer, self.encode_text.text_projection] 82 | 83 | def visual_parameters(self): 84 | return [] 85 | 86 | def visual_modules(self): 87 | return [self.visual] 88 | 89 | @property 90 | def dtype(self): 91 | try: 92 | return self.visual.conv1.weight.dtype 93 | except: 94 | try: 95 | return self.visual.head.weight.dtype 96 | except: 97 | try: 98 | return self.visual.stem[0].weight.dtype 99 | except: 100 | return self.encode_text.text_projection.weight.dtype 101 | 102 | def encode_image(self, image): 103 | return self.visual(image.type(self.dtype)) 104 | 105 | def sample_captions(self, texts): 106 | return [text[0] for text in texts] 107 | 108 | def all_gather(self, input): 109 | output = AllGather.apply(input) 110 | output = output.view(-1, *(output.shape[2:])) 111 | return output 112 | 113 | def forward(self, input, all_gather=False): 114 | # input 115 | images = input['images'] 116 | texts = input['captions'] 117 | texts = self.sample_captions(texts) 118 | # text&image encode 119 | image_features = self.encode_image(images) 120 | text_features = self.encode_text(texts) 121 | 122 | 123 | # normalized features 124 | image_features = image_features / (image_features.norm(dim=-1, keepdim=True)) 125 | text_features = text_features / (text_features.norm(dim=-1, keepdim=True)+1e-10) 126 | 127 | # cosine similarity as logits 128 | logit_scale = self.logit_scale.exp() 129 | logit_scale.data = torch.clamp(logit_scale.data, max=100) 130 | 131 | if self.training and self.use_allgather or all_gather: 132 | gathered_image_features = 
self.all_gather(image_features) 133 | gathered_text_features = self.all_gather(text_features) 134 | 135 | logits_per_image = logit_scale * image_features @ gathered_text_features.t() 136 | logits_per_text = logit_scale * text_features @ gathered_image_features.t() 137 | else: 138 | logits_per_image = logit_scale * image_features @ text_features.t() 139 | logits_per_text = logit_scale * text_features @ image_features.t() 140 | 141 | return logits_per_image, logits_per_text 142 | 143 | 144 | def clip_res50(**kwargs): 145 | """ 146 | Constructs a clip_res50 model. 147 | """ 148 | image_encode = modified_resnet_R50(**kwargs['image_encode']) 149 | text_encode = text_transformers(**kwargs['text_encode']) 150 | model = CLIP(image_encode,text_encode,**kwargs['clip']) 151 | return model 152 | 153 | def clip_vitb32(**kwargs): 154 | """' 155 | Constructs a clip_ViT_B32 model. 156 | """ 157 | image_encode = visual_transformer_B32(**kwargs['image_encode']) 158 | text_encode = text_transformers(**kwargs['text_encode']) 159 | model = CLIP(image_encode,text_encode,**kwargs['clip']) 160 | return model 161 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/image_encoder/base_transformer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.utils.checkpoint import checkpoint_sequential 6 | 7 | global LAYER_NORM 8 | LAYER_NORM = True 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | if LAYER_NORM: 15 | ret = super().forward(x) 16 | else: 17 | ret = x 18 | return ret 19 | # orig_type = x.dtype 20 | # ret = super().forward(x.type(torch.float32)) 21 | # return ret.type(orig_type) 22 | 23 | 24 | class QuickGELU(nn.Module): 25 | def forward(self, x: torch.Tensor): 26 | return x * torch.sigmoid(1.702 * x) 27 | 28 | 29 | class ResidualAttentionBlock(nn.Module): 30 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, dropout: float = 0.): 31 | super().__init__() 32 | 33 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout) 34 | self.ln_1 = LayerNorm(d_model) 35 | self.mlp = nn.Sequential(OrderedDict([ 36 | ("c_fc", nn.Linear(d_model, d_model * 4)), 37 | ("gelu", QuickGELU()), 38 | # ("dropout_1", nn.Dropout(dropout)), 39 | ("c_proj", nn.Linear(d_model * 4, d_model)), 40 | # ("dropout_2", nn.Dropout(dropout)) 41 | ])) 42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | self.attn_mask = self.attn_mask.to( 47 | dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 48 | return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)[0] 49 | 50 | def forward(self, x: torch.Tensor): 51 | x = x + self.attention(self.ln_1(x)) 52 | x = x + self.mlp(self.ln_2(x)) 53 | return x 54 | 55 | 56 | class Transformer(nn.Module): 57 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, checkpoint: bool = False, dropout: float = 0., emb_dropout: float = 0.): 58 | super().__init__() 59 | self.width = width 60 | self.layers = layers 61 | self.checkpoint = checkpoint 62 | self.dropout = nn.Dropout(emb_dropout) 63 | self.resblocks = nn.Sequential( 64 | *[ResidualAttentionBlock(width, heads, attn_mask, dropout=dropout) for _ in range(layers)]) 65 | 66 | def checkpoint_fwd(self, 
layer, input, segments=2): 67 | """checkpoint forward""" 68 | # Make sure that the input to checkpoint have requires_grad=True, so that 69 | # the autograd can take care of the checkpointed part of model 70 | if not input.requires_grad: 71 | input = input.detach() 72 | input.requires_grad = True 73 | return checkpoint_sequential(layer, segments, input) 74 | 75 | def forward(self, x: torch.Tensor): 76 | x = self.dropout(x) 77 | if self.checkpoint: 78 | return self.checkpoint_fwd(self.resblocks, x, self.layers) 79 | return self.resblocks(x) 80 | 81 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/image_encoder/visual_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .base_transformer import Transformer, LayerNorm 5 | 6 | class VisualTransformer(nn.Module): 7 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, embed_dim: int, checkpoint: bool, dropout: float=0, emb_dropout: float=0): 8 | super().__init__() 9 | self.input_resolution = input_resolution 10 | output_dim = embed_dim 11 | self.output_dim = output_dim 12 | self.freeze_conv1 = True 13 | # self.freeze_conv1 = False 14 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, 15 | kernel_size=patch_size, stride=patch_size, bias=False) 16 | 17 | scale = width ** -0.5 18 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 19 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 20 | self.ln_pre = LayerNorm(width) 21 | 22 | self.transformer = Transformer(width, layers, heads, checkpoint=checkpoint, dropout=dropout, emb_dropout=emb_dropout) 23 | 24 | self.ln_post = LayerNorm(width) 25 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) 26 | self.initialize_parameters() 27 | 28 | def initialize_parameters(self): 29 | nn.init.normal_(self.positional_embedding, std=0.01) 30 | 31 | proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) 32 | attn_std = self.transformer.width ** -0.5 33 | fc_std = (2 * self.transformer.width) ** -0.5 34 | for block in self.transformer.resblocks: 35 | nn.init.normal_(block.attn.in_proj_weight, std=attn_std) 36 | nn.init.normal_(block.attn.out_proj.weight, std=proj_std) 37 | nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) 38 | nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) 39 | 40 | def train(self, mode=True): 41 | self.training = mode 42 | for module in self.children(): 43 | module.train(mode) 44 | 45 | if self.freeze_conv1: 46 | for layer in [self.conv1]: 47 | layer.eval() 48 | for param in layer.parameters(): 49 | param.requires_grad = False 50 | return self 51 | 52 | 53 | def forward(self, x: torch.Tensor, return_dense=False, return_feature=False): 54 | x = self.conv1(x) # shape = [*, width, grid, grid] 55 | # shape = [*, width, grid ** 2] 56 | x = x.reshape(x.shape[0], x.shape[1], -1) 57 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 58 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], 59 | dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 60 | x = x + self.positional_embedding.to(x.dtype) 61 | x = self.ln_pre(x) 62 | 63 | x = x.permute(1, 0, 2) # NLD -> LND 64 | x = self.transformer(x) 65 | x = x.permute(1, 0, 2) # LND -> NLD 66 | dense_feat = x[:, 
1:, :] 67 | x = self.ln_post(x[:, 0, :]) 68 | feature = x 69 | 70 | if self.proj is not None: 71 | x = x @ self.proj 72 | 73 | ret = [x] 74 | if return_dense: 75 | ret.append(dense_feat) 76 | if return_feature: 77 | ret.append(feature) 78 | if len(ret) == 1: 79 | return ret[0] 80 | return tuple(ret) 81 | # if return_dense: 82 | # return x, dense_feat 83 | 84 | # return x 85 | 86 | def visual_transformer_B32(**kwargs): 87 | vision_width = 768 88 | vision_layers = 12 89 | vision_heads = vision_width // 64 90 | 91 | default_kwargs = { 92 | # 'output_dim': 512, from config 93 | 'layers':vision_layers, 94 | 'heads': vision_heads, 95 | 'input_resolution': 224, 96 | 'patch_size': 32, 97 | 'width': vision_width, 98 | 'checkpoint': False 99 | } 100 | default_kwargs.update(**kwargs) 101 | model = VisualTransformer(**default_kwargs) 102 | return model 103 | 104 | def visual_transformer_B16(**kwargs): 105 | vision_width = 768 106 | vision_layers = 12 107 | vision_heads = vision_width // 64 108 | 109 | default_kwargs = { 110 | # 'output_dim': 512, from config 111 | 'layers':vision_layers, 112 | 'heads': vision_heads, 113 | 'input_resolution': 224, 114 | 'patch_size': 16, 115 | 'width': vision_width, 116 | 'checkpoint': False 117 | } 118 | default_kwargs.update(**kwargs) 119 | model = VisualTransformer(**default_kwargs) 120 | return model 121 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/text_encoder/base_transformer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.utils.checkpoint import checkpoint_sequential 6 | 7 | global LAYER_NORM 8 | LAYER_NORM = True 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | if LAYER_NORM: 15 | ret = super().forward(x) 16 | else: 17 | ret = x 18 | return ret 19 | # orig_type = x.dtype 20 | # ret = super().forward(x.type(torch.float32)) 21 | # return ret.type(orig_type) 22 | 23 | 24 | class QuickGELU(nn.Module): 25 | def forward(self, x: torch.Tensor): 26 | return x * torch.sigmoid(1.702 * x) 27 | 28 | 29 | class ResidualAttentionBlock(nn.Module): 30 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, dropout: float = 0.): 31 | super().__init__() 32 | 33 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout) 34 | self.ln_1 = LayerNorm(d_model) 35 | self.mlp = nn.Sequential(OrderedDict([ 36 | ("c_fc", nn.Linear(d_model, d_model * 4)), 37 | ("gelu", QuickGELU()), 38 | # ("dropout_1", nn.Dropout(dropout)), 39 | ("c_proj", nn.Linear(d_model * 4, d_model)), 40 | # ("dropout_2", nn.Dropout(dropout)) 41 | ])) 42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | self.attn_mask = self.attn_mask.to( 47 | dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 48 | return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)[0] 49 | 50 | def forward(self, x: torch.Tensor): 51 | x = x + self.attention(self.ln_1(x)) 52 | x = x + self.mlp(self.ln_2(x)) 53 | return x 54 | 55 | 56 | class Transformer(nn.Module): 57 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, checkpoint: bool = False, dropout: float = 0., emb_dropout: float = 0.): 58 | super().__init__() 59 | self.width = width 60 | self.layers = layers 61 | 
self.checkpoint = checkpoint 62 | self.dropout = nn.Dropout(emb_dropout) 63 | self.resblocks = nn.Sequential( 64 | *[ResidualAttentionBlock(width, heads, attn_mask, dropout=dropout) for _ in range(layers)]) 65 | 66 | def checkpoint_fwd(self, layer, input, segments=2): 67 | """checkpoint forward""" 68 | # Make sure that the input to checkpoint have requires_grad=True, so that 69 | # the autograd can take care of the checkpointed part of model 70 | if not input.requires_grad: 71 | input = input.detach() 72 | input.requires_grad = True 73 | return checkpoint_sequential(layer, segments, input) 74 | 75 | def forward(self, x: torch.Tensor): 76 | x = self.dropout(x) 77 | if self.checkpoint: 78 | return self.checkpoint_fwd(self.resblocks, x, self.layers) 79 | return self.resblocks(x) 80 | 81 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/__init__.py: -------------------------------------------------------------------------------- 1 | """The lightly.models.modules package provides reusable modules. 2 | 3 | This package contains reusable modules such as the NNmemoryBankModule which 4 | can be combined with any lightly model. 5 | 6 | """ 7 | 8 | # Copyright (c) 2021. Lightly AG and its affiliates. 9 | # All Rights Reserved 10 | 11 | from .nn_memory_bank import NNMemoryBankModule 12 | from .memory_bank_cuda import MemoryBankModule 13 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/memory_bank.py: -------------------------------------------------------------------------------- 1 | """ Memory Bank Wrapper """ 2 | 3 | # Copyright (c) 2020. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | import functools 8 | 9 | class MemoryBankModule(torch.nn.Module): 10 | """Memory bank implementation 11 | 12 | This is a parent class to all loss functions implemented by the lightly 13 | Python package. This way, any loss can be used with a memory bank if 14 | desired. 15 | 16 | Attributes: 17 | size: 18 | Number of keys the memory bank can store. If set to 0, 19 | memory bank is not used. 20 | 21 | Examples: 22 | >>> class MyLossFunction(MemoryBankModule): 23 | >>> 24 | >>> def __init__(self, memory_bank_size: int = 2 ** 16): 25 | >>> super(MyLossFunction, self).__init__(memory_bank_size) 26 | >>> 27 | >>> def forward(self, output: torch.Tensor, 28 | >>> labels: torch.Tensor = None): 29 | >>> 30 | >>> output, negatives = super( 31 | >>> MyLossFunction, self).forward(output) 32 | >>> 33 | >>> if negatives is not None: 34 | >>> # evaluate loss with negative samples 35 | >>> else: 36 | >>> # evaluate loss without negative samples 37 | 38 | """ 39 | 40 | def __init__(self, size: int = 2 ** 16): 41 | 42 | super(MemoryBankModule, self).__init__() 43 | 44 | if size < 0: 45 | msg = f'Illegal memory bank size {size}, must be non-negative.' 46 | raise ValueError(msg) 47 | 48 | self.size = size 49 | 50 | self.bank = None 51 | self.bank_ptr = None 52 | 53 | @torch.no_grad() 54 | def _init_memory_bank(self, dim: int): 55 | """Initialize the memory bank if it's empty 56 | 57 | Args: 58 | dim: 59 | The dimension of the which are stored in the bank. 
60 | 61 | """ 62 | # create memory bank 63 | # we could use register buffers like in the moco repo 64 | # https://github.com/facebookresearch/moco but we don't 65 | # want to pollute our checkpoints 66 | self.bank = torch.randn(dim, self.size) 67 | self.bank = torch.nn.functional.normalize(self.bank, dim=0) 68 | self.bank_ptr = torch.LongTensor([0]) 69 | 70 | @torch.no_grad() 71 | def _dequeue_and_enqueue(self, batch: torch.Tensor): 72 | """Dequeue the oldest batch and add the latest one 73 | 74 | Args: 75 | batch: 76 | The latest batch of keys to add to the memory bank. 77 | 78 | """ 79 | batch_size = batch.shape[0] 80 | ptr = int(self.bank_ptr) 81 | 82 | if ptr + batch_size >= self.size: 83 | self.bank[:, ptr:] = batch[:self.size - ptr].T.detach() 84 | self.bank_ptr[0] = 0 85 | else: 86 | self.bank[:, ptr:ptr + batch_size] = batch.T.detach() 87 | self.bank_ptr[0] = ptr + batch_size 88 | 89 | def forward(self, 90 | output: torch.Tensor, 91 | labels: torch.Tensor = None, 92 | update: bool = False): 93 | """Query memory bank for additional negative samples 94 | 95 | Args: 96 | output: 97 | The output of the model. 98 | labels: 99 | Should always be None, will be ignored. 100 | 101 | Returns: 102 | The output if the memory bank is of size 0, otherwise the output 103 | and the entries from the memory bank. 104 | 105 | """ 106 | 107 | # no memory bank, return the output 108 | if self.size == 0: 109 | return output, None 110 | 111 | _, dim = output.shape 112 | 113 | # initialize the memory bank if it is not already done 114 | if self.bank is None: 115 | self._init_memory_bank(dim) 116 | 117 | # query and update memory bank 118 | bank = self.bank.clone().detach() 119 | 120 | # only update memory bank if we later do backward pass (gradient) 121 | if update: 122 | self._dequeue_and_enqueue(output) 123 | 124 | return output, bank 125 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/memory_bank_cuda.py: -------------------------------------------------------------------------------- 1 | """ Memory Bank Wrapper """ 2 | 3 | # Copyright (c) 2020. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | import functools 8 | 9 | class MemoryBankModule(torch.nn.Module): 10 | """Memory bank implementation 11 | 12 | This is a parent class to all loss functions implemented by the lightly 13 | Python package. This way, any loss can be used with a memory bank if 14 | desired. 15 | 16 | Attributes: 17 | size: 18 | Number of keys the memory bank can store. If set to 0, 19 | memory bank is not used. 20 | 21 | Examples: 22 | >>> class MyLossFunction(MemoryBankModule): 23 | >>> 24 | >>> def __init__(self, memory_bank_size: int = 2 ** 16): 25 | >>> super(MyLossFunction, self).__init__(memory_bank_size) 26 | >>> 27 | >>> def forward(self, output: torch.Tensor, 28 | >>> labels: torch.Tensor = None): 29 | >>> 30 | >>> output, negatives = super( 31 | >>> MyLossFunction, self).forward(output) 32 | >>> 33 | >>> if negatives is not None: 34 | >>> # evaluate loss with negative samples 35 | >>> else: 36 | >>> # evaluate loss without negative samples 37 | 38 | """ 39 | 40 | def __init__(self, size: int = 2 ** 16): 41 | 42 | super(MemoryBankModule, self).__init__() 43 | 44 | if size < 0: 45 | msg = f'Illegal memory bank size {size}, must be non-negative.' 
46 | raise ValueError(msg) 47 | 48 | self.size = size 49 | 50 | self.bank = None 51 | self.bank_ptr = None 52 | 53 | @torch.no_grad() 54 | def _init_memory_bank(self, dim: int): 55 | """Initialize the memory bank if it's empty 56 | 57 | Args: 58 | dim: 59 | The dimension of the which are stored in the bank. 60 | 61 | """ 62 | # create memory bank 63 | # we could use register buffers like in the moco repo 64 | # https://github.com/facebookresearch/moco but we don't 65 | # want to pollute our checkpoints 66 | self.bank = torch.randn(dim, self.size).cuda().half() 67 | self.bank = self.bank / (self.bank.norm(dim=0, keepdim=True)+1e-10) 68 | self.bank_ptr = torch.LongTensor([0]) 69 | 70 | @torch.no_grad() 71 | def _dequeue_and_enqueue(self, batch: torch.Tensor): 72 | """Dequeue the oldest batch and add the latest one 73 | 74 | Args: 75 | batch: 76 | The latest batch of keys to add to the memory bank. 77 | 78 | """ 79 | batch_size = batch.shape[0] 80 | ptr = int(self.bank_ptr) 81 | 82 | while ptr + batch_size >= self.size: 83 | self.bank[:, ptr:] = batch[:self.size - ptr].T.detach() 84 | batch = batch[self.size - ptr:] 85 | self.bank_ptr[0] = 0 86 | 87 | batch_size = batch.shape[0] 88 | ptr = int(self.bank_ptr) 89 | 90 | if batch_size != 0: 91 | self.bank[:, ptr:ptr + batch_size] = batch.T.detach() 92 | self.bank_ptr[0] = ptr + batch_size 93 | 94 | def forward(self, 95 | output: torch.Tensor, 96 | labels: torch.Tensor = None, 97 | update: bool = False): 98 | """Query memory bank for additional negative samples 99 | 100 | Args: 101 | output: 102 | The output of the model. 103 | labels: 104 | Should always be None, will be ignored. 105 | 106 | Returns: 107 | The output if the memory bank is of size 0, otherwise the output 108 | and the entries from the memory bank. 109 | 110 | """ 111 | 112 | # no memory bank, return the output 113 | if self.size == 0: 114 | return output, None 115 | 116 | _, dim = output.shape 117 | 118 | # initialize the memory bank if it is not already done 119 | if self.bank is None: 120 | self._init_memory_bank(dim) 121 | 122 | # query and update memory bank 123 | bank = self.bank.clone().detach() 124 | 125 | # only update memory bank if we later do backward pass (gradient) 126 | if update: 127 | self._dequeue_and_enqueue(output) 128 | 129 | return output, bank 130 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/nnclr_modules/nn_memory_bank.py: -------------------------------------------------------------------------------- 1 | """ Nearest Neighbour Memory Bank Module """ 2 | 3 | # Copyright (c) 2021. Lightly AG and its affiliates. 4 | # All Rights Reserved 5 | 6 | import torch 7 | from .memory_bank import MemoryBankModule 8 | 9 | 10 | class NNMemoryBankModule(MemoryBankModule): 11 | """Nearest Neighbour Memory Bank implementation 12 | 13 | This class implements a nearest neighbour memory bank as described in the 14 | NNCLR paper[0]. During the forward pass we return the nearest neighbour 15 | from the memory bank. 16 | 17 | [0] NNCLR, 2021, https://arxiv.org/abs/2104.14548 18 | 19 | Attributes: 20 | size: 21 | Number of keys the memory bank can store. If set to 0, 22 | memory bank is not used. 
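topk:
    Number of nearest neighbours returned per query; forward()
    returns a list of topk tensors rather than a single tensor.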
23 | 24 | Examples: 25 | >>> model = NNCLR(backbone) 26 | >>> criterion = NTXentLoss(temperature=0.1) 27 | >>> 28 | >>> nn_replacer = NNmemoryBankModule(size=2 ** 16) 29 | >>> 30 | >>> # forward pass 31 | >>> (z0, p0), (z1, p1) = model(x0, x1) 32 | >>> z0 = nn_replacer(z0.detach(), update=False) 33 | >>> z1 = nn_replacer(z1.detach(), update=True) 34 | >>> 35 | >>> loss = 0.5 * (criterion(z0, p1) + criterion(z1, p0)) 36 | 37 | """ 38 | def __init__(self, size: int = 2 ** 16, topk: int = 1): 39 | super(NNMemoryBankModule, self).__init__(size) 40 | self.topk = topk 41 | 42 | def forward(self, 43 | output: torch.Tensor, 44 | update: bool = False): 45 | """Returns nearest neighbour of output tensor from memory bank 46 | 47 | Args: 48 | output: The torch tensor for which you want the nearest neighbour 49 | update: If `True` updated the memory bank by adding output to it 50 | 51 | """ 52 | 53 | output, bank = super(NNMemoryBankModule, self).forward(output, update=update) 54 | bank = bank.to(output.device).t() 55 | 56 | output_normed = torch.nn.functional.normalize(output, dim=1) 57 | bank_normed = torch.nn.functional.normalize(bank, dim=1) 58 | 59 | similarity_matrix = torch.einsum("nd,md->nm", output_normed, bank_normed) 60 | # index_nearest_neighbours = torch.argmax(similarity_matrix, dim=1) 61 | # nearest_neighbours = torch.index_select(bank, dim=0, index=index_nearest_neighbours) 62 | _, index_nearest_neighbours = torch.topk(similarity_matrix, k=self.topk, dim=1) 63 | nearest_neighbours = [torch.index_select(bank, dim=0, index=index_nearest_neighbours[:,i]) for i in range(self.topk)] 64 | 65 | return nearest_neighbours 66 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Computer-Vision-in-the-Wild/Elevater_Toolkit_IC/00d0af78559d5f3d800ae4668210e6bd1f2f84b9/vision_benchmark/models/declip_model/utils/text_utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/mask_tokens.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Tuple, List 3 | 4 | 5 | def mask_tokens(inputs, special_tokens, mask_token, tokenizer_length, mlm_probability=0.15, special_tokens_mask=None) -> Tuple[torch.Tensor, torch.Tensor]: 6 | """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
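Returns a tuple (masked_inputs, labels); positions that were not selected
for masking carry the ignore label -100.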
""" 7 | labels = inputs.clone() 8 | # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) 9 | probability_matrix = torch.full(labels.shape, mlm_probability) 10 | if special_tokens_mask is None: 11 | special_tokens_mask = [1 if val in special_tokens else 0 for val in labels.tolist()] 12 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 13 | # if tokenizer._pad_token is not None: 14 | # padding_mask = labels.eq(tokenizer.pad_token_id) 15 | # probability_matrix.masked_fill_(padding_mask, value=0.0) 16 | masked_indices = torch.bernoulli(probability_matrix).bool() 17 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 18 | 19 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 20 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 21 | inputs[indices_replaced] = mask_token 22 | 23 | # 10% of the time, we replace masked input tokens with random word 24 | indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 25 | random_words = torch.randint(tokenizer_length, labels.shape, dtype=torch.long) 26 | inputs[indices_random] = random_words[indices_random] 27 | 28 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 29 | return inputs, labels 30 | 31 | 32 | def MaskTokens(tokens, mask_type, mask_token, special_tokens=None, tokenizer_length=None, sepcial_tokens_mask=None, special_tokens_mask=None): 33 | if mask_type == 'MLM': 34 | tokens, labels = mask_tokens(inputs=tokens, special_tokens=special_tokens, mask_token=mask_token, tokenizer_length=tokenizer_length, special_tokens_mask=special_tokens_mask) 35 | else: 36 | raise NotImplementedError(mask_type) 37 | return tokens, labels 38 | -------------------------------------------------------------------------------- /vision_benchmark/models/declip_model/utils/text_utils/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | # Change: Extend <|mask|> tokenizer-size+=1 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(os.path.join(os.path.dirname(os.path.abspath(__file__)), bpe_path)).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|mask|>']) 74 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 75 | self.encoder = dict(zip(vocab, range(len(vocab)))) 76 | self.decoder = {v: k for k, v in self.encoder.items()} 77 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 78 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 79 | self.cache['<|mask|>'] = '<|mask|>' 80 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 81 | 82 | def bpe(self, token): 83 | if token in self.cache: 84 | return self.cache[token] 85 | word = tuple(token[:-1]) + ( token[-1] + '',) 86 | pairs = get_pairs(word) 87 | 88 | if not pairs: 89 | return token+'' 90 | 91 | while True: 92 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 93 | if bigram not in self.bpe_ranks: 94 | break 95 | first, second = bigram 96 | new_word = [] 97 | i = 0 98 | while i < len(word): 99 | try: 100 | j = word.index(first, i) 101 | new_word.extend(word[i:j]) 102 | i = j 103 | except: 104 | new_word.extend(word[i:]) 105 | break 106 | 107 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 108 | new_word.append(first+second) 109 | i += 2 110 | else: 111 | new_word.append(word[i]) 112 | i += 1 113 | new_word = tuple(new_word) 114 | word = new_word 115 | if len(word) == 1: 116 | break 117 | else: 118 | pairs = get_pairs(word) 119 | word = ' '.join(word) 120 | self.cache[token] = word 121 | return word 122 | 123 | def encode(self, text): 124 | bpe_tokens = [] 125 | text = whitespace_clean(basic_clean(text)).lower() 126 | for token in re.findall(self.pat, text): 127 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 128 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 129 | return bpe_tokens 130 | 131 | def decode(self, tokens): 132 | text = ''.join([self.decoder[token] for token in tokens]) 133 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 134 | return text 135 | -------------------------------------------------------------------------------- /vision_benchmark/models/mae.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm 9 | # DeiT: https://github.com/facebookresearch/deit 10 | # -------------------------------------------------------- 11 | 12 | from functools import partial 13 | 14 | import torch 15 | import torch.nn as nn 16 | 17 | import timm.models.vision_transformer 18 | 19 | import logging 20 | 21 | 22 | class VisionTransformer(timm.models.vision_transformer.VisionTransformer): 23 | """ Vision Transformer with support for global average pooling 24 | """ 25 | def __init__(self, global_pool=False, **kwargs): 26 | super(VisionTransformer, self).__init__(**kwargs) 27 | 28 | self.global_pool = global_pool 29 | if self.global_pool: 30 | norm_layer = kwargs['norm_layer'] 31 | embed_dim = kwargs['embed_dim'] 32 | self.fc_norm = norm_layer(embed_dim) 33 | 34 | del self.norm # remove the original norm 35 | 36 | def forward_features(self, x): 37 | B = x.shape[0] 38 | x = self.patch_embed(x) 39 | 40 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 41 | x = torch.cat((cls_tokens, x), dim=1) 42 | x = x + self.pos_embed 43 | x = self.pos_drop(x) 44 | 45 | for blk in self.blocks: 46 | x = blk(x) 47 | 48 | if self.global_pool: 49 | x = x[:, 1:, :].mean(dim=1) # global pool without cls token 50 | outcome = self.fc_norm(x) 51 | else: 52 | x = self.norm(x) 53 | outcome = x[:, 0] 54 | 55 | return outcome 56 | 57 | 58 | def vit_base_patch16(**kwargs): 59 | model = VisionTransformer( 60 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 61 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 62 | return model 63 | 64 | 65 | def vit_large_patch16(**kwargs): 66 | model = VisionTransformer( 67 | patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, 68 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 69 | return model 70 | 71 | 72 | def vit_huge_patch14(**kwargs): 73 | model = VisionTransformer( 74 | patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, 75 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 76 | return model 77 | 78 | def get_model(config): 79 | mae_specs = config.MODEL.SPEC 80 | 81 | model = VisionTransformer( 82 | patch_size=mae_specs.PATCH_SIZE, embed_dim=mae_specs.EMBED_DIM, 83 | depth=mae_specs.DEPTH, num_heads=mae_specs.NUM_HEADS, mlp_ratio=mae_specs.MLP_RATIO, 84 | qkv_bias=mae_specs.QKV_BIAS, 85 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 86 | global_pool=mae_specs.GLOBAL_POOL) 87 | 88 | model_file = config.TEST.MODEL_FILE 89 | logging.info(f'=> load model file: {model_file}') 90 | 91 | if model_file.startswith('http'): 92 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 93 | else: 94 | checkpoint = torch.load(model_file, map_location="cpu") 95 | 96 | state_dict = checkpoint['model'] 97 | 98 | incompatible = model.load_state_dict(state_dict, strict=False) 99 | 100 | if incompatible.missing_keys: 101 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 102 | if incompatible.unexpected_keys: 103 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 104 | 105 | return model -------------------------------------------------------------------------------- 
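For orientation, a minimal, hypothetical usage sketch of the MAE backbone factory defined in vision_benchmark/models/mae.py above (not part of the repository). The import path mirrors the file location; the 224x224 input size, the num_classes=0 argument, and the expected 768-dimensional output are assumptions based on the ViT-B/16 defaults and on forward_features as written, and may vary with the installed timm version.

import torch

# Hypothetical sketch: build the ViT-B/16 backbone defined above and run a shape check.
# Assumes the vision_benchmark package is importable and a compatible timm version is installed.
from vision_benchmark.models.mae import vit_base_patch16

model = vit_base_patch16(num_classes=0, global_pool=True)  # mean-pooled patch features via fc_norm
model.eval()

with torch.no_grad():
    feats = model.forward_features(torch.randn(2, 3, 224, 224))  # (batch, channels, H, W)

print(feats.shape)  # expected: torch.Size([2, 768])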
/vision_benchmark/models/mocov3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from functools import partial, reduce 11 | from operator import mul 12 | 13 | from timm.models.vision_transformer import VisionTransformer, _cfg 14 | from timm.models.layers.helpers import to_2tuple 15 | from timm.models.layers import PatchEmbed 16 | 17 | import logging 18 | 19 | 20 | class VisionTransformerMoCo(VisionTransformer): 21 | def __init__(self, stop_grad_conv1=False, **kwargs): 22 | super().__init__(**kwargs) 23 | # Use fixed 2D sin-cos position embedding 24 | self.build_2d_sincos_position_embedding() 25 | 26 | # weight initialization 27 | for name, m in self.named_modules(): 28 | if isinstance(m, nn.Linear): 29 | if 'qkv' in name: 30 | # treat the weights of Q, K, V separately 31 | val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1])) 32 | nn.init.uniform_(m.weight, -val, val) 33 | else: 34 | nn.init.xavier_uniform_(m.weight) 35 | nn.init.zeros_(m.bias) 36 | nn.init.normal_(self.cls_token, std=1e-6) 37 | 38 | if isinstance(self.patch_embed, PatchEmbed): 39 | # xavier_uniform initialization 40 | val = math.sqrt(6. / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)) 41 | nn.init.uniform_(self.patch_embed.proj.weight, -val, val) 42 | nn.init.zeros_(self.patch_embed.proj.bias) 43 | 44 | if stop_grad_conv1: 45 | self.patch_embed.proj.weight.requires_grad = False 46 | self.patch_embed.proj.bias.requires_grad = False 47 | 48 | def build_2d_sincos_position_embedding(self, temperature=10000.): 49 | h, w = self.patch_embed.grid_size 50 | grid_w = torch.arange(w, dtype=torch.float32) 51 | grid_h = torch.arange(h, dtype=torch.float32) 52 | grid_w, grid_h = torch.meshgrid(grid_w, grid_h) 53 | assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' 54 | pos_dim = self.embed_dim // 4 55 | omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim 56 | omega = 1. / (temperature**omega) 57 | out_w = torch.einsum('m,d->md', [grid_w.flatten(), omega]) 58 | out_h = torch.einsum('m,d->md', [grid_h.flatten(), omega]) 59 | pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :] 60 | 61 | assert self.num_tokens == 1, 'Assuming one and only one token, [cls]' 62 | pe_token = torch.zeros([1, 1, self.embed_dim], dtype=torch.float32) 63 | self.pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1)) 64 | self.pos_embed.requires_grad = False 65 | 66 | 67 | class ConvStem(nn.Module): 68 | """ 69 | ConvStem, from Early Convolutions Help Transformers See Better, Tete et al. 
https://arxiv.org/abs/2106.14881 70 | """ 71 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): 72 | super().__init__() 73 | 74 | assert patch_size == 16, 'ConvStem only supports patch size of 16' 75 | assert embed_dim % 8 == 0, 'Embed dimension must be divisible by 8 for ConvStem' 76 | 77 | img_size = to_2tuple(img_size) 78 | patch_size = to_2tuple(patch_size) 79 | self.img_size = img_size 80 | self.patch_size = patch_size 81 | self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) 82 | self.num_patches = self.grid_size[0] * self.grid_size[1] 83 | self.flatten = flatten 84 | 85 | # build stem, similar to the design in https://arxiv.org/abs/2106.14881 86 | stem = [] 87 | input_dim, output_dim = 3, embed_dim // 8 88 | for l in range(4): 89 | stem.append(nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=2, padding=1, bias=False)) 90 | stem.append(nn.BatchNorm2d(output_dim)) 91 | stem.append(nn.ReLU(inplace=True)) 92 | input_dim = output_dim 93 | output_dim *= 2 94 | stem.append(nn.Conv2d(input_dim, embed_dim, kernel_size=1)) 95 | self.proj = nn.Sequential(*stem) 96 | 97 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 98 | 99 | def forward(self, x): 100 | B, C, H, W = x.shape 101 | assert H == self.img_size[0] and W == self.img_size[1], \ 102 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 103 | x = self.proj(x) 104 | if self.flatten: 105 | x = x.flatten(2).transpose(1, 2) # BCHW -> BNC 106 | x = self.norm(x) 107 | return x 108 | 109 | 110 | def vit_small(**kwargs): 111 | model = VisionTransformerMoCo( 112 | patch_size=16, embed_dim=384, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 113 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 114 | model.default_cfg = _cfg() 115 | return model 116 | 117 | def vit_base(**kwargs): 118 | model = VisionTransformerMoCo( 119 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 120 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 121 | model.default_cfg = _cfg() 122 | return model 123 | 124 | def vit_conv_small(**kwargs): 125 | # minus one ViT block 126 | model = VisionTransformerMoCo( 127 | patch_size=16, embed_dim=384, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, 128 | norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) 129 | model.default_cfg = _cfg() 130 | return model 131 | 132 | def vit_conv_base(**kwargs): 133 | # minus one ViT block 134 | model = VisionTransformerMoCo( 135 | patch_size=16, embed_dim=768, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, 136 | norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) 137 | model.default_cfg = _cfg() 138 | return model 139 | 140 | 141 | def get_model(config): 142 | mae_specs = config.MODEL.SPEC 143 | 144 | model = VisionTransformerMoCo( 145 | patch_size=mae_specs.PATCH_SIZE, embed_dim=mae_specs.EMBED_DIM, 146 | depth=mae_specs.DEPTH, num_heads=mae_specs.NUM_HEADS, mlp_ratio=mae_specs.MLP_RATIO, 147 | qkv_bias=mae_specs.QKV_BIAS, 148 | norm_layer=partial(nn.LayerNorm, eps=1e-6)) 149 | 150 | model_file = config.TEST.MODEL_FILE 151 | logging.info(f'=> load model file: {model_file}') 152 | 153 | if model_file.startswith('http'): 154 | checkpoint = torch.hub.load_state_dict_from_url(model_file, progress=False, map_location="cpu") 155 | else: 156 | checkpoint = torch.load(model_file, map_location="cpu") 157 | 158 | # rename moco pre-trained keys 159 | 
state_dict = checkpoint['state_dict'] 160 | for k in list(state_dict.keys()): 161 | if k.startswith('module.base_encoder'): 162 | state_dict[k[len("module.base_encoder."):]] = state_dict[k] 163 | del state_dict[k] 164 | elif k.startswith('module.'): 165 | state_dict[k[len("module."):]] = state_dict[k] 166 | del state_dict[k] 167 | 168 | incompatible = model.load_state_dict(state_dict, strict=False) 169 | 170 | if incompatible.missing_keys: 171 | logging.warning('Missing keys: {}'.format(', '.join(incompatible.missing_keys))) 172 | if incompatible.unexpected_keys: 173 | logging.warning('Unexpected keys: {}'.format(', '.join(incompatible.unexpected_keys))) 174 | 175 | return model 176 | -------------------------------------------------------------------------------- /vision_benchmark/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_optimizer 2 | 3 | __all__ = ['build_optimizer'] 4 | -------------------------------------------------------------------------------- /vision_benchmark/optim/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from timm.optim import create_optimizer 9 | 10 | def _is_depthwise(m): 11 | return ( 12 | isinstance(m, nn.Conv2d) 13 | and m.groups == m.in_channels 14 | and m.groups == m.out_channels 15 | ) 16 | 17 | 18 | def _set_wd(cfg, model): 19 | without_decay_list = cfg.TRAIN.WITHOUT_WD_LIST 20 | without_decay_depthwise = [] 21 | without_decay_norm = [] 22 | for m in model.modules(): 23 | if _is_depthwise(m) and 'depthwise' in without_decay_list: 24 | without_decay_depthwise.append(m.weight) 25 | elif isinstance(m, nn.BatchNorm2d) and 'bn' in without_decay_list: 26 | without_decay_norm.append(m.weight) 27 | without_decay_norm.append(m.bias) 28 | elif isinstance(m, nn.GroupNorm) and 'gn' in without_decay_list: 29 | without_decay_norm.append(m.weight) 30 | without_decay_norm.append(m.bias) 31 | elif isinstance(m, nn.LayerNorm) and 'ln' in without_decay_list: 32 | without_decay_norm.append(m.weight) 33 | without_decay_norm.append(m.bias) 34 | 35 | with_decay = [] 36 | without_decay = [] 37 | 38 | skip = {} 39 | if hasattr(model, 'no_weight_decay'): 40 | skip = model.no_weight_decay() 41 | 42 | for n, p in model.named_parameters(): 43 | ever_set = False 44 | 45 | if p.requires_grad is False: 46 | continue 47 | 48 | if n in skip: 49 | print('=> set {} wd to 0'.format(n)) 50 | without_decay.append(p) 51 | continue 52 | 53 | for pp in without_decay_depthwise: 54 | if p is pp: 55 | if cfg.VERBOSE: 56 | print('=> set depthwise({}) wd to 0'.format(n)) 57 | without_decay.append(p) 58 | ever_set = True 59 | break 60 | 61 | for pp in without_decay_norm: 62 | if p is pp: 63 | if cfg.VERBOSE: 64 | print('=> set norm({}) wd to 0'.format(n)) 65 | without_decay.append(p) 66 | ever_set = True 67 | break 68 | 69 | if ( 70 | (not ever_set) 71 | and 'bias' in without_decay_list 72 | and n.endswith('.bias') 73 | ): 74 | if cfg.VERBOSE: 75 | print('=> set bias({}) wd to 0'.format(n)) 76 | without_decay.append(p) 77 | elif not ever_set: 78 | with_decay.append(p) 79 | 80 | # assert (len(with_decay) + len(without_decay) == len(list(model.parameters()))) 81 | params = [ 82 | {'params': with_decay}, 83 | {'params': without_decay, 'weight_decay': 0.} 84 | ] 85 | return params 86 | 87 | 88 | 
def build_optimizer(cfg, model): 89 | if cfg.TRAIN.OPTIMIZER == 'timm': 90 | args = cfg.TRAIN.OPTIMIZER_ARGS 91 | 92 | print(f'=> usage timm optimizer args: {cfg.TRAIN.OPTIMIZER_ARGS}') 93 | optimizer = create_optimizer(args, model) 94 | 95 | return optimizer 96 | 97 | optimizer = None 98 | params = _set_wd(cfg, model) 99 | if cfg.TRAIN.OPTIMIZER == 'sgd': 100 | if cfg.TRAIN.TWO_LR: 101 | 102 | trunk_parameters = [] 103 | head_parameters = [] 104 | for name, param in model.named_parameters(): 105 | if 'backbone' in name: 106 | trunk_parameters.append(param) 107 | else: 108 | head_parameters.append(param) 109 | 110 | optimizer = optim.SGD( 111 | [{'params': trunk_parameters}, 112 | {'params': head_parameters, 'lr': cfg.TRAIN.LR }], 113 | lr=cfg.TRAIN.LR * 0.1, 114 | momentum=cfg.TRAIN.MOMENTUM, 115 | weight_decay=cfg.TRAIN.WD, 116 | nesterov=cfg.TRAIN.NESTEROV 117 | ) 118 | 119 | else: 120 | optimizer = optim.SGD( 121 | params, 122 | # filter(lambda p: p.requires_grad, model.parameters()), 123 | lr=cfg.TRAIN.LR, 124 | momentum=cfg.TRAIN.MOMENTUM, 125 | weight_decay=cfg.TRAIN.WD, 126 | nesterov=cfg.TRAIN.NESTEROV 127 | ) 128 | elif cfg.TRAIN.OPTIMIZER == 'adam': 129 | 130 | if cfg.TRAIN.TWO_LR: 131 | 132 | trunk_parameters = [] 133 | head_parameters = [] 134 | for name, param in model.named_parameters(): 135 | if 'backbone' in name: 136 | trunk_parameters.append(param) 137 | else: 138 | head_parameters.append(param) 139 | 140 | optimizer = optim.Adam( 141 | [{'params': trunk_parameters}, 142 | {'params': head_parameters, 'lr': cfg.TRAIN.LR}], 143 | lr=cfg.TRAIN.LR * 0.1, 144 | weight_decay=cfg.TRAIN.WD, 145 | ) 146 | else: 147 | optimizer = optim.Adam( 148 | params, 149 | # filter(lambda p: p.requires_grad, model.parameters()), 150 | lr=cfg.TRAIN.LR, 151 | weight_decay=cfg.TRAIN.WD, 152 | ) 153 | elif cfg.TRAIN.OPTIMIZER == 'adamW': 154 | optimizer = optim.AdamW( 155 | params, 156 | lr=cfg.TRAIN.LR, 157 | weight_decay=cfg.TRAIN.WD, 158 | ) 159 | elif cfg.TRAIN.OPTIMIZER == 'rmsprop': 160 | optimizer = optim.RMSprop( 161 | params, 162 | # filter(lambda p: p.requires_grad, model.parameters()), 163 | lr=cfg.TRAIN.LR, 164 | momentum=cfg.TRAIN.MOMENTUM, 165 | weight_decay=cfg.TRAIN.WD, 166 | alpha=cfg.TRAIN.RMSPROP_ALPHA, 167 | centered=cfg.TRAIN.RMSPROP_CENTERED 168 | ) 169 | 170 | return optimizer 171 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/caltech101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'caltech-101' 5 | ROOT: '../DATASET/caltech101-tf/' 6 | NUM_CLASSES: 102 7 | TEST: 8 | METRIC: 'mean-per-class' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/cifar10.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-10' 5 | ROOT: '../../DATASET/cifar10/' 6 | NUM_CLASSES: 10 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/cifar100.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'cifar-100' 5 | ROOT: '../DATASET/cifar100/' 6 | NUM_CLASSES: 100 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- 
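As a companion to the optimizer factory in vision_benchmark/optim/build.py above, here is a minimal, hypothetical sketch of the TRAIN fields that build_optimizer reads on the plain 'sgd' path; it is not part of the repository. The SimpleNamespace stand-in and the concrete hyperparameter values are assumptions for illustration only, and the real toolkit presumably builds its config object via vision_benchmark/config instead.

from types import SimpleNamespace

import torch.nn as nn

from vision_benchmark.optim import build_optimizer

# Hypothetical config: only the fields that build_optimizer/_set_wd read for the 'sgd' branch.
cfg = SimpleNamespace(
    VERBOSE=True,
    TRAIN=SimpleNamespace(
        OPTIMIZER='sgd',                        # 'timm', 'sgd', 'adam', 'adamW', or 'rmsprop'
        LR=0.01, MOMENTUM=0.9, WD=1e-4, NESTEROV=False,
        TWO_LR=False,                           # True would put backbone and head on separate learning rates
        WITHOUT_WD_LIST=['bn', 'ln', 'bias'],   # parameter groups excluded from weight decay
    ),
)

model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16), nn.Linear(16, 2))
optimizer = build_optimizer(cfg, model)  # SGD; LayerNorm weights/biases land in the weight_decay=0 group
print(optimizer)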
/vision_benchmark/resources/datasets/country211.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'country211' 4 | ROOT: '../DATASET/country211/' 5 | NUM_CLASSES: 211 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/dtd.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'dtd' 5 | ROOT: '../DATASET/dtd-v1/' 6 | NUM_CLASSES: 47 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/eurosat-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'eurosat_clip' 4 | ROOT: '../DATASET/eurosat_clip/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/fer2013.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'fer-2013' 5 | ROOT: '../DATASET/fer2013-v1/' 6 | NUM_CLASSES: 7 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/fgvc-aircraft-2013b.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'fgvc-aircraft-2013b-variants102' 6 | ROOT: '../DATASET/fgvc-aircraft-2013b-variants102/' 7 | NUM_CLASSES: 100 8 | TEST: 9 | METRIC: 'mean-per-class' 10 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/flower102.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-flower-102' 4 | ROOT: '../DATASET/flower102/' 5 | NUM_CLASSES: 102 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/food101.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATA_DIR: '' 3 | 4 | DATASET: 5 | DATASET: 'food-101' 6 | ROOT: '../DATASET/food101/' 7 | NUM_CLASSES: 101 8 | TEST: 9 | METRIC: 'accuracy' 10 | DEBUG: 11 | DEBUG: false 12 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/gtsrb.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'gtsrb' 4 | ROOT: '../DATASET/gtsrb/' 5 | NUM_CLASSES: 43 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/hateful-memes.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'hateful-memes' 3 | ROOT: '../DATASET/hateful_memes/' 4 | NUM_CLASSES: 2 5 | TEST: 6 | METRIC: 'roc_auc' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/imagenet-1k.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | DATASET: 'imagenet-1k' 3 | 
ROOT: '../DATASET/imagenet-1k/' 4 | TEST: 5 | METRIC: 'accuracy' 6 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/kitti-distance.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'kitti-distance' 4 | CENTER_CROP: false 5 | ROOT: '../DATASET/kitti_distance_20210923/' 6 | NUM_CLASSES: 4 7 | TEST: 8 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/mnist.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'mnist' 4 | ROOT: '../DATASET/mnist/' 5 | NUM_CLASSES: 10 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/oxford-iiit-pets.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'oxford-iiit-pets' 4 | ROOT: '../DATASET/pet37/' 5 | NUM_CLASSES: 37 6 | TEST: 7 | METRIC: 'mean-per-class' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/patchcamelyon.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'patch-camelyon' 5 | ROOT: '../DATASET/patchcamelyon/' 6 | NUM_CLASSES: 2 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/rendered-sst2.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'rendered-sst2' 4 | ROOT: '../DATASET/rendered-sst2/' 5 | NUM_CLASSES: 2 6 | TEST: 7 | METRIC: 'accuracy' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/resisc45-clip.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'resisc45_clip' 4 | ROOT: '../DATASET/resisc45_clip/' 5 | NUM_CLASSES: 45 6 | TEST: 7 | METRIC: 'accuracy' -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/stanfordcar.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | 3 | DATASET: 4 | DATASET: 'stanford-cars' 5 | ROOT: '../DATASET/stanfordcars/' 6 | NUM_CLASSES: 196 7 | TEST: 8 | METRIC: 'accuracy' 9 | -------------------------------------------------------------------------------- /vision_benchmark/resources/datasets/voc2007classification.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | DATASET: 3 | DATASET: 'voc-2007-classification' 4 | ROOT: '../DATASET/voc2007/' 5 | NUM_CLASSES: 20 6 | TEST: 7 | METRIC: '11point_mAP' 8 | -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/cifar-10_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", 
"object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "automobile", "def_wiki": "A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. A car or motorcar.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "deer", "def_wiki": "A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", "path_wn": ["deer", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "distinguished from Bovidae by the male's having solid deciduous antlers"}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "frog", "def_wiki": "A small tailless amphibian of the order Anura that typically hops.", "path_wn": ["frog", "amphibian", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "any of various tailless stout-bodied amphibians with long hind limbs for leaping; semiaquatic and terrestrial species"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "ship", "def_wiki": "A water-borne vessel generally larger than a boat.", "path_wn": ["ship", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vessel that carries passengers or freight"}, 
{"classname": "truck", "def_wiki": "A small wheel or roller, specifically the wheel of a gun carriage.", "path_wn": ["truck", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an automotive vehicle suitable for hauling"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/eurosat_clip_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "forest", "def_wiki": "A dense uncultivated tract of trees and undergrowth, larger than woods.", "path_wn": ["forest", "vegetation", "collection", "group", "abstraction", "entity"], "def_wn": "the trees and other plants in a large densely wooded area"}, {"classname": "brushland or shrubland", "def_wiki": "Land that is covered mostly with shrubs.", "path_wn": "", "def_wn": ""}, {"classname": "highway or road", "def_wiki": "A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard: a country road is the same as a country lane.", "path_wn": "", "def_wn": ""}, {"classname": "industrial buildings or commercial buildings", "def_wiki": "The act or process by which something is built; construction.", "path_wn": "", "def_wn": ""}, {"classname": "pasture land", "def_wiki": "land used for grazing animals", "path_wn": "", "def_wn": ""}, {"classname": "permanent crop land", "def_wiki": "arable land", "path_wn": "", "def_wn": ""}, {"classname": "residential buildings or homes or apartments", "def_wiki": "A complete domicile occupying only part of a building, especially one for rent; a flat.", "path_wn": "", "def_wn": ""}, {"classname": "river", "def_wiki": "A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", "path_wn": ["river", "stream", "body_of_water", "thing", "physical_entity", "entity"], "def_wn": "a large natural stream of water (larger than a creek)"}, {"classname": "lake or sea", "def_wiki": "A large body of salt water.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/fer-2013_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "def_wiki": "Displaying or feeling anger.", "path_wn": ["angry"], "def_wn": "feeling or showing anger"}, {"classname": "disgusted", "def_wiki": "Filled with disgust.", "path_wn": ["disgust", "dislike", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "fill with distaste"}, {"classname": "fearful", "def_wiki": "Frightening.", "path_wn": ["fearful"], "def_wn": "experiencing or showing fear"}, {"classname": "happy", "def_wiki": "Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", "path_wn": ["happy"], "def_wn": "enjoying or showing or marked by joy or pleasure"}, {"classname": "neutral", "def_wiki": "Not taking sides in a 
conflict such as war; nonaligned.", "path_wn": ["neutral", "person", "causal_agent", "physical_entity", "entity"], "def_wn": "one who does not side with any party in a war or dispute"}, {"classname": "sad", "def_wiki": "Emotionally negative.", "path_wn": ["sad"], "def_wn": "experiencing or showing sorrow or unhappiness; ; - Christina Rossetti"}, {"classname": "surprised", "def_wiki": "Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", "path_wn": ["surprise", "astonishment", "feeling", "state", "attribute", "abstraction", "entity"], "def_wn": "cause to be surprised"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/hateful-memes_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": ["meme", "acculturation", "content", "cognition", "psychological_feature", "abstraction", "entity"], "def_wn": "a cultural unit (an idea or value or pattern of behavior) that is passed from one person to another by non-genetic means (as by imitation)"}, {"classname": "hatespeech meme", "def_wiki": "Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/kitti-distance_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "def_wiki": "The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car nearby.", "def_wiki": "adjacent, near, close by", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with a car in the distance.", "def_wiki": "far away; a long distance away", "path_wn": "", "def_wn": ""}, {"classname": "a photo i took with no car.", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/mnist_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "0", "def_wiki": "0.", "path_wn": ["nothing", "relative_quantity", "measure", "abstraction", "entity"], "def_wn": "a mathematical element that when added to another number yields the same number"}, {"classname": "1", "def_wiki": "The number one (1).", "path_wn": ["one", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the smallest whole number or a numeral representing this number"}, {"classname": "2", "def_wiki": "A particle used for marking the following verb as an infinitive.", "path_wn": ["two", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one 
or a numeral representing this number"}, {"classname": "3", "def_wiki": null, "path_wn": ["three", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of one and one and one"}, {"classname": "4", "def_wiki": "Because, as, since.", "path_wn": ["four", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of three and one"}, {"classname": "5", "def_wiki": null, "path_wn": ["five", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of four and one"}, {"classname": "6", "def_wiki": "MI6; the agency or a particular agent.", "path_wn": ["six", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of five and one"}, {"classname": "7", "def_wiki": null, "path_wn": ["seven", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of six and one"}, {"classname": "8", "def_wiki": "To ingest; to be ingested.", "path_wn": ["eight", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of seven and one"}, {"classname": "9", "def_wiki": null, "path_wn": ["nine", "digit", "integer", "number", "definite_quantity", "measure", "abstraction", "entity"], "def_wn": "the cardinal number that is the sum of eight and one"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/patch-camelyon_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "def_wiki": "Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue filled with lymphocytes and macrophages that collect and destroy bacteria, viruses and foreign matter from lymph. 
When the body is fighting an infection, these lymphocytes multiply rapidly and produce a characteristic swelling of the lymph nodes.", "path_wn": "", "def_wn": ""}, {"classname": "lymph node containing metastatic tumor tissue", "def_wiki": "Thin, woven, gauze-like fabric.", "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/rendered-sst2_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "def_wiki": "Not positive nor neutral.", "path_wn": ["negative", "denial", "speech_act", "act", "event", "psychological_feature", "abstraction", "entity"], "def_wn": "a reply of denial"}, {"classname": "positive", "def_wiki": "Not negative or neutral.", "path_wn": ["positive", "adjective", "modifier", "content_word", "word", "language_unit", "part", "relation", "abstraction", "entity"], "def_wn": "the primary form of an adjective or adverb; denotes a quality without qualification, comparison, or relation to increase or diminution"}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/external/voc-2007-classification_knowledge.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "aeroplane", "def_wiki": "A powered heavier-than-air aircraft with fixed wings.", "path_wn": ["airplane", "heavier-than-air_craft", "aircraft", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an aircraft that has a fixed wing and is powered by propellers or jets"}, {"classname": "bicycle", "def_wiki": "A vehicle that has two wheels, one behind the other, a steering handle, and a saddle seat or seats and is usually propelled by the action of a rider\u2019s feet upon pedals.", "path_wn": ["bicycle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a wheeled vehicle that has two wheels and is moved by foot pedals"}, {"classname": "bird", "def_wiki": "A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", "path_wn": ["bird", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"}, {"classname": "boat", "def_wiki": "A craft used for transportation of goods, fishing, racing, recreational cruising, or military use on or in the water, propelled by oars or outboard motor or inboard motor or by wind.", "path_wn": ["boat", "vessel", "craft", "vehicle", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a small vessel for travel on water"}, {"classname": "bottle", "def_wiki": "A container, typically made of glass or plastic and having a tapered neck, used primarily for holding liquids.", "path_wn": ["bottle", "vessel", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a glass or plastic vessel used for storing drinks or other liquids; typically cylindrical without handles and with a narrow neck that can be plugged or capped"}, {"classname": "bus", "def_wiki": "A motor vehicle for transporting 
large numbers of people along roads.", "path_wn": ["bus", "public_transport", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a vehicle carrying many passengers; used for public transport"}, {"classname": "car", "def_wiki": "A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", "path_wn": ["car", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a motor vehicle with four wheels; usually propelled by an internal combustion engine"}, {"classname": "cat", "def_wiki": "An animal of the family Felidae:\n Synonym: felid", "path_wn": ["cat", "feline", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats"}, {"classname": "chair", "def_wiki": "An item of furniture used to sit on or in, comprising a seat, legs or wheels, back, and sometimes arm rests, for use by one person. Compare stool, couch, sofa, settee, loveseat and bench.", "path_wn": ["chair", "seat", "furniture", "furnishing", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "a seat for one person, with a support for the back"}, {"classname": "cow", "def_wiki": "An adult female of the species Bos taurus, especially one that has calved.", "path_wn": ["cow", "cattle", "bovine", "bovid", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "female of domestic cattle:"}, {"classname": "diningtable", "def_wiki": null, "path_wn": "", "def_wn": ""}, {"classname": "dog", "def_wiki": "A mammal, Canis familiaris or Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.", "path_wn": ["dog", "canine", "carnivore", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds"}, {"classname": "horse", "def_wiki": "Any of several animals related to Equus ferus caballus.", "path_wn": ["horse", "equine", "odd-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "solid-hoofed herbivorous quadruped domesticated since prehistoric times"}, {"classname": "motorbike", "def_wiki": "A motorcycle.", "path_wn": ["minibike", "motorcycle", "motor_vehicle", "self-propelled_vehicle", "wheeled_vehicle", "container", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "small motorcycle with a low frame and small wheels and elevated handlebars"}, {"classname": "person", "def_wiki": "An individual; usually a human being. 
[from 13th c.]\n\nEach person is unique, both mentally and physically.", "path_wn": ["person", "causal_agent", "physical_entity", "entity"], "def_wn": "a human being"}, {"classname": "pottedplant", "def_wiki": null, "path_wn": "", "def_wn": ""}, {"classname": "sheep", "def_wiki": "A woolly ruminant of the genus Ovis.", "path_wn": ["sheep", "bovid", "ruminant", "even-toed_ungulate", "ungulate", "placental", "mammal", "vertebrate", "chordate", "animal", "organism", "living_thing", "whole", "object", "physical_entity", "entity"], "def_wn": "woolly usually horned ruminant mammal related to the goat"}, {"classname": "sofa", "def_wiki": "A raised area of a building's floor, usually covered with carpeting, used for sitting.", "path_wn": ["sofa", "seat", "furniture", "furnishing", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "an upholstered seat for more than one person"}, {"classname": "train", "def_wiki": "Elongated portion.", "path_wn": ["train", "public_transport", "conveyance", "instrumentality", "artifact", "whole", "object", "physical_entity", "entity"], "def_wn": "public transport provided by a line of railway cars coupled together and drawn by a locomotive"}, {"classname": "tvmonitor", "def_wiki": null, "path_wn": "", "def_wn": ""}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_cifar-10.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "airplane", "gpt3": [" A machine that is used for flying through the air.", " A vehicle that flies through the air under its own power.", " A machine that is used for flying through the air.", " Any of various heavier-than-air craft that travel by means of aerodynamic lift, as opposed to lighter-than-air craft such as balloons.", " A heavier-than-air craft that derives its lift from aerodynamic forces and that depends on its engine for propulsion."]}, {"classname": "automobile", "gpt3": [" A wheeled vehicle that runs on land and is propelled by an engine.", " A motor vehicle.", " A type of vehicle designed to move on the ground under its own stored power and intended to carry a driver, a small number of additional passengers, and a very limited amount of other load. 
A car or motorcar.", " A motor vehicle with four wheels; usually propelled by an internal-combustion engine.", " A motor vehicle with four wheels; typically propelled by an internal combustion engine."]}, {"classname": "bird", "gpt3": [" A warm-blooded vertebrate with wings and feathers and a beak, and able to fly.", " Any of numerous warm-blooded egg-laying vertebrates of the class Aves, having a body covered with feathers and forelimbs modified into wings.", " A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs.", " A warm-blooded vertebrate with wings, feathers, a beak, and usually the power of flight.", " A member of the class of animals Aves in the phylum Chordata, characterized by being warm-blooded, having feathers and wings usually capable of flight, having a beaked mouth, and laying eggs."]}, {"classname": "cat", "gpt3": [" Any of numerous carnivorous mammals of the family Felidae, having thick soft fur, a short snout, and usually sharp retractile claws.", " A small domesticated carnivorous mammal, Felis catus, having thick soft fur, a short snout, and retractile claws.", " Any of numerous carnivorous mammals of the family Felidae, having thick soft fur, a short tail, and usually sharp retractile claws.", " A small domesticated carnivorous mammal with soft fur, a short snout, and retractile claws.", " A small domesticated carnivorous mammal with soft fur, a short snout, and retractile claws."]}, {"classname": "deer", "gpt3": [" Any of various ruminants of the family Cervidae, such as the red deer or elk, having antlers in the male.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla.", " A ruminant mammal with antlers and hooves of the family Cervidae, or one of several similar animals from related families of the order Artiodactyla."]}, {"classname": "dog", "gpt3": [" A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, and a barking, howling, or whining voice.", " A member of the genus \"Canis\" (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds; has a long association with humans; and is widely kept as a pet or for work"]}, {"classname": "frog", "gpt3": [" Any of numerous tailless stout-bodied amphibians of the order Anura, having a short broad head, short limbs, and webbed toes.", " Any of numerous tailless stout-bodied amphibians with long hind legs for leaping, short forelegs for hopping, and a long, tapering, usually pointed head.", " Any of numerous tailless stout-bodied amphibians with long hind legs adapted for leaping, 
found in a wide variety of habitats.", " A small tailless amphibian of the order Anura that typically hops.", " A small tailless amphibian of the order Anura that typically hops."]}, {"classname": "horse", "gpt3": [" A hoofed mammal of the family Equidae, related to the ass, having a thick, shaggy coat, a long head, and a long, slender, muscular neck.", " A hoofed mammal of the genus \"Equus\", domesticated since prehistoric times.", " A hoofed mammal of the genus Equus, domesticated since prehistoric times.", " A large hoofed mammal, Equus caballus, domesticated since prehistoric times, having a thick, shaggy coat, a heavy body, and a large head with a short, pointed muzzle.", " A hoofed mammal of the family Equidae, related to the zebra."]}, {"classname": "ship", "gpt3": [" A water-borne vessel generally larger than a boat.", " A water-borne vessel generally larger than a boat.", " A large vessel for transporting goods or passengers, typically ocean-going.", " A large vessel for transporting goods or passengers.", " A large vessel for transporting goods or passengers, typically ocean-going."]}, {"classname": "truck", "gpt3": [" A motor vehicle with a large carrying capacity, typically having a boxlike body and a relatively small passenger compartment.", " A motor vehicle with a large carrying capacity, typically having a long wheelbase and an enclosed body.", " A motor vehicle with a large carrying capacity, typically having a boxlike body and two rear wheels.", " A motor vehicle with a large carrying capacity, typically having a boxlike body and two rear wheels.", " A motor vehicle with a large carrying space, typically having a boxlike body and a separate cab."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "annual crop land", "gpt3": [" arable land", " arable land", " land used for growing crops that are harvested once a year", " Land that is used to grow crops for one year.", " arable land"]}, {"classname": "forest", "gpt3": [" a large area of land covered with trees", " land covered with trees", " a tract of land covered with trees and underbrush", " A large area of land covered with trees.", " A tract of land covered with trees and undergrowth, larger than woods."]}, {"classname": "brushland or shrubland", "gpt3": [" A land area covered with low-growing woody plants, such as bushes, small trees, and shrubs.", " Land that is covered mostly with shrubs.", " land covered with bushes, shrubs, and small trees.", " land covered with low, scrubby vegetation", " land covered with low, scrubby vegetation, especially thorny bushes."]}, {"classname": "highway or road", "gpt3": [" A way or course for the passage of vehicles, persons, and merchandise, usually including paved or graveled surface, curbs, and usually sidewalks.", " a way (usually public) for the transportation of people or goods", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. 
In the UK both senses are heard", " A way used for travelling between places, originally one wide enough to allow foot passengers and horses to travel, now (US) usually one surfaced with asphalt or concrete and designed to accommodate many vehicles travelling in both directions. In the UK both senses are heard", " A way or path for the travel of people or vehicles."]}, {"classname": "industrial buildings or commercial buildings", "gpt3": [" The act or process by which something is built; construction.", " A building or group of buildings where goods are produced, processed or manufactured, or where services are provided.", " A building or group of buildings where goods are manufactured, processed or repaired, or goods and services are sold.", " The act or process by which something is built; construction.", " Buildings used for manufacturing or storing and selling goods."]}, {"classname": "pasture land", "gpt3": [" grassland used for grazing livestock", " Land used for grazing.", " land used for grazing animals", " Land used for grazing.", " Land used for grazing."]}, {"classname": "permanent crop land", "gpt3": [" land used for growing crops that are not harvested annually", " arable land", " Land that is used for growing crops that are not intended for sale or for feeding livestock.", " Land used for growing crops that are not intended for harvest in the current year.", " land used for growing crops that are not harvested annually, such as trees, shrubs, and vines."]}, {"classname": "residential buildings or homes or apartments", "gpt3": [" A complete domicile occupying only part of a building, especially one for rent; a flat.", " A building or group of buildings containing a number of homes.", " The act or process by which something is built; construction.", " buildings where people live", " A complete domicile occupying only part of a building, especially one for rent; a flat."]}, {"classname": "river", "gpt3": [" A large natural stream of water flowing in a channel to the sea, a lake, or another river.", " a natural stream of water larger than a creek", " A natural stream of water of considerable volume flowing in a definite course from higher to lower ground.", " A large and often winding stream which drains a land mass, carrying water down from higher areas to a lower point, oftentimes ending in another body of water, such as an ocean or in an inland sea.", " A large natural stream of water (larger than a creek) flowing in a channel on the surface of the earth."]}, {"classname": "lake or sea", "gpt3": [" A large body of salt water.", " a large body of water surrounded by land", " A large body of water surrounded by land.", " A large body of water surrounded by land.", " A large body of water surrounded by land."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_fer-2013.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "angry", "gpt3": [" Feeling or showing anger.", " Displaying or feeling anger.", " Feeling or showing anger; wrathful; irate.", " Displaying or feeling anger.", " A feeling of displeasure and antagonism aroused by a wrong; wrath; ire; rage."]}, {"classname": "disgusted", "gpt3": [" Having a feeling of loathing or revulsion; repulsed.", " Feeling or displaying disgust.", " Affected with a feeling of loathing or repugnance; having a feeling of extreme repugnance or aversion.", " Filled with disgust.", " Feeling or showing a strong 
dislike of something."]}, {"classname": "fearful", "gpt3": [" Causing fear.", " Causing fear.", " Frightening.", " Frightening.", " Causing fear or dread; frightening."]}, {"classname": "happy", "gpt3": [" Emotionally positive.", " Feeling or showing pleasure and satisfaction.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous.", " Having a feeling arising from a consciousness of well-being or of enjoyment; enjoying good of any kind, such as comfort, peace, or tranquillity; blissful, contented, joyous."]}, {"classname": "neutral", "gpt3": [" Not showing any emotion.", " Having no particular feeling or interest; not caring one way or the other; indifferent.", " Not taking sides in a conflict such as war; nonaligned.", " Neither positive nor negative.", " Neither good nor bad."]}, {"classname": "sad", "gpt3": [" Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; gloomy; dismal; dreary; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless; dismal; gloomy; dismal; cheerless", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness.", " Feeling or showing sorrow or unhappiness; mournful; melancholy; dismal; gloomy; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; dreary; dismal; cheerless; d"]}, {"classname": "surprised", "gpt3": [" Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Caused to feel surprise, amazement or wonder, or showing an emotion due to an unexpected event.", " Astonished; astounded; amazed; dumbfounded; flabbergasted; flummoxed; flabbergasted; thunderstruck; dumbstruck; thunderstruck; dumbstruck; thunderstruck; dumbstruck;", " Surprised.", " Surprised."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_hateful-memes.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}, {"classname": "hatespeech meme", "gpt3": [" Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " 
Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes.", " Any unit of cultural information, such as a practice or idea, that is transmitted verbally or by repeated action from one mind to another in a comparable way to the transmission of genes."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_kitti-distance.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "a photo i took of a car on my left or right side.", "gpt3": [" The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project.", " beside, next to, by", " a wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " The side of a fabric, often with a more visible color or pattern, that is intended to face outward on a finished project."]}, {"classname": "a photo i took with a car nearby.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " near; close", " Nearby; close by; close at hand; close to hand; close by; close to; close on; close to at hand; close to hand; close by; close to; close on; close to at hand; close to hand;"]}, {"classname": "a photo i took with a car in the distance.", "gpt3": [" far away; a long distance away", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " far away; a long distance away", " distant, far away, far off", " far away; a long distance away"]}, {"classname": "a photo i took with no car.", "gpt3": [" A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation.", " no car; no cars", " A wheeled vehicle that moves independently, with at least three wheels, powered mechanically, steered by a driver and mostly for personal transportation."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_mnist.tsv: -------------------------------------------------------------------------------- 1 | 
[{"classname": "0", "gpt3": [" 0.", " The number zero (0).", " A particle used for marking the following verb as an infinitive.", " 0.", " To be in a state of confusion."]}, {"classname": "1", "gpt3": [" 1.", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The act of ingesting.", " The number one (1)."]}, {"classname": "2", "gpt3": [" A particle used for marking the following verb as an infinitive.", " The number two (2).", " The number two (2).", " A particle used for marking the following verb as an infinitive.", " The first person to visit the moon."]}, {"classname": "3", "gpt3": [" 0.", " The CIA.", " Because, as, since.", " A particle used for marking the following verb as an infinitive.", " Because, as, since."]}, {"classname": "4", "gpt3": [" The number four (4).", " The first of the four basic operations of arithmetic, that is, the operation of finding the remainder when one number is divided by another.", " Because, as, since.", " A type of small, flat, round cake.", " To be in a state of disrepair."]}, {"classname": "5", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " A kind of animal."]}, {"classname": "6", "gpt3": [" A particle used for marking the following verb as an infinitive.", " To be able to.", " The first person to be killed in the novel.", " The number six (6).", " Because, as, since."]}, {"classname": "7", "gpt3": [" To be in a state of disrepair.", " To be ingested.", " To be in a state of confusion.", " To be in a state of being.", " Because, as, since."]}, {"classname": "8", "gpt3": [" To ingest; to be ingested.", " To be in a certain state.", " Because, as, since.", " Because, as, since.", " Because, as, since."]}, {"classname": "9", "gpt3": [" The number nine (9).", " A particle used for marking the preceding noun as a subject.", " The number nine (9).", " To be in a state of being.", " The first person to do something."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_patch-camelyon.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "lymph node", "gpt3": [" A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infection by trapping ger", " Each of the small oval bodies of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of connective tissue", " Thin, woven, gauze-like fabric.", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. They help the body fight infections by trapping ger", " A small, oval-shaped organ that is part of the lymphatic system. Lymph nodes are located in the neck, under the arms, in the groin, and in other areas of the body. 
They help the body fight infections by trapping ger"]}, {"classname": "lymph node containing metastatic tumor tissue", "gpt3": [" Thin, woven, gauze-like fabric.", " Thin, woven, gauze-like fabric.", " A small, oval, soft, elastic body of the lymphatic system, distributed along the lymphatic vessels, that are clustered in the armpits, groin, neck, chest and abdomen. They act as filters, with an internal honeycomb of", " Thin, woven, gauze-like fabric.", " A small, oval, solid organ of the lymphatic system, distributed along the lymphatic vessels, that acts as a filter for bacteria, viruses, and foreign matter."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/knowledge/gpt3/GPT3_rendered-sst2.tsv: -------------------------------------------------------------------------------- 1 | [{"classname": "negative", "gpt3": [" Not positive or neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral.", " Not positive nor neutral."]}, {"classname": "positive", "gpt3": [" Not negative or neutral.", " Not negative or neutral.", " Not negative nor neutral.", " Not negative nor neutral.", " Not negative or neutral."]}] -------------------------------------------------------------------------------- /vision_benchmark/resources/model/clip_example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_example 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' 16 | # Following configuration is needed for CLIP model. 17 | SPEC: 18 | TEXT: 19 | TOKENIZER: clip 20 | STYLE: clip 21 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/clip_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: clip_swin 12 | NUM_PARAMS_IN_M: 11.0 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K_YFCC15M' 15 | CREATION_TIME: '2021-10-27' 16 | # Following configuration is needed for CLIP model. 17 | PRETRAINED: '' 18 | PRETRAINED_LAYERS: ['*'] 19 | SPEC: 20 | EMBED_DIM: 512 21 | GATHER_TENSORS: True 22 | TEXT: 23 | TOKENIZER: clip 24 | CONTEXT_LENGTH: 77 25 | WIDTH: 512 26 | HEADS: 8 27 | LAYERS: 12 28 | VISION: 29 | PATCH_SIZE: 4 30 | IN_CHANS: 3 31 | EMBED_DIM: 96 32 | DEPTHS: [2, 2, 6, 2] 33 | NUM_HEADS: [3, 6, 12, 24] 34 | WINDOW_SIZE: 7 35 | MLP_RATIO: 4. 36 | QKV_BIAS: True 37 | APE: False 38 | PATCH_NORM: True 39 | DROP_RATE: 0.0 40 | DROP_PATH_RATE: 0.0 41 | 42 | KNOWLEDGE: 43 | WORDNET: 44 | USE_HIERARCHY: False # False 45 | USE_DEFINITION: False # True 46 | 47 | # DATASET: 48 | # DATASET: 'imagenet' 49 | # ROOT: ../../data/zeroshot/classification/imagenet 50 | OUTPUT_DIR: /home/chunyl/azure_mount/chunyleu_output/cvinwild/ic_benchmark/debug/swin_tiny/unicl_imagenet21k 51 | # ../../output/hcl_exp/hcl_yfcc15m_half_imagenet22k_half/wordnet_h_true_d_false 52 | TEST: 53 | MODEL_FILE: '/home/chunyl/azure_mount/chunyleu_output/ckpts/benchmark/swin_tiny/unicl_imagenet21k/model_state_dict.pt' 54 | BATCH_SIZE_PER_GPU: 128 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 
63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 69 | 70 | # hcl_imagenet_21k_wiki 71 | # hcl_imagenet21k 72 | # hcl_yfcc15m_half_imagenet21k_half_multitask 73 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_half_imagenet22k_half/model_state_dict.pt' 74 | 75 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k_multitask/model_state_dict.pt' 76 | # '/home/msrdl/azure_mounts/exp_output/ckpts/hcl/hcl_swin_tiny/hcl_yfcc15m_imagenet22k/model_state_dict.pt' 77 | 78 | # hcl_imagenet22k hcl_yfcc15m hcl_yfcc15m_half_imagenet21k_half hcl_yfcc15m_half_imagenet22k_half hcl_yfcc15m_imagenet21k hcl_yfcc15m_imagenet22k hcl_yfcc15m_imagenet22k_multitask 79 | # hcl_imagenet1k 80 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/deit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/DEIT_BASE_PATCH16_224/' 3 | 4 | MODEL: 5 | NAME: deit_base_patch16_224 6 | NUM_PARAMS_IN_M: 86.5 7 | AUTHOR: 'timm' 8 | PRETRAINED_DATA: 'ImageNet1K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | 13 | TEST: 14 | BATCH_SIZE_PER_GPU: 128 15 | MODEL_FILE: '' 16 | 17 | TRAIN: 18 | BATCH_SIZE_PER_GPU: 64 19 | BEGIN_EPOCH: 0 20 | END_EPOCH: 10 21 | EXTRA_FINAL_TRAIN_EPOCH: 40 22 | OPTIMIZER: sgd 23 | WD: 0. 24 | MOMENTUM: 0.9 25 | NESTEROV: false 26 | SHUFFLE: true 27 | LR_SCHEDULER: 28 | METHOD: 'WarmupCosine' 29 | WARMUP_EPOCH: 5 30 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/example.yaml: -------------------------------------------------------------------------------- 1 | INPUT: 2 | MEAN: 3 | - 0.485 4 | - 0.456 5 | - 0.406 6 | STD: 7 | - 0.229 8 | - 0.224 9 | - 0.225 10 | MODEL: 11 | NAME: cls_example 12 | NUM_PARAMS_IN_M: 11 13 | AUTHOR: 'MSFT' 14 | PRETRAINED_DATA: 'ImageNet22K' 15 | CREATION_TIME: '2019-05-27' -------------------------------------------------------------------------------- /vision_benchmark/resources/model/mae_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mae_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 
30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/mocov3_vitb16.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/MAE_VIT_BASE_16/' 3 | 4 | MODEL: 5 | NAME: mocov3_vitb16 6 | NUM_PARAMS_IN_M: 86.6 7 | AUTHOR: 'Facebook' 8 | PRETRAINED_DATA: 'ImageNet22K' 9 | CREATION_TIME: '2020-10-13' 10 | SPEC: 11 | EMBED_DIM: 768 12 | PATCH_SIZE: 16 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | MLP_RATIO: 4 16 | QKV_BIAS: True 17 | GLOBAL_POOL: True 18 | 19 | TEST: 20 | BATCH_SIZE_PER_GPU: 128 21 | MODEL_FILE: 'https://dl.fbaipublicfiles.com/moco-v3/vit-b-300ep/vit-b-300ep.pth.tar' 22 | 23 | TRAIN: 24 | BATCH_SIZE_PER_GPU: 64 25 | BEGIN_EPOCH: 0 26 | END_EPOCH: 10 27 | EXTRA_FINAL_TRAIN_EPOCH: 40 28 | OPTIMIZER: sgd 29 | WD: 0. 30 | MOMENTUM: 0.9 31 | NESTEROV: false 32 | SHUFFLE: true 33 | LR_SCHEDULER: 34 | METHOD: 'WarmupCosine' 35 | WARMUP_EPOCH: 5 36 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitG14_OpenCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 1280 17 | USE_QUICK_GELU: False 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 14 21 | WIDTH: 1664 22 | LAYERS: 48 23 | HEADS: 16 24 | MLP_RATIO: 4.9231 25 | USE_RCP_BLOCK: true 26 | TEXT: 27 | TOKENIZER: clip 28 | STYLE: clip 29 | CONTEXT_LENGTH: 77 30 | WIDTH: 1280 31 | HEADS: 20 32 | LAYERS: 32 33 | USE_RCP_BLOCK: False 34 | RCP_BLOCK: 35 | MODE: gated_attn 36 | GUMBEL_SAMPLE: False 37 | USE_LAST_K: 12 38 | 39 | TEST: 40 | BATCH_SIZE_PER_GPU: 128 41 | MODEL_FILE: 'hf:react-vl/react-in1k:openclip-vit-bigG-14-gated-image-laion2b.pt' 42 | 43 | TRAIN: 44 | BATCH_SIZE_PER_GPU: 64 45 | BEGIN_EPOCH: 0 46 | END_EPOCH: 10 47 | EXTRA_FINAL_TRAIN_EPOCH: 40 48 | OPTIMIZER: sgd 49 | WD: 0. 50 | MOMENTUM: 0.9 51 | NESTEROV: false 52 | SHUFFLE: true 53 | LR_SCHEDULER: 54 | METHOD: 'WarmupCosine' 55 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model.
15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 768 21 | LAYERS: 12 22 | USE_RCP_BLOCK: true 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | USE_RCP_BLOCK: false 32 | RCP_BLOCK: 33 | MODE: gated_attn 34 | GUMBEL_SAMPLE: False 35 | USE_LAST_K: 6 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-base-16-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0. 48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 768 21 | LAYERS: 12 22 | USE_RCP_BLOCK: true 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | USE_RCP_BLOCK: false 32 | RCP_BLOCK: 33 | MODE: gated_attn 34 | GUMBEL_SAMPLE: False 35 | USE_LAST_K: 6 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-base-32-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0. 48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/react_vitl14_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'clip_react' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'Haotian Liu' 12 | PRETRAINED_DATA: 'CLIP-data/REACT-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 768 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 14 20 | WIDTH: 1024 21 | LAYERS: 24 22 | USE_RCP_BLOCK: True 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | WIDTH: 768 28 | HEADS: 12 29 | LAYERS: 12 30 | USE_RCP_BLOCK: False 31 | RCP_BLOCK: 32 | MODE: gated_attn 33 | USE_LAST_K: 6 34 | USE_FFN: True 35 | WIDTH: 1024 36 | 37 | TEST: 38 | BATCH_SIZE_PER_GPU: 128 39 | MODEL_FILE: 'hf:react-vl/react-in1k:clip-vit-large-14-gated-image.pt' 40 | 41 | TRAIN: 42 | BATCH_SIZE_PER_GPU: 64 43 | BEGIN_EPOCH: 0 44 | END_EPOCH: 10 45 | EXTRA_FINAL_TRAIN_EPOCH: 40 46 | OPTIMIZER: sgd 47 | WD: 0.
48 | MOMENTUM: 0.9 49 | NESTEROV: false 50 | SHUFFLE: true 51 | LR_SCHEDULER: 52 | METHOD: 'WarmupCosine' 53 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vit_base_patch16_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: 'OUTPUT/VIT_BASE_PATCH16_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch16_224 10 | NUM_PARAMS_IN_M: 86.5 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vit_base_patch32_224.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VIT_BASE_PATCH32_224/' 3 | 4 | INPUT: 5 | MEAN: [0.5, 0.5, 0.5] 6 | STD: [0.5, 0.5, 0.5] 7 | 8 | MODEL: 9 | NAME: vit_base_patch32_224 10 | NUM_PARAMS_IN_M: 88.2 11 | AUTHOR: 'timm' 12 | PRETRAINED_DATA: 'ImageNet22K' 13 | CREATION_TIME: '2020-10-13' 14 | SPEC: 15 | EMBED_DIM: 768 16 | 17 | TEST: 18 | BATCH_SIZE_PER_GPU: 128 19 | MODEL_FILE: '' 20 | 21 | TRAIN: 22 | BATCH_SIZE_PER_GPU: 64 23 | BEGIN_EPOCH: 0 24 | END_EPOCH: 10 25 | EXTRA_FINAL_TRAIN_EPOCH: 40 26 | OPTIMIZER: sgd 27 | WD: 0. 28 | MOMENTUM: 0.9 29 | NESTEROV: false 30 | SHUFFLE: true 31 | LR_SCHEDULER: 32 | METHOD: 'WarmupCosine' 33 | WARMUP_EPOCH: 5 34 | -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb16_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/16' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 16 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 
42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_CLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'ViT-B/32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'OpenAI' 12 | PRETRAINED_DATA: 'CLIP-data' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | 31 | TEST: 32 | BATCH_SIZE_PER_GPU: 128 33 | MODEL_FILE: '' 34 | 35 | TRAIN: 36 | BATCH_SIZE_PER_GPU: 64 37 | BEGIN_EPOCH: 0 38 | END_EPOCH: 10 39 | EXTRA_FINAL_TRAIN_EPOCH: 40 40 | OPTIMIZER: sgd 41 | WD: 0. 42 | MOMENTUM: 0.9 43 | NESTEROV: false 44 | SHUFFLE: true 45 | LR_SCHEDULER: 46 | METHOD: 'WarmupCosine' 47 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_DeCLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 3072 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 3072 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 3072 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0.
59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_DeCLIP_YFCC15M.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'declip_yfcc_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'DeCLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 3072 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | # clip: 42 | # use_allgather: True 43 | # text_mask_type: MLM 44 | # return_nn_bank: True 45 | # EDA: True 46 | # feature_dim: 512 47 | 48 | TEST: 49 | BATCH_SIZE_PER_GPU: 128 50 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/DeCLIP_YFCC15M_vitb32.pth.tar' 51 | 52 | TRAIN: 53 | BATCH_SIZE_PER_GPU: 64 54 | BEGIN_EPOCH: 0 55 | END_EPOCH: 10 56 | EXTRA_FINAL_TRAIN_EPOCH: 40 57 | OPTIMIZER: sgd 58 | WD: 0. 59 | MOMENTUM: 0.9 60 | NESTEROV: false 61 | SHUFFLE: true 62 | LR_SCHEDULER: 63 | METHOD: 'WarmupCosine' 64 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_FILIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'filip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'FILIP' 12 | PRETRAINED_DATA: 'DeCLIP-88M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model.
15 | SPEC: 16 | EMBED_DIM: 768 17 | DENSE_EVAL: true 18 | VISION: 19 | MODEL: vit 20 | PATCH_SIZE: 32 21 | WIDTH: 384 22 | LAYERS: 12 23 | TEXT: 24 | TOKENIZER: clip 25 | STYLE: clip 26 | CONTEXT_LENGTH: 77 27 | VOCAB_SIZE: 49408 28 | WIDTH: 512 29 | HEADS: 8 30 | LAYERS: 12 31 | SKIP_TOKENIZE: true 32 | DECLIP: 33 | image_encode: 34 | embed_dim: 768 35 | text_encode: 36 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 37 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 38 | text_model_utils: 39 | random: False 40 | freeze: False 41 | embed_dim: 768 42 | clip: 43 | mask_rate: 0.5 44 | patch_number: 14 45 | use_allgather: False 46 | text_mask_type: MLM 47 | return_nn_bank: False 48 | return_dense: True 49 | feature_dim: 768 50 | select_topk: True 51 | 52 | TEST: 53 | BATCH_SIZE_PER_GPU: 128 54 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/FILIP_YFCC15M_vitb32.pth.tar' 55 | 56 | TRAIN: 57 | BATCH_SIZE_PER_GPU: 64 58 | BEGIN_EPOCH: 0 59 | END_EPOCH: 10 60 | EXTRA_FINAL_TRAIN_EPOCH: 40 61 | OPTIMIZER: sgd 62 | WD: 0. 63 | MOMENTUM: 0.9 64 | NESTEROV: false 65 | SHUFFLE: true 66 | LR_SCHEDULER: 67 | METHOD: 'WarmupCosine' 68 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/resources/model/vitb32_SLIP.yaml: -------------------------------------------------------------------------------- 1 | # GPUS: (0,) 2 | OUTPUT_DIR: '../../OUTPUT/VITB32_CLIP/' 3 | 4 | INPUT: 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] 7 | 8 | MODEL: 9 | NAME: 'slip_vitb32' 10 | NUM_PARAMS_IN_M: 151.2 11 | AUTHOR: 'SLIP' 12 | PRETRAINED_DATA: 'YFCC-15M' 13 | CREATION_TIME: '2021-01-05' 14 | # Following configuration is needed for running linear probe with PyTorch based linear model. 15 | SPEC: 16 | EMBED_DIM: 512 17 | VISION: 18 | MODEL: vit 19 | PATCH_SIZE: 32 20 | WIDTH: 384 21 | LAYERS: 12 22 | TEXT: 23 | TOKENIZER: clip 24 | STYLE: clip 25 | CONTEXT_LENGTH: 77 26 | VOCAB_SIZE: 49408 27 | WIDTH: 512 28 | HEADS: 8 29 | LAYERS: 12 30 | SKIP_TOKENIZE: true 31 | DECLIP: 32 | image_encode: 33 | embed_dim: 512 34 | text_encode: 35 | bpe_path: 'bpe_simple_vocab_16e6.txt.gz' 36 | text_encode_type: Transformer #Transformer,Bert,GPT2,Bert_gvx 37 | text_model_utils: 38 | random: False 39 | freeze: False 40 | embed_dim: 512 41 | clip: 42 | use_allgather: False 43 | return_sim: True 44 | feature_dim: 768 45 | sim_dim: 256 46 | 47 | TEST: 48 | BATCH_SIZE_PER_GPU: 128 49 | MODEL_FILE: 'https://haotliudb.blob.core.windows.net/checkpoints/icinw/SLIP_YFCC15M_vitb32.pth.tar' 50 | 51 | TRAIN: 52 | BATCH_SIZE_PER_GPU: 64 53 | BEGIN_EPOCH: 0 54 | END_EPOCH: 10 55 | EXTRA_FINAL_TRAIN_EPOCH: 40 56 | OPTIMIZER: sgd 57 | WD: 0. 58 | MOMENTUM: 0.9 59 | NESTEROV: false 60 | SHUFFLE: true 61 | LR_SCHEDULER: 62 | METHOD: 'WarmupCosine' 63 | WARMUP_EPOCH: 5 -------------------------------------------------------------------------------- /vision_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .comm import comm 2 | from .utils import create_logger 3 | 4 | __all__ = ['comm', 'create_logger'] 5 | -------------------------------------------------------------------------------- /vision_benchmark/utils/comm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains primitives for multi-gpu communication. 3 | This is useful when doing distributed training.
4 | """ 5 | 6 | import pickle 7 | 8 | import torch 9 | import torch.distributed as dist 10 | 11 | 12 | class Comm(object): 13 | def __init__(self): 14 | self.local_rank = 0 15 | 16 | @property 17 | def world_size(self): 18 | if not dist.is_available(): 19 | return 1 20 | if not dist.is_initialized(): 21 | return 1 22 | return dist.get_world_size() 23 | 24 | @property 25 | def rank(self): 26 | if not dist.is_available(): 27 | return 0 28 | if not dist.is_initialized(): 29 | return 0 30 | return dist.get_rank() 31 | 32 | @property 33 | def local_rank(self): 34 | if not dist.is_available(): 35 | return 0 36 | if not dist.is_initialized(): 37 | return 0 38 | return self._local_rank 39 | 40 | @local_rank.setter 41 | def local_rank(self, value): 42 | if not dist.is_available(): 43 | self._local_rank = 0 44 | if not dist.is_initialized(): 45 | self._local_rank = 0 46 | self._local_rank = value 47 | 48 | @property 49 | def head(self): 50 | return 'Rank[{}/{}]'.format(self.rank, self.world_size) 51 | 52 | def is_main_process(self): 53 | return self.rank == 0 54 | 55 | def synchronize(self): 56 | """ 57 | Helper function to synchronize (barrier) among all processes when 58 | using distributed training 59 | """ 60 | if self.world_size == 1: 61 | return 62 | dist.barrier() 63 | 64 | 65 | comm = Comm() 66 | 67 | 68 | def all_gather(data): 69 | """ 70 | Run all_gather on arbitrary picklable data (not necessarily tensors) 71 | Args: 72 | data: any picklable object 73 | Returns: 74 | list[data]: list of data gathered from each rank 75 | """ 76 | world_size = comm.world_size 77 | if world_size == 1: 78 | return [data] 79 | 80 | # serialized to a Tensor 81 | buffer = pickle.dumps(data) 82 | storage = torch.ByteStorage.from_buffer(buffer) 83 | tensor = torch.ByteTensor(storage).to("cuda") 84 | 85 | # obtain Tensor size of each rank 86 | local_size = torch.LongTensor([tensor.numel()]).to("cuda") 87 | size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] 88 | dist.all_gather(size_list, local_size) 89 | size_list = [int(size.item()) for size in size_list] 90 | max_size = max(size_list) 91 | 92 | # receiving Tensor from all ranks 93 | # we pad the tensor because torch all_gather does not support 94 | # gathering tensors of different shapes 95 | tensor_list = [] 96 | for _ in size_list: 97 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 98 | if local_size != max_size: 99 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 100 | tensor = torch.cat((tensor, padding), dim=0) 101 | dist.all_gather(tensor_list, tensor) 102 | 103 | data_list = [] 104 | for size, tensor in zip(size_list, tensor_list): 105 | buffer = tensor.cpu().numpy().tobytes()[:size] 106 | data_list.append(pickle.loads(buffer)) 107 | 108 | return data_list 109 | 110 | 111 | def reduce_dict(input_dict, average=True): 112 | """ 113 | Args: 114 | input_dict (dict): all the values will be reduced 115 | average (bool): whether to do average or sum 116 | Reduce the values in the dictionary from all processes so that process with rank 117 | 0 has the averaged results. Returns a dict with the same fields as 118 | input_dict, after reduction. 
119 | """ 120 | world_size = comm.world_size 121 | if world_size < 2: 122 | return input_dict 123 | with torch.no_grad(): 124 | names = [] 125 | values = [] 126 | # sort the keys so that they are consistent across processes 127 | for k in sorted(input_dict.keys()): 128 | names.append(k) 129 | values.append(input_dict[k]) 130 | values = torch.stack(values, dim=0) 131 | dist.reduce(values, dst=0) 132 | if dist.get_rank() == 0 and average: 133 | # only main process gets accumulated, so only divide by 134 | # world_size in this case 135 | values /= world_size 136 | reduced_dict = {k: v for k, v in zip(names, values)} 137 | return reduced_dict 138 | 139 | 140 | def gather_tensors(tensor): 141 | """ 142 | Performs all_gather operation on the provided tensors. 143 | *** Warning ***: torch.distributed.all_gather has no gradient. 144 | """ 145 | tensors_gather = [ 146 | torch.ones_like(tensor) 147 | for _ in range(comm.world_size) 148 | ] 149 | 150 | dist.all_gather(tensors_gather, tensor, async_op=False) 151 | # need to do this to restore propagation of the gradients 152 | tensors_gather[comm.rank] = tensor 153 | output = torch.cat(tensors_gather, dim=0) 154 | return output 155 | -------------------------------------------------------------------------------- /vision_benchmark/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pathlib import Path 6 | 7 | import os 8 | import logging 9 | import time 10 | 11 | from .comm import comm 12 | 13 | 14 | def setup_logger(final_output_dir, rank, phase): 15 | time_str = time.strftime('%Y-%m-%d-%H-%M') 16 | log_file = f'{phase}_{time_str}_rank{rank}.txt' 17 | final_log_file = os.path.join(final_output_dir, log_file) 18 | head = "%(asctime)-15s:[P:%(process)d]:" + comm.head + ' %(message)s' 19 | logging.basicConfig( 20 | filename=str(final_log_file), format=head 21 | ) 22 | logger = logging.getLogger() 23 | logger.setLevel(logging.INFO) 24 | console = logging.StreamHandler() 25 | console.setFormatter( 26 | logging.Formatter(head) 27 | ) 28 | logging.getLogger('').addHandler(console) 29 | 30 | 31 | def create_logger(cfg, phase='train'): 32 | root_output_dir = Path(cfg.OUTPUT_DIR) 33 | dataset = cfg.DATASET.DATASET 34 | cfg_name = cfg.NAME 35 | 36 | final_output_dir = root_output_dir / dataset / cfg_name 37 | 38 | print('=> creating {} ...'.format(root_output_dir)) 39 | root_output_dir.mkdir(parents=True, exist_ok=True) 40 | print('=> creating {} ...'.format(final_output_dir)) 41 | final_output_dir.mkdir(parents=True, exist_ok=True) 42 | 43 | print('=> setup logger ...') 44 | setup_logger(final_output_dir, cfg.RANK, phase) 45 | 46 | return str(final_output_dir) 47 | 48 | --------------------------------------------------------------------------------
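
The GPT3_*.tsv knowledge files above each store their class descriptions as a single JSON array on one line (a list of {"classname": ..., "gpt3": [...]} records) rather than as tab-separated rows. Below is a minimal, illustrative sketch of how such a file could be loaded into a classname-to-definitions mapping; it is not the benchmark's own loader, and the helper name and example path are placeholders chosen for this sketch.

import json
from pathlib import Path


def load_gpt3_knowledge(tsv_path):
    """Map each classname to its list of GPT-3 generated definitions."""
    # Despite the .tsv extension, the whole file is one JSON array.
    records = json.loads(Path(tsv_path).read_text(encoding="utf-8").strip())
    # The raw definitions carry a leading space, so strip them.
    return {r["classname"]: [d.strip() for d in r["gpt3"]] for r in records}


if __name__ == "__main__":
    # Example path following the repository layout shown above.
    knowledge = load_gpt3_knowledge(
        "vision_benchmark/resources/knowledge/gpt3/GPT3_eurosat_clip.tsv"
    )
    for classname, definitions in knowledge.items():
        print(f"{classname}: {definitions[0]}")

Such a mapping could, for example, be used to append a generated definition to each class name when composing zero-shot text prompts.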