├── .flake8 ├── .github ├── CONTRIBUTING.md └── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ └── 3-documentation.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── INSTALLATION.md ├── LICENSE ├── README.md ├── README_zh.md ├── classification ├── README.md ├── config.py ├── configs │ ├── attn_pooling_probing │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml │ 
│ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ └── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml │ ├── intern_vit_6b_1k_224.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_a.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_r.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_real.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_sketch.yaml │ ├── intern_vit_6b_1k_224_test_imagenetv2.yaml │ └── linear_probing │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml │ │ ├── 
linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── 
linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ └── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml ├── dataset │ ├── __init__.py │ ├── build.py │ ├── cached_image_folder.py │ ├── imagenet_a_r_indices.py │ ├── imagenet_real.py │ ├── imagenetv2.py │ ├── samplers.py │ └── zipreader.py ├── ddp_hooks.py ├── gflops.py ├── hf2pytorch.py ├── logger.py ├── lr_scheduler.py ├── main.py ├── meta_data │ ├── 22k_class_to_idx.json │ ├── imagenet_classes.json │ ├── map22kto1k.txt │ ├── real.json │ ├── train.txt.zip │ └── val.txt.zip ├── models │ ├── __init__.py │ ├── build.py │ ├── clip_vit.py │ ├── flash_attention.py │ └── intern_vit_6b.py ├── optimizer.py ├── train_in1k.sh ├── utils.py └── work_dirs │ └── intern_vit_6b_1k_224 │ └── log_rank0.txt ├── clip_benchmark ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── benchmark.png ├── benchmark │ ├── README.md │ ├── benchmark.csv │ ├── dataset_type.csv │ ├── datasets.txt │ ├── datasets_multilingual.txt │ ├── models.txt │ ├── results.ipynb │ └── webdatasets.txt ├── clip_benchmark │ ├── __init__.py │ ├── cli.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ar_classnames.json │ │ ├── ar_zeroshot_classification_templates.json │ │ ├── birdsnap.py │ │ ├── builder.py │ │ ├── caltech101.py │ │ ├── cn_classnames.json │ │ ├── cn_zeroshot_classification_templates.json │ │ ├── cupl_prompts.json │ │ ├── en_classnames.json │ │ ├── en_zeroshot_classification_templates.json │ │ ├── flickr.py │ │ ├── imagenetv2.py │ │ ├── it_classnames.json │ │ ├── 
it_zeroshot_classification_templates.json │ │ ├── jp_classnames.json │ │ ├── jp_zeroshot_classification_templates.json │ │ ├── kitti.py │ │ ├── multilingual_mscoco.py │ │ ├── objectnet.py │ │ ├── tfds.py │ │ ├── tools.py │ │ └── voc2007.py │ ├── metrics │ │ ├── __init__.py │ │ ├── linear_probe.py │ │ ├── mscoco_generative.py │ │ ├── zeroshot_classification.py │ │ └── zeroshot_retrieval.py │ ├── model_collection.py │ ├── models │ │ ├── __init__.py │ │ ├── intern_vit_6b │ │ │ ├── configuration_intern_vit.py │ │ │ ├── flash_attention.py │ │ │ └── modeling_intern_vit.py │ │ ├── internvl.py │ │ ├── internvl_c_pytorch │ │ │ ├── __init__.py │ │ │ ├── chinese_alpaca_lora_7b │ │ │ │ ├── config.json │ │ │ │ ├── generation_config.json │ │ │ │ ├── pytorch_model.bin.index.json │ │ │ │ ├── special_tokens_map.json │ │ │ │ ├── tokenizer.model │ │ │ │ └── tokenizer_config.json │ │ │ ├── flash_attention.py │ │ │ └── internvl_c.py │ │ ├── internvl_huggingface │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ │ ├── japanese_clip.py │ │ └── open_clip.py │ └── webdataset_builder.py ├── data │ ├── birdsnap │ │ └── test_images_valid.txt │ ├── flickr30k │ │ └── flickr30k_cn_test.txt │ └── mscoco_captions │ │ └── coco-cn_test.json ├── probe_benchmark │ ├── PROBES.md │ ├── build_df_scaling_experiments.py │ ├── clip_table_2.csv │ ├── generate_table.py │ ├── gmacs_vs_perf_retrieval.pdf │ ├── imagenet_cifar_lp.pdf │ ├── imagenet_cifar_lp_vtab.pdf │ ├── laion5b_fewshot_experiments.py │ ├── openclip_results.csv │ ├── process_vtab.py │ ├── scaling_experiment_data2.json │ ├── scaling_experiment_data_vtab.json │ ├── scaling_experiments.py │ └── scaling_plot.ipynb ├── requirements-test.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_internvl_c_classification.sh ├── test_internvl_c_imagenet.sh ├── 
test_internvl_c_retrieval.sh ├── test_internvl_c_xtd.sh ├── test_internvl_g_classification.sh ├── test_internvl_g_imagenet.sh ├── test_internvl_g_retrieval.sh ├── test_internvl_g_retrieval_finetune.sh ├── test_internvl_g_xtd.sh ├── tests │ └── test_clip_benchmark.py └── tox.ini ├── internvl_chat ├── README.md ├── eval │ ├── README.md │ ├── caption │ │ ├── README.md │ │ └── evaluate_caption.py │ ├── domain_specific │ │ ├── drivelm │ │ │ └── evaluate.py │ │ ├── mme_rw │ │ │ └── evaluate.py │ │ ├── rs_det │ │ │ ├── caculate.py │ │ │ └── evaluate.py │ │ └── rs_vqa │ │ │ ├── evaluate.py │ │ │ └── score.py │ ├── llava_bench │ │ ├── README.md │ │ ├── eval_gpt_review_bench.py │ │ ├── evaluate_llava_bench.py │ │ ├── rule.json │ │ └── summarize_gpt_review.py │ ├── mantis_eval │ │ ├── README.md │ │ └── evaluate_mantis.py │ ├── mathvista │ │ ├── README.md │ │ ├── calculate_score.py │ │ ├── evaluate_mathvista.py │ │ ├── extract_answer.py │ │ ├── prompts │ │ │ └── ext_ans.py │ │ └── utilities.py │ ├── mirb │ │ ├── README.md │ │ └── evaluate_mirb.py │ ├── mmbench │ │ ├── README.md │ │ └── evaluate_mmbench.py │ ├── mme │ │ ├── README.md │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ ├── calculation.py │ │ └── eval.py │ ├── mmhal │ │ ├── README.md │ │ ├── eval_gpt_mmhal.py │ │ └── evaluate_mmhal.py │ ├── mmiu │ │ ├── README.md │ │ ├── evaluate_mmiu.py │ │ └── mmiu.jsonl │ ├── mmmu │ │ ├── README.md │ │ ├── answer_dict_val.json │ │ ├── data_utils.py │ │ ├── eval_utils.py │ │ ├── evaluate_mmmu.py │ │ └── main_eval_only.py │ ├── mmmu_pro │ │ ├── README.md │ │ ├── evaluate.py │ │ ├── evaluate_mmmu_pro.py │ │ └── prompts.yaml │ ├── mmvet │ │ ├── README.md │ │ └── 
evaluate_mmvet.py │ ├── mmvetv2 │ │ ├── README.md │ │ └── evaluate_mmvet_v2.py │ ├── mmvp │ │ ├── README.md │ │ └── evaluate_mmvp.py │ ├── mpdocvqa │ │ ├── README.md │ │ ├── evaluate_vqa.py │ │ └── infographicsvqa_eval.py │ ├── mvbench │ │ ├── README.md │ │ └── evaluate_mvbench.py │ ├── pope │ │ ├── README.md │ │ ├── eval_pope.py │ │ └── evaluate_pope.py │ ├── refcoco │ │ ├── README.md │ │ └── evaluate_grounding.py │ ├── scienceqa │ │ ├── README.md │ │ └── evaluate_scienceqa.py │ ├── seed │ │ ├── README.md │ │ ├── calculation.py │ │ └── evaluate_seed.py │ ├── tiny_lvlm │ │ ├── README.md │ │ ├── calculate_score.py │ │ ├── evaluate_lvlm.py │ │ └── tools.py │ └── vqa │ │ ├── README.md │ │ ├── convert_gqa_for_eval.py │ │ ├── evaluate_vqa.py │ │ ├── infographicsvqa_eval.py │ │ └── textvqa_eval.py ├── evaluate.sh ├── examples │ ├── image1.jpg │ ├── image2.jpg │ ├── image3.jpg │ ├── image4.jpg │ └── image5.jpg ├── internvl │ ├── conversation.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internlm2 │ │ │ ├── configuration_internlm2.py │ │ │ ├── modeling_internlm2.py │ │ │ ├── tokenization_internlm2.py │ │ │ └── tokenization_internlm2_fast.py │ │ ├── internvl_chat │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl_chat.py │ │ │ ├── modeling_intern_vit.py │ │ │ └── modeling_internvl_chat.py │ │ └── phi3 │ │ │ ├── configuration_phi3.py │ │ │ └── modeling_phi3.py │ ├── patch │ │ ├── __init__.py │ │ ├── internlm2_packed_training_patch.py │ │ ├── internvit_liger_monkey_patch.py │ │ ├── llama2_flash_attn_monkey_patch.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_packed_training_patch.py │ │ ├── llama_rmsnorm_monkey_patch.py │ │ ├── pad_data_collator.py │ │ ├── phi3_packed_training_patch.py │ │ ├── qwen2_packed_training_patch.py │ │ ├── train_dataloader_patch.py │ │ └── train_sampler_patch.py │ └── train │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── dataset.py │ │ ├── dataset_packed.py │ │ ├── 
internvl_chat_finetune.py │ │ ├── internvl_chat_mpo.py │ │ ├── internvl_chat_pretrain.py │ │ └── trainer_dpo.py ├── pyproject.toml ├── shell │ ├── data │ │ ├── coco_caption.json │ │ ├── internvl_1_2_finetune.json │ │ └── internvl_1_2_finetune_custom.json │ ├── internvl1.2 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh │ │ └── hermes2_yi34b │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh │ ├── internvl1.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── hermes2_yi34b │ │ │ ├── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh │ │ ├── internlm2_1_8b │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh │ │ ├── internlm2_20b │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh │ │ └── phi3_3_8b │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh │ ├── internvl2.0 │ │ └── 2nd_finetune │ │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── 
internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh │ │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh │ ├── internvl2.0_mpo │ │ ├── README.md │ │ └── preference_optimization │ │ │ └── internvl2_8b_internlm2_7b_dynamic_res_mpo_full.sh │ ├── internvl2.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl2_5_1b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_1b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_26b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_26b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco.sh │ │ │ ├── internvl2_5_38b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_38b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_4b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_4b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_78b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_78b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl2_5_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── stage1.5 │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5.sh │ │ ├── stage1 │ │ │ ├── 
internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1.sh │ │ └── stage2 │ │ │ ├── internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_stage2.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2.sh │ ├── internvl2.5_mpo │ │ └── preference_optimization │ │ │ ├── internvl2_5_1b_qwen2_5_0_5b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_mpo.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_mpo.sh │ ├── internvl3.0 │ │ ├── 2nd_finetune │ │ │ ├── internvl3_14b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_1b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_2b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_38b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_78b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl3_9b_dynamic_res_2nd_finetune_full.sh │ │ ├── mpo │ │ │ ├── internvl3_14b_mpo.sh │ │ │ ├── internvl3_1b_mpo.sh │ │ │ ├── internvl3_2b_mpo.sh │ │ │ ├── internvl3_38b_mpo.sh │ │ │ ├── internvl3_78b_mpo.sh │ │ │ ├── internvl3_8b_mpo.sh │ │ │ └── internvl3_9b_mpo.sh │ │ ├── 
mpo_data_construction │ │ │ ├── correctness_build_data.sh │ │ │ └── correctness_mmpr_8b.sh │ │ └── visualprm_data_construction │ │ │ ├── visualprm_build_data.sh │ │ │ └── visualprm_mmpr_8b.sh │ └── mini_internvl │ │ ├── README.md │ │ └── domain_adaptation │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_medical.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_remote.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_medical.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_remote.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_medical.sh │ │ └── internvl2_4b_phi3_3_8b_dynamic_res_finetune_remote.sh ├── tools │ ├── convert_to_int8.py │ ├── extract_mlp.py │ ├── extract_video_frames.py │ ├── extract_vit.py │ ├── images_stitching.py │ ├── json2jsonl.py │ ├── jsonl2jsonl.py │ ├── merge_lora.py │ ├── reasoning_data_pipeline │ │ ├── mmpr_data_pipeline_correctness.py │ │ ├── mmpr_data_pipeline_correctness_postprocess.py │ │ ├── mmpr_data_pipeline_dropout_ntp.py │ │ ├── utils │ │ │ ├── accuracy_reward.py │ │ │ ├── constants.py │ │ │ └── utils.py │ │ ├── visualprm_data_pieline.py │ │ └── visualprm_data_pipeline_postprocess.py │ ├── replace_llm.py │ └── resize_pos_embed.py ├── zero_stage1_config.json ├── zero_stage2_config.json ├── zero_stage3_config.json ├── zero_stage3_config_100b.json ├── zero_stage3_config_100b_1e7_offload.json ├── zero_stage3_config_100b_1e8.json ├── zero_stage3_config_34b.json └── zero_stage3_config_70b.json ├── internvl_chat_llava ├── LICENSE ├── README.md ├── docs │ ├── Customize_Component.md │ ├── Data.md │ ├── Evaluation.md │ ├── 
LLaVA_Bench.md │ ├── LLaVA_from_LLaMA2.md │ ├── LoRA.md │ ├── MODEL_ZOO.md │ └── ScienceQA.md ├── images │ ├── demo_cli.gif │ ├── llava_example_cmp.png │ ├── llava_logo.png │ └── llava_v1_5_radar.jpg ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ ├── table │ │ │ ├── answer │ │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ │ ├── answer_bard.jsonl │ │ │ │ ├── answer_gpt35.jsonl │ │ │ │ ├── answer_llama-13b.jsonl │ │ │ │ └── answer_vicuna-13b.jsonl │ │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ │ ├── model.jsonl │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ ├── results │ │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ │ ├── review │ │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ │ ├── reviewer.jsonl │ │ │ └── rule.json │ │ └── webpage │ │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── chatgpt.svg │ │ │ ├── llama.jpg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── vicuna.jpeg │ │ │ ├── index.html │ │ │ ├── script.js │ │ │ └── styles.css │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ 
│ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ ├── eva_clip │ │ │ │ ├── configuration_evaclip.py │ │ │ │ └── modeling_evaclip.py │ │ │ ├── intern_vit_6b │ │ │ │ ├── configuration_intern_vit.py │ │ │ │ ├── flash_attention.py │ │ │ │ └── modeling_intern_vit.py │ │ │ └── internvl_14b │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_intern_vit.py │ │ │ │ ├── configuration_internvl.py │ │ │ │ ├── flash_attention.py │ │ │ │ ├── modeling_intern_vit.py │ │ │ │ ├── modeling_internvl.py │ │ │ │ └── modeling_qllama.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ ├── img1.jpg │ │ │ ├── img2.jpg │ │ │ ├── img3.jpg │ │ │ ├── img4.jpg │ │ │ ├── img5.jpg │ │ │ ├── img6.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ └── test_message.py │ ├── train │ │ ├── dist_utils.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_custom.py │ │ ├── train_mem.py │ │ └── train_mem_custom.py │ └── utils.py ├── pyproject.toml ├── scripts │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── finetune.sh │ ├── finetune_full_schedule.sh │ ├── finetune_lora.sh │ ├── finetune_qlora.sh │ ├── finetune_sqa.sh │ ├── merge_lora_weights.py │ ├── pretrain.sh │ ├── sqa_eval_batch.sh │ ├── sqa_eval_gather.sh │ ├── 
v1_5 │ │ ├── eval │ │ │ ├── gqa.sh │ │ │ ├── llavabench.sh │ │ │ ├── mmbench.sh │ │ │ ├── mmbench_cn.sh │ │ │ ├── mme.sh │ │ │ ├── mmvet.sh │ │ │ ├── pope.sh │ │ │ ├── seed.sh │ │ │ ├── sqa.sh │ │ │ ├── textvqa.sh │ │ │ ├── vizwiz.sh │ │ │ └── vqav2.sh │ │ ├── finetune.sh │ │ └── pretrain.sh │ ├── zero1.json │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json └── scripts_internvl │ ├── eval │ ├── gqa.sh │ ├── llavabench.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh │ ├── finetune_internvit6b_224to336_vicuna13b.sh │ ├── finetune_internvit6b_224to336_vicuna13b_custom_data.sh │ ├── finetune_internvit6b_224to336_vicuna7b.sh │ ├── finetune_internvit6b_448_v1_2_vicuna13b.sh │ ├── finetune_internvit6b_448_v1_5_vicuna13b.sh │ ├── finetune_internvit6b_448_vicuna13b.sh │ ├── finetune_internvit6b_448_vicuna7b.sh │ ├── meta │ └── custom_data.json │ ├── pretrain_internvit6b_224to336_vicuna13b.sh │ ├── pretrain_internvit6b_224to336_vicuna7b.sh │ ├── pretrain_internvit6b_448_v1_2_vicuna13b.sh │ ├── pretrain_internvit6b_448_v1_5_vicuna13b.sh │ ├── pretrain_internvit6b_448_vicuna13b.sh │ └── pretrain_internvit6b_448_vicuna7b.sh ├── internvl_g ├── README.md ├── eval │ └── evaluate_caption.py ├── evaluate.sh ├── internvl │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internvl_stage2 │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ │ └── internvl_stage2_retrieval │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ └── train │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── internvl_stage2_finetune.py │ │ └── trainer_monkey_patch.py ├── shell │ ├── finetune │ │ 
├── internvl_stage2_finetune_coco_364_bs1024_ep5.sh │ │ ├── internvl_stage2_finetune_flickr_364_bs1024_ep10.sh │ │ └── internvl_stage2_finetune_flickrcn_364_bs1024_ep10.sh │ ├── head_finetune │ │ ├── internvl_stage2_finetune_coco_224_bs1024_ep5_head_4gpu.sh │ │ ├── internvl_stage2_finetune_flickr_224_bs1024_ep10_head_4gpu.sh │ │ └── internvl_stage2_finetune_flickrcn_224_bs1024_ep10_head_4gpu.sh │ └── lora_finetune │ │ ├── internvl_stage2_finetune_coco_224_bs1024_ep5_lora16_4gpu.sh │ │ ├── internvl_stage2_finetune_flickr_224_bs1024_ep10_lora16_4gpu.sh │ │ └── internvl_stage2_finetune_flickrcn_224_bs1024_ep10_lora16_4gpu.sh ├── zero_stage1_config.json ├── zero_stage2_config.json └── zero_stage3_config.json ├── requirements.txt ├── requirements ├── classification.txt ├── clip_benchmark.txt ├── internvl_chat.txt ├── segmentation.txt └── streamlit_demo.txt ├── segmentation ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── ade20k.py │ │ │ ├── ade20k_504x504.py │ │ │ ├── ade20k_504x504_1of16.py │ │ │ ├── ade20k_504x504_1of2.py │ │ │ ├── ade20k_504x504_1of4.py │ │ │ ├── ade20k_504x504_1of8.py │ │ │ ├── ade20k_640x640.py │ │ │ ├── ade20k_896x896.py │ │ │ ├── chase_db1.py │ │ │ ├── cityscapes.py │ │ │ ├── cityscapes_1024x1024.py │ │ │ ├── cityscapes_768x768.py │ │ │ ├── cityscapes_769x769.py │ │ │ ├── cityscapes_832x832.py │ │ │ ├── coco-stuff10k.py │ │ │ ├── coco-stuff164k.py │ │ │ ├── coco-stuff164k_896x896.py │ │ │ ├── drive.py │ │ │ ├── hrf.py │ │ │ ├── isaid.py │ │ │ ├── loveda.py │ │ │ ├── pascal_context.py │ │ │ ├── pascal_context_59.py │ │ │ ├── pascal_voc12.py │ │ │ ├── pascal_voc12_aug.py │ │ │ ├── potsdam.py │ │ │ ├── stare.py │ │ │ └── vaihingen.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── ann_r50-d8.py │ │ │ ├── apcnet_r50-d8.py │ │ │ ├── bisenetv1_r18-d32.py │ │ │ ├── bisenetv2.py │ │ │ ├── ccnet_r50-d8.py │ │ │ ├── cgnet.py │ │ │ ├── danet_r50-d8.py │ │ │ ├── deeplabv3_r50-d8.py │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ ├── 
deeplabv3plus_r50-d8.py │ │ │ ├── dmnet_r50-d8.py │ │ │ ├── dnl_r50-d8.py │ │ │ ├── dpt_vit-b16.py │ │ │ ├── emanet_r50-d8.py │ │ │ ├── encnet_r50-d8.py │ │ │ ├── erfnet_fcn.py │ │ │ ├── fast_scnn.py │ │ │ ├── fastfcn_r50-d32_jpu_psp.py │ │ │ ├── fcn_hr18.py │ │ │ ├── fcn_r50-d8.py │ │ │ ├── fcn_unet_s5-d16.py │ │ │ ├── fpn_r50.py │ │ │ ├── gcnet_r50-d8.py │ │ │ ├── icnet_r50-d8.py │ │ │ ├── isanet_r50-d8.py │ │ │ ├── lraspp_m-v3-d8.py │ │ │ ├── mask2former_beit.py │ │ │ ├── nonlocal_r50-d8.py │ │ │ ├── ocrnet_hr18.py │ │ │ ├── ocrnet_r50-d8.py │ │ │ ├── pointrend_r50.py │ │ │ ├── psanet_r50-d8.py │ │ │ ├── pspnet_r50-d8.py │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ ├── segformer_mit-b0.py │ │ │ ├── segmenter_vit-b16_mask.py │ │ │ ├── setr_mla.py │ │ │ ├── setr_naive.py │ │ │ ├── setr_pup.py │ │ │ ├── stdc.py │ │ │ ├── twins_pcpvt-s_fpn.py │ │ │ ├── twins_pcpvt-s_upernet.py │ │ │ ├── upernet_beit.py │ │ │ ├── upernet_convnext.py │ │ │ ├── upernet_mae.py │ │ │ ├── upernet_r50.py │ │ │ ├── upernet_swin.py │ │ │ └── upernet_vit-b16_ln_mln.py │ │ └── schedules │ │ │ ├── schedule_10k.py │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_320k.py │ │ │ ├── schedule_40k.py │ │ │ ├── schedule_5k.py │ │ │ └── schedule_80k.py │ └── intern_vit_6b │ │ ├── few_shot │ │ ├── linear_intern_vit_6b_504_10k_ade20k_bs16_lr4e-5_1of8.py │ │ ├── linear_intern_vit_6b_504_20k_ade20k_bs16_lr4e-5_1of4.py │ │ ├── linear_intern_vit_6b_504_40k_ade20k_bs16_lr4e-5_1of2.py │ │ ├── linear_intern_vit_6b_504_5k_ade20k_bs16_lr4e-5_1of16.py │ │ └── linear_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_1of1.py │ │ ├── full_tuning │ │ └── upernet_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5.py │ │ ├── head_tuning │ │ └── upernet_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_frozen.py │ │ └── linear_probing │ │ └── linear_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_frozen.py ├── dist_test.sh ├── dist_train.sh ├── mmcv_custom │ ├── __init__.py │ ├── ddp_hooks.py │ └── layer_decay_optimizer_constructor.py ├── 
mmseg_custom │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ade.py │ │ └── pipelines │ │ │ ├── __init__.py │ │ │ └── transform.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ ├── flash_attention.py │ │ └── intern_vit_6b.py │ │ └── decode_heads │ │ ├── __init__.py │ │ └── fcn_head.py ├── release.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── train.py └── zero_configs │ ├── adam_fp16.json │ ├── adam_zero1_amp.json │ ├── adam_zero1_bf16.json │ ├── adam_zero1_fp16.json │ ├── adam_zero2_bf16.json │ ├── adam_zero2_fp16.json │ └── adam_zero3_fp16.json ├── streamlit_demo ├── .streamlit │ └── config.toml ├── api.py ├── app.py ├── constants.py ├── controller.py ├── gallery │ ├── astro_on_unicorn.png │ ├── cheetah.png │ ├── prod_1.jpeg │ ├── prod_11.jpg │ ├── prod_12.png │ ├── prod_4.png │ ├── prod_9.jpg │ └── prod_en_17.png ├── library.py ├── model_worker.py ├── sd_worker.py ├── static │ └── SimHei.ttf └── utils.py └── video_retrieval └── test_msrvtt.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F403, C901, W504, W605, E251, E122, E126, E127, E722, W503, E128, E741, E731, E701 3 | select = E1, E3, E502, E7, E9, W1, W5, W6 4 | max-line-length = 180 5 | exclude=*.egg/*,build,dist,detection/configs/* 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[Feature] " 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | We strongly appreciate you creating a PR to implement this feature [here](https://github.com/OpenGVLab/InternVL/pulls)! 10 | If you need our help, please fill in as much of the following form as you're able to. 
11 | 12 | **The less clear the description, the longer it will take to solve it.** 13 | - type: textarea 14 | attributes: 15 | label: Motivation 16 | description: | 17 | A clear and concise description of the motivation of the feature. 18 | Ex1. It is inconvenient when \[....\]. 19 | validations: 20 | required: true 21 | - type: textarea 22 | attributes: 23 | label: Related resources 24 | description: | 25 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 26 | - type: textarea 27 | attributes: 28 | label: Additional context 29 | description: | 30 | Add any other context or screenshots about the feature request here. 31 | If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to the documentation. 3 | labels: "kind/doc,status/unconfirmed" 4 | title: "[Docs] " 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of the issue. 12 | validations: 13 | required: true 14 | 15 | - type: textarea 16 | attributes: 17 | label: Suggest a potential alternative/fix 18 | description: > 19 | Tell us how we could improve the documentation in this regard. 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! 
24 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 180 3 | multi_line_output = 0 4 | extra_standard_library = setuptools 5 | known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml 6 | no_lines_before = STDLIB,LOCALFOLDER 7 | default_section = THIRDPARTY 8 | 9 | [yapf] 10 | BASED_ON_STYLE = pep8 11 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 12 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 13 | 14 | [codespell] 15 | skip = *.ipynb 16 | quiet-level = 3 17 | ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood 18 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^internvl_chat_llava/ 2 | repos: 3 | - repo: https://github.com/PyCQA/flake8 4 | rev: 5.0.4 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v4.3.0 13 | hooks: 14 | - id: trailing-whitespace 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: requirements-txt-fixer 18 | - id: double-quote-string-fixer 19 | - id: check-merge-conflict 20 | - id: fix-encoding-pragma 21 | args: ["--remove"] 22 | - id: mixed-line-ending 23 | args: ["--fix=lf"] 24 | - repo: https://github.com/executablebooks/mdformat 25 | rev: 0.7.9 26 | hooks: 27 | - id: mdformat 28 | args: ["--number"] 29 | additional_dependencies: 30 | - mdformat-openmmlab 31 | - 
mdformat_frontmatter 32 | - linkify-it-py 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenGVLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'attention_pooling' 23 | TRAIN: 24 | EMA: 25 | ENABLE: True 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: 
True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | 
NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 
'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- 
/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: 
True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | 
PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # 
single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | 
-------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | 
TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | 
INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: 
'./data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- 
/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | 
EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | 
PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 
16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | 
-------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | 
EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'cls_patch_concat' 23 | TRAIN: 24 | EMA: 25 | ENABLE: False 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 
11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | 
DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 
2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'cls_patch_concat' 23 | TRAIN: 24 | EMA: 25 | ENABLE: True 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- 
/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | 
WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | 
DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | 
PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | 
PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: 
false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 
| DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | 
LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 
23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: 
intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_loader, build_loader2 8 | -------------------------------------------------------------------------------- /classification/meta_data/train.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/classification/meta_data/train.txt.zip -------------------------------------------------------------------------------- /classification/meta_data/val.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/classification/meta_data/val.txt.zip -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 
| # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_model 8 | -------------------------------------------------------------------------------- /classification/train_in1k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | 13 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 14 | srun -p ${PARTITION} \ 15 | --job-name=${JOB_NAME} \ 16 | --gres=gpu:${GPUS_PER_NODE} \ 17 | --ntasks=${GPUS} \ 18 | --ntasks-per-node=${GPUS_PER_NODE} \ 19 | --cpus-per-task=${CPUS_PER_TASK} \ 20 | --kill-on-bad-exit=1 \ 21 | --quotatype=reserved \ 22 | ${SRUN_ARGS} \ 23 | python -u main.py \ 24 | --cfg ${CONFIG} \ 25 | --accumulation-steps 1 \ 26 | --local-rank 0 \ 27 | --output work_dirs ${@:4} 28 | -------------------------------------------------------------------------------- /clip_benchmark/AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | * `Mehdi Cherti `_ 6 | * `Romain Beaumont `_ 7 | -------------------------------------------------------------------------------- /clip_benchmark/HISTORY.rst: -------------------------------------------------------------------------------- 1 | ## History 2 | 3 | ### 1.4.0 4 | 5 | * Fix silent webdataset error-handling 6 | * Added support for wds/voc2007_multilabel 7 | * default to float32 8 | * add mscoco generative benchmark 9 | 10 | ### 1.3.0 11 | 12 | * update flickr8k results, solve issue #48, thanks to @orchidmajumder 13 | * Evaluate multiple models/datasets/languages using the CLI directly 14 
| * Support Japanese CLIP by rinna 15 | * Add arabic imagenet 16 | * updating CuPL prompts with more generated sentences + ensembled with openAI prompts 17 | * put model in eval mode before evaluation 18 | * Webdataset updates 19 | * Make verbose the default 20 | 21 | ### 1.2.0 22 | 23 | * Added support for loading webdatasets 24 | 25 | ### 1.1.0 26 | 27 | * Added better support for multilingual eval 28 | * Added better support for linear probing 29 | * Added support for CuPL prompts 30 | 31 | ### 1.0.1 32 | 33 | * pypi description as markdown 34 | 35 | ### 1.0.0 36 | 37 | * Actual first release on PyPI. 38 | 39 | 40 | ### 0.1.0 41 | 42 | * First release on PyPI. 43 | -------------------------------------------------------------------------------- /clip_benchmark/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022, Mehdi Cherti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /clip_benchmark/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include * *.json 12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 13 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/benchmark.png -------------------------------------------------------------------------------- /clip_benchmark/benchmark/dataset_type.csv: -------------------------------------------------------------------------------- 1 | dataset,type 2 | imagenet1k,natural 3 | imagenetv2,natural 4 | imagenet-r,natural 5 | imagenet_sketch,specialized 6 | objectnet,natural 7 | imagenet-a,natural 8 | imagenet-o,natural 9 | vtab/cifar10,natural 10 | vtab/cifar100,natural 11 | mnist,specialized 12 | vtab/flowers,natural 13 | cars,natural 14 | vtab/svhn,natural 15 | fer2013,natural 16 | renderedsst2,specialized 17 | vtab/pets,natural 18 | vtab/caltech101,natural 19 | voc2007_multilabel,natural 20 | voc2007,natural 21 | sun397,natural 22 | fgvc_aircraft,natural 23 | country211,natural 24 | vtab/dtd,natural 25 | gtsrb,natural 26 | stl10,natural 27 | vtab/diabetic_retinopathy,specialized 
28 | vtab/eurosat,specialized 29 | vtab/resisc45,specialized 30 | vtab/pcam,specialized 31 | vtab/clevr_count_all,structured 32 | vtab/clevr_closest_object_distance,structured 33 | vtab/dsprites_label_orientation,structured 34 | vtab/dsprites_label_x_position,structured 35 | vtab/dsprites_label_y_position,structured 36 | vtab/smallnorb_label_elevation,structured 37 | vtab/smallnorb_label_azimuth,structured 38 | vtab/dmlab,structured 39 | vtab/kitti_closest_vehicle_distance,structured 40 | mscoco_captions,retrieval 41 | flickr8k,retrieval 42 | flickr30k,retrieval 43 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/datasets.txt: -------------------------------------------------------------------------------- 1 | mscoco_captions 2 | flickr8k 3 | flickr30k 4 | imagenet1k 5 | imagenetv2 6 | imagenet_sketch 7 | imagenet-a 8 | imagenet-r 9 | objectnet 10 | fer2013 11 | voc2007 12 | voc2007_multilabel 13 | sun397 14 | cars 15 | fgvc_aircraft 16 | mnist 17 | stl10 18 | gtsrb 19 | country211 20 | renderedsst2 21 | vtab/caltech101 22 | vtab/cifar10 23 | vtab/cifar100 24 | vtab/clevr_count_all 25 | vtab/clevr_closest_object_distance 26 | vtab/diabetic_retinopathy 27 | vtab/dmlab 28 | vtab/dsprites_label_orientation 29 | vtab/dsprites_label_x_position 30 | vtab/dtd 31 | vtab/eurosat 32 | vtab/kitti_closest_vehicle_distance 33 | vtab/flowers 34 | vtab/pets 35 | vtab/pcam 36 | vtab/resisc45 37 | vtab/smallnorb_label_azimuth 38 | vtab/smallnorb_label_elevation 39 | vtab/svhn 40 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/datasets_multilingual.txt: -------------------------------------------------------------------------------- 1 | multilingual_mscoco_captions,es 2 | multilingual_mscoco_captions,it 3 | multilingual_mscoco_captions,ko 4 | multilingual_mscoco_captions,pl 5 | multilingual_mscoco_captions,ru 6 | multilingual_mscoco_captions,tr 7 | 
multilingual_mscoco_captions,zh 8 | multilingual_mscoco_captions,en 9 | imagenet1k,zh 10 | imagenet1k,it 11 | imagenet1k,jp 12 | imagenet1k,en 13 | imagenet1k,ar 14 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/models.txt: -------------------------------------------------------------------------------- 1 | ViT-B-32,openai 2 | ViT-B-16,openai 3 | ViT-L-14,openai 4 | ViT-L-14-336,openai 5 | ViT-B-32-quickgelu,laion400m_e32 6 | ViT-B-32,laion2b_e16 7 | ViT-B-32,laion2b_s34b_b79k 8 | ViT-B-16,laion400m_e32 9 | ViT-B-16-plus-240,laion400m_e32 10 | ViT-L-14,laion400m_e32 11 | ViT-L-14,laion2b_s32b_b82k 12 | ViT-H-14,laion2b_s32b_b79k 13 | ViT-g-14,laion2b_s12b_b42k 14 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/webdatasets.txt: -------------------------------------------------------------------------------- 1 | wds/mscoco_captions 2 | wds/flickr8k 3 | wds/flickr30k 4 | wds/imagenet1k 5 | wds/imagenetv2 6 | wds/imagenet_sketch 7 | wds/imagenet-a 8 | wds/imagenet-r 9 | wds/imagenet-o 10 | wds/objectnet 11 | wds/fer2013 12 | wds/voc2007 13 | wds/voc2007_multilabel 14 | wds/sun397 15 | wds/cars 16 | wds/fgvc_aircraft 17 | wds/mnist 18 | wds/stl10 19 | wds/gtsrb 20 | wds/country211 21 | wds/renderedsst2 22 | wds/vtab/caltech101 23 | wds/vtab/cifar10 24 | wds/vtab/cifar100 25 | wds/vtab/clevr_count_all 26 | wds/vtab/clevr_closest_object_distance 27 | wds/vtab/diabetic_retinopathy 28 | wds/vtab/dmlab 29 | wds/vtab/dsprites_label_orientation 30 | wds/vtab/dsprites_label_x_position 31 | wds/vtab/dsprites_label_y_position 32 | wds/vtab/dtd 33 | wds/vtab/eurosat 34 | wds/vtab/kitti_closest_vehicle_distance 35 | wds/vtab/flowers 36 | wds/vtab/pets 37 | wds/vtab/pcam 38 | wds/vtab/resisc45 39 | wds/vtab/smallnorb_label_azimuth 40 | wds/vtab/smallnorb_label_elevation 41 | wds/vtab/svhn 42 | 
-------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for CLIP Benchmark.""" 2 | 3 | __author__ = """Mehdi Cherti""" 4 | __email__ = 'mehdicherti@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/datasets/__init__.py -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/datasets/tools.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def process_single_caption(caption, max_words=50): 5 | caption = re.sub(r"([.!\"()*#:;~])", ' ', caption.lower()) 6 | caption = re.sub(r'\s{2,}', ' ', caption) 7 | caption = caption.rstrip('\n') 8 | caption = caption.strip(' ') 9 | 10 | # truncate caption 11 | caption_words = caption.split(' ') 12 | if len(caption_words) > max_words: 13 | caption = ' '.join(caption_words[: max_words]) 14 | return caption 15 | 16 | 17 | def pre_caption(caption, max_words=50): 18 | if type(caption) == str: 19 | caption = process_single_caption(caption, max_words) 20 | else: 21 | caption = [process_single_caption(c, max_words) for c in caption] 22 | return caption 23 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/metrics/__init__.py 
-------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/model_collection.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | 3 | 4 | def get_model_collection_from_file(path): 5 | return [l.strip().split(',') for l in open(path).readlines()] 6 | 7 | 8 | model_collection = { 9 | 'openclip_base': [ 10 | ('ViT-B-32-quickgelu', 'laion400m_e32'), 11 | ('ViT-B-32', 'laion2b_e16'), 12 | ('ViT-B-32', 'laion2b_s34b_b79k'), 13 | ('ViT-B-16', 'laion400m_e32'), 14 | ('ViT-B-16-plus-240', 'laion400m_e32'), 15 | ('ViT-L-14', 'laion400m_e32'), 16 | ('ViT-L-14', 'laion2b_s32b_b82k'), 17 | ('ViT-H-14', 'laion2b_s32b_b79k'), 18 | ('ViT-g-14', 'laion2b_s12b_b42k'), 19 | ], 20 | 'openclip_multilingual': [ 21 | ('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), 22 | ('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), 23 | ], 24 | 'openclip_all': open_clip.list_pretrained(), 25 | 'openai': [ 26 | ('ViT-B-32', 'openai'), 27 | ('ViT-B-16', 'openai'), 28 | ('ViT-L-14', 'openai'), 29 | ('ViT-L-14-336', 'openai'), 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | 5 | from .internvl import load_internvl 6 | from .japanese_clip import load_japanese_clip 7 | from .open_clip import load_open_clip 8 | 9 | # loading function must return (model, transform, tokenizer) 10 | TYPE2FUNC = { 11 | 'open_clip': load_open_clip, 12 | 'ja_clip': load_japanese_clip, 13 | 'internvl': load_internvl, 14 | } 15 | MODEL_TYPES = list(TYPE2FUNC.keys()) 16 | 17 | 18 | def load_clip( 19 | model_type: str, 20 | model_name: str, 21 | pretrained: str, 22 | cache_dir: str, 23 | device: Union[str, torch.device] = 'cuda' 24 | ): 25 | assert model_type in MODEL_TYPES, 
f'model_type={model_type} is invalid!' 26 | load_func = TYPE2FUNC[model_type] 27 | return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device) 28 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .internvl_c_pytorch import load_internvl_c_pytorch 8 | from .internvl_huggingface import (load_internvl_c_huggingface, 9 | load_internvl_g_huggingface) 10 | 11 | 12 | def load_internvl(model_name, pretrained, cache_dir, device): 13 | if model_name == 'internvl_c_classification': 14 | return load_internvl_c_pytorch(pretrained, device, 'classification') 15 | elif model_name == 'internvl_c_retrieval': 16 | return load_internvl_c_pytorch(pretrained, device, 'retrieval') 17 | elif model_name == 'internvl_c_classification_hf': 18 | return load_internvl_c_huggingface(pretrained, device, 'classification') 19 | elif model_name == 'internvl_c_retrieval_hf': 20 | return load_internvl_c_huggingface(pretrained, device, 'retrieval') 21 | elif model_name == 'internvl_g_classification_hf': 22 | return load_internvl_g_huggingface(pretrained, device, 'classification') 23 | elif model_name == 'internvl_g_retrieval_hf': 24 | return load_internvl_g_huggingface(pretrained, device, 'retrieval') 25 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "LlamaForCausalLM" 4 | ], 5 | "bos_token_id": 1, 6 | 
"eos_token_id": 2, 7 | "hidden_act": "silu", 8 | "hidden_size": 4096, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 11008, 11 | "max_position_embeddings": 2048, 12 | "max_sequence_length": 2048, 13 | "model_type": "llama", 14 | "num_attention_heads": 32, 15 | "num_hidden_layers": 32, 16 | "pad_token_id": 0, 17 | "rms_norm_eps": 1e-06, 18 | "tie_word_embeddings": false, 19 | "torch_dtype": "float16", 20 | "transformers_version": "4.28.0.dev0", 21 | "use_cache": true, 22 | "vocab_size": 49954 23 | } 24 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "bos_token_id": 1, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.28.0.dev0" 7 | } 8 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "", 3 | "eos_token": "", 4 | "pad_token": "[PAD]", 5 | "unk_token": "" 6 | } 7 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer.model -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": true, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "clean_up_tokenization_spaces": false, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "model_max_length": 1000000000000000019884624838656, 22 | "pad_token": null, 23 | "sp_model_kwargs": {}, 24 | "special_tokens_map_file": "chinese_alpaca_lora_7b/special_tokens_map.json", 25 | "tokenizer_class": "LlamaTokenizer", 26 | "unk_token": { 27 | "__type": "AddedToken", 28 | "content": "", 29 | "lstrip": false, 30 | "normalized": true, 31 | "rstrip": false, 32 | "single_word": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/open_clip.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | 3 | 4 | def load_open_clip(model_name: str = 'ViT-B-32-quickgelu', pretrained: str = 'laion400m_e32', cache_dir: str = None, 5 | device='cpu'): 6 | model, _, transform = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, cache_dir=cache_dir) 7 | model = model.to(device) 8 | tokenizer = open_clip.get_tokenizer(model_name) 9 | return model, transform, tokenizer 10 | -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/PROBES.md: -------------------------------------------------------------------------------- 1 | Steps to run. 2 | 3 | 1. Navigate to `CLIP_benchmark`. 4 | 2. Run `export PYTHONPATH=$PWD`. 5 | 3. (Optional) To re-run the experiments, run `python probe_benchmark/scaling_experiments.py`. 
You'll have to change line 6 | 51 to point to your data. 7 | 4. (Optional) To generate the results, run `python probe_benchmark/build_df_scaling_experiments.py`. 8 | 5. (Optional) VTAB requires post-processing to average. Run `python probe_benchmark/process_vtab.py`. 9 | 6. Generate plots with `python probe_benchmark/scaling_plot.py`. 10 | 7. Generate table with `python probe_benchmark/generate_table.py`. 11 | -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/gmacs_vs_perf_retrieval.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/gmacs_vs_perf_retrieval.pdf -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/imagenet_cifar_lp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/imagenet_cifar_lp.pdf -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/imagenet_cifar_lp_vtab.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/imagenet_cifar_lp_vtab.pdf -------------------------------------------------------------------------------- /clip_benchmark/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /clip_benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | open_clip_torch>=0.2.1 2 | opencv-python 3 | peft>=0.6.2
4 | protobuf==3.20.3 5 | pycocoevalcap 6 | pyyaml 7 | scikit-learn>=1.0,<2 8 | scikit-learn 9 | scipy 10 | task_adaptation 11 | tensorflow==2.11.0 12 | termcolor 13 | tqdm>=2 14 | transformers>=4.32.0 15 | webdataset>=0.2.31 16 | yacs 17 | -------------------------------------------------------------------------------- /clip_benchmark/setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:clip_benchmark/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | -------------------------------------------------------------------------------- /clip_benchmark/tests/test_clip_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Tests for `clip_benchmark` package.""" 4 | 5 | import os 6 | 7 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 8 | from clip_benchmark.cli import run 9 | 10 | 11 | class base_args: 12 | dataset = 'dummy' 13 | split = 'test' 14 | model = 'ViT-B-32-quickgelu' 15 | pretrained = 'laion400m_e32' 16 | task = 'zeroshot_classification' 17 | amp = False 18 | num_workers = 4 19 | batch_size = 64 20 | dataset_root = 'root' 21 | output = 'result.json' 22 | verbose = True 23 | root = 'root' 24 | annotation_file = '' 25 | seed = 0 26 | skip_load = False 27 | language = 'en' 28 | model_cache_dir = None 29 | cupl = False 30 | save_clf = None 31 | load_clfs = [] 32 | model_type = 'open_clip' 33 | wds_cache_dir = None 34 | which = 'eval' 35 | skip_existing = False 36 | 37 | 38 | def test_base(): 39 | run(base_args) 40 | -------------------------------------------------------------------------------- 
/clip_benchmark/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, flake8 3 | 4 | [travis] 5 | python = 6 | 3.8: py38 7 | 3.7: py37 8 | 3.6: py36 9 | 10 | [testenv:flake8] 11 | basepython = python 12 | deps = flake8 13 | commands = flake8 clip_benchmark tests 14 | 15 | [testenv] 16 | setenv = 17 | PYTHONPATH = {toxinidir} 18 | 19 | commands = python setup.py test 20 | -------------------------------------------------------------------------------- /internvl_chat/eval/mmmu_pro/prompts.yaml: -------------------------------------------------------------------------------- 1 | cot: 2 | vision: "Write out the multiple-choice question in the image and then solve it. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options. Think step by step before answering." 3 | standard: "Answer the preceding multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options. Think step by step before answering." 4 | direct: 5 | vision: "Answer with the option letter from the given choices directly. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options." 6 | standard: "Answer with the option letter from the given choices directly." 
7 | -------------------------------------------------------------------------------- /internvl_chat/eval/vqa/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--src', type=str) 6 | parser.add_argument('--dst', type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | data = json.load(open(args.src)) 11 | for res in data: 12 | question_id = res['questionId'] 13 | answer = res['answer'].rstrip('.').lower() 14 | all_answers.append({'questionId': question_id, 'prediction': answer}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | -------------------------------------------------------------------------------- /internvl_chat/examples/image1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image1.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image2.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image3.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image4.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image5.jpg -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .configuration_intern_vit import InternVisionConfig 8 | from .configuration_internvl_chat import InternVLChatConfig 9 | from .modeling_intern_vit import InternVisionModel 10 | from .modeling_internvl_chat import InternVLChatModel 11 | 12 | __all__ = ['InternVisionConfig', 'InternVisionModel', 13 | 'InternVLChatConfig', 'InternVLChatModel'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/internvit_liger_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | def apply_liger_kernel_to_internvit() -> None: 8 | from internvl.model.internvl_chat import modeling_intern_vit 9 | from liger_kernel.transformers.layer_norm import LigerLayerNorm 10 | from liger_kernel.transformers.rms_norm import LigerRMSNorm 11 | 
modeling_intern_vit.NORM2FN['rms_norm'] = LigerRMSNorm 12 | modeling_intern_vit.NORM2FN['layer_norm'] = LigerLayerNorm 13 | print('Liger kernel applied to InternViT') 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import transformers 8 | 9 | 10 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 11 | try: 12 | from functools import partial 13 | 14 | from apex.normalization import FusedRMSNorm 15 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 16 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 17 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 18 | except ImportError: 19 | # using the normal LlamaRMSNorm 20 | pass 21 | except Exception: 22 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 23 | pass 24 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 
OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | IMG_CONTEXT_TOKEN = '' 8 | IMG_START_TOKEN = '' 9 | IMG_END_TOKEN = '' 10 | QUAD_START_TOKEN = '' 11 | QUAD_END_TOKEN = '' 12 | REF_START_TOKEN = '' 13 | REF_END_TOKEN = '' 14 | BOX_START_TOKEN = '' 15 | BOX_END_TOKEN = '' 16 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 17 | IMAGENET_STD = (0.229, 0.224, 0.225) 18 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 19 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 20 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 21 | SIGLIP_STD = (0.5, 0.5, 0.5) 22 | -------------------------------------------------------------------------------- /internvl_chat/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "internvl_chat" 7 | version = "2.0.0" 8 | description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch>=2", "torchvision>=0.15", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate", "peft>=0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn>=1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "deepspeed==0.13.5", "einops", "einops-exts", "timm==0.9.12", 23 | ] 24 | 25 | [project.urls] 26 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 27 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 28 | 29 | [tool.setuptools.packages.find] 30 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 31 | 32 | [tool.wheel] 33 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 34 | -------------------------------------------------------------------------------- /internvl_chat/shell/data/coco_caption.json: -------------------------------------------------------------------------------- 1 | { 2 | "coco_karpathy_train_567k": { 3 | "root": "data/coco/", 4 | "annotation": "data/coco/annotations/coco_karpathy_train_567k.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 566747 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl3.0/mpo_data_construction/correctness_build_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 4 | 5 | PROMPT_VERSION="en_v2" 6 | data_dir="outputs_mpo/correctness_mmpr_v1_2_${PROMPT_VERSION}" 7 | save_dir="outputs_mpo/correctness_mmpr_v1_2_${PROMPT_VERSION}_pairs" 8 | 9 | model="OpenGVLab_InternVL3-8B" 
10 | 11 | declare -a max_tiles=( \ 12 | "1" \ 13 | "6" \ 14 | "12" \ 15 | "18" \ 16 | "24" \ 17 | ) 18 | 19 | for ((j=0; j<${#max_tiles[@]}; j++)); do 20 | curr_max_tiles=${max_tiles[j]} 21 | echo "$(date) ${model} ${curr_max_tiles}" 22 | 23 | srun \ 24 | -p Intern5 \ 25 | --gres=gpu:0 \ 26 | python -u tools/reasoning_data_pipeline/mmpr_data_pipeline_correctness_postprocess.py \ 27 | --data-dir "${data_dir}/${model}/max_tiles_${curr_max_tiles}" \ 28 | --save-dir "${save_dir}/${model}" \ 29 | --answer-fix \ 30 | --force \ 31 | --num-pairs-per-key 15 \ 32 | --max-lines 1200000 33 | 34 | done 35 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl3.0/visualprm_data_construction/visualprm_build_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 4 | 5 | PROMPT_VERSION="en_v2" 6 | data_dir="outputs_prm/visualprm_v1_1_${PROMPT_VERSION}_raw" 7 | save_dir="outputs_prm/visualprm_v1_1_${PROMPT_VERSION}_conv" 8 | 9 | model="OpenGVLab_InternVL3-8B" 10 | 11 | declare -a max_tiles=( \ 12 | "1" \ 13 | "6" \ 14 | "12" \ 15 | "18" \ 16 | "24" \ 17 | ) 18 | 19 | for ((j=0; j<${#max_tiles[@]}; j++)); do 20 | curr_max_tiles=${max_tiles[j]} 21 | echo "$(date) ${model} ${curr_max_tiles}" 22 | 23 | srun \ 24 | -p Intern5 \ 25 | --gres=gpu:0 \ 26 | python -u tools/reasoning_data_pipeline/visualprm_data_pipeline_postprocess.py \ 27 | --data-dir "${data_dir}/${model}/max_tiles_${curr_max_tiles}" \ 28 | --save-dir "${save_dir}/${model}" \ 29 | --mc-threshold 0.0 30 | 31 | done 32 | -------------------------------------------------------------------------------- /internvl_chat/tools/convert_to_int8.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModel, AutoTokenizer 3 | 4 | path = 'OpenGVLab/InternVL-Chat-V1-5' 5 | model = 
AutoModel.from_pretrained( 6 | path, 7 | torch_dtype=torch.bfloat16, 8 | low_cpu_mem_usage=True, 9 | trust_remote_code=True, 10 | load_in_8bit=True).eval() 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 13 | 14 | model.save_pretrained('release/InternVL-Chat-V1-5-Int8') 15 | tokenizer.save_pretrained('release/InternVL-Chat-V1-5-Int8') 16 | print('finished') 17 | -------------------------------------------------------------------------------- /internvl_chat/tools/extract_mlp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import torch 5 | from internvl.model.internvl_chat import InternVLChatModel 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('output_path', type=str, default='') 10 | 11 | args = argparse.parse_args() 12 | 13 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 14 | model = model.mlp1.to(torch.bfloat16) 15 | 16 | ckpt = model.state_dict() 17 | output_path = os.path.join(args.output_path, 'mlp_projector.pth') 18 | torch.save(ckpt, output_path) 19 | print('finished') 20 | -------------------------------------------------------------------------------- /internvl_chat/tools/extract_vit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | 6 | argparse = argparse.ArgumentParser() 7 | argparse.add_argument('model_path', type=str, default='') 8 | argparse.add_argument('output_path', type=str, default='') 9 | 10 | args = argparse.parse_args() 11 | 12 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 13 | model = model.vision_model.to(torch.bfloat16) 14 | 15 | model.save_pretrained(args.output_path) 16 | print('finished') 17 | 
-------------------------------------------------------------------------------- /internvl_chat/tools/json2jsonl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | argparse = argparse.ArgumentParser() 5 | argparse.add_argument('path', type=str) 6 | 7 | args = argparse.parse_args() 8 | 9 | assert args.path.endswith('.json') 10 | 11 | data = json.load(open(args.path)) 12 | writer = open(args.path.replace('.json', '.jsonl'), 'w') 13 | for idx, item in enumerate(data): 14 | conversations = item['conversations'] 15 | if conversations[0]['from'] == 'system': 16 | item['conversations'] = item['conversations'][1:] 17 | item['id'] = idx 18 | writer.write(json.dumps(item, ensure_ascii=False) + '\n') 19 | 20 | writer.close() 21 | -------------------------------------------------------------------------------- /internvl_chat/tools/jsonl2jsonl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | argparse = argparse.ArgumentParser() 6 | argparse.add_argument('path', type=str) 7 | 8 | args = argparse.parse_args() 9 | 10 | assert args.path.endswith('.jsonl') 11 | 12 | f = open(args.path) 13 | data = [json.loads(line) for line in f.readlines()] 14 | writer = open(args.path.replace('.jsonl', '_new.jsonl'), 'w') 15 | for idx, item in enumerate(data): 16 | item['id'] = idx 17 | conversations = item['conversations'] 18 | if conversations[0]['from'] == 'system': 19 | item['conversations'] = item['conversations'][1:] 20 | writer.write(json.dumps(item, ensure_ascii=False) + '\n') 21 | 22 | writer.close() 23 | -------------------------------------------------------------------------------- /internvl_chat/tools/merge_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from 
transformers import AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('input_path', type=str, help='Path to the input model') 9 | argparse.add_argument('output_path', type=str, help='Path to the output model') 10 | args = argparse.parse_args() 11 | 12 | print('Loading model...') 13 | model = InternVLChatModel.from_pretrained( 14 | args.input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).eval() 15 | print('Loading tokenizer...') 16 | tokenizer = AutoTokenizer.from_pretrained(args.input_path, trust_remote_code=True) 17 | 18 | if model.config.use_backbone_lora: 19 | model.vision_model.merge_and_unload() 20 | model.vision_model = model.vision_model.model 21 | model.config.use_backbone_lora = 0 22 | if model.config.use_llm_lora: 23 | model.language_model.merge_and_unload() 24 | model.language_model = model.language_model.model 25 | model.config.use_llm_lora = 0 26 | 27 | print('Saving model...') 28 | model.save_pretrained(args.output_path) 29 | print('Saving tokenizer...') 30 | tokenizer.save_pretrained(args.output_path) 31 | print('Done!') 32 | -------------------------------------------------------------------------------- /internvl_chat/tools/replace_llm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from transformers import AutoModel, AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('llm_path', type=str, default='') 10 | 11 | args = argparse.parse_args() 12 | 13 | if args.model_path[-1] == '/': 14 | args.model_path = args.model_path[:-1] 15 | 16 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 17 | 18 | llm = AutoModel.from_pretrained( 19 | args.llm_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 20 | tokenizer = 
AutoTokenizer.from_pretrained( 21 | args.llm_path, trust_remote_code=True) 22 | model.language_model = llm 23 | model.config.llm_config = llm.config 24 | model.to(torch.bfloat16) 25 | 26 | output_path = args.model_path + '_replace_llm' 27 | model.save_pretrained(output_path) 28 | tokenizer.save_pretrained(output_path) 29 | print('finished') 30 | -------------------------------------------------------------------------------- /internvl_chat/tools/resize_pos_embed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from transformers import AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('output_path', type=str, default='') 10 | argparse.add_argument('force_image_size', type=int, default=448) 11 | 12 | args = argparse.parse_args() 13 | 14 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 15 | model.vision_model.resize_pos_embeddings(old_size=model.config.vision_config.image_size, 16 | new_size=args.force_image_size, 17 | patch_size=14) 18 | model.config.vision_config.image_size = args.force_image_size 19 | model.config.force_image_size = args.force_image_size 20 | 21 | model.save_pretrained(args.output_path) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 24 | tokenizer.save_pretrained(args.output_path) 25 | print('finished') 26 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | 
"fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": 
[ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b_1e8.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e8, 7 | "reduce_bucket_size": 1e8, 8 | "stage3_prefetch_bucket_size": 1e8, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_34b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | 
"stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_70b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": 
"auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat_llava/images/demo_cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/demo_cli.gif -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_example_cmp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_example_cmp.png -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_logo.png -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_v1_5_radar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_v1_5_radar.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/constants.py: 
-------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | 
{"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- 
/internvl_chat_llava/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 3 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return 
F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") \ 9 | or "intern" in vision_tower.lower(): 10 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 11 | 12 | raise ValueError(f'Unknown vision tower: {vision_tower}') 13 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/__init__.py -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img1.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img2.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img3.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img3.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img4.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img5.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img6.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | # from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | # replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train(attn_implementation="flash_attention_2") 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/train/train_mem_custom.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. 
Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | # from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | # replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train_custom import train 11 | from llava.train.dist_utils import init_dist 12 | 13 | if __name__ == "__main__": 14 | try: 15 | init_dist(launcher='slurm', backend='nccl') 16 | print("slurm environment detected") 17 | except: 18 | init_dist(launcher='pytorch', backend='nccl') 19 | train(attn_implementation="flash_attention_2") 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.1.1" 8 | description = "Towards GPT-4 like large language and visual assistant." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch>=2", "torchvision>=0.15", 17 | "transformers>=4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate", "peft>=0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn>=1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "deepspeed==0.13.5", "einops", "einops-exts", "timm==0.9.12", 23 | ] 24 | 25 | [project.urls] 26 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 27 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 28 | 29 | [tool.setuptools.packages.find] 30 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 31 | 32 | [tool.wheel] 33 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 34 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_mmbench_for_submission.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/merge_lora_weights.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 
7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmbench.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mme.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file 
./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 
| --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | 
output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 1, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | 
"loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 12 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 13 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 18 | 19 | python llava/eval/eval_gpt_review_bench.py \ 20 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 21 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 22 | --rule llava/eval/table/rule.json \ 23 | --answer-list \ 24 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 25 | playground/data/eval/llava-bench-in-the-wild/answers/${MODEL_NAME}.jsonl \ 26 | --output \ 27 | 
playground/data/eval/llava-bench-in-the-wild/reviews/${MODEL_NAME}.jsonl 28 | 29 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/${MODEL_NAME}.jsonl 30 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | SPLIT="mmbench_dev_20230712" # "mmbench_dev_20230712" or "mmbench_test_en_20231003" 10 | 11 | python -m llava.eval.model_vqa_mmbench \ 12 | --model-path ${OUTPUT_DIR} \ 13 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 14 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${MODEL_NAME}.jsonl \ 15 | --single-pred-prompt \ 16 | --temperature 0 \ 17 | --conv-mode vicuna_v1 18 | 19 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 20 | 21 | python scripts/convert_mmbench_for_submission.py \ 22 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 23 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 24 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 25 | --experiment ${MODEL_NAME} 26 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 12 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 13 | --answers-file 
./playground/data/eval/MME/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | cd ./playground/data/eval/MME 18 | 19 | python convert_answer_to_mme.py --experiment ${MODEL_NAME} 20 | 21 | cd eval_tool 22 | 23 | python calculation.py --results_dir answers/${MODEL_NAME} 24 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 12 | --image-folder ./playground/data/eval/mm-vet/images \ 13 | --answers-file ./playground/data/eval/mm-vet/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p ./playground/data/eval/mm-vet/results 18 | 19 | python scripts/convert_mmvet_for_eval.py \ 20 | --src ./playground/data/eval/mm-vet/answers/${MODEL_NAME}.jsonl \ 21 | --dst ./playground/data/eval/mm-vet/results/${MODEL_NAME}.json 22 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 12 | --image-folder ./playground/data/eval/pope/val2014 \ 13 | --answers-file ./playground/data/eval/pope/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python 
llava/eval/eval_pope.py \ 18 | --annotation-dir ./playground/data/eval/pope/coco \ 19 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 20 | --result-file ./playground/data/eval/pope/answers/${MODEL_NAME}.jsonl 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_science \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 12 | --image-folder ./playground/data/eval/scienceqa/images/test \ 13 | --answers-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}.jsonl \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode vicuna_v1 17 | 18 | python llava/eval/eval_science_qa.py \ 19 | --base-dir ./playground/data/eval/scienceqa \ 20 | --result-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}.jsonl \ 21 | --output-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}_output.jsonl \ 22 | --output-result ./playground/data/eval/scienceqa/answers/${MODEL_NAME}_result.json 23 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 12 | --image-folder ./playground/data/eval/textvqa/train_images \ 13 | --answers-file 
./playground/data/eval/textvqa/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python -m llava.eval.eval_textvqa \ 18 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 19 | --result-file ./playground/data/eval/textvqa/answers/${MODEL_NAME}.jsonl 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 12 | --image-folder ./playground/data/eval/vizwiz/test \ 13 | --answers-file ./playground/data/eval/vizwiz/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python scripts/convert_vizwiz_for_submission.py \ 18 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 19 | --result-file ./playground/data/eval/vizwiz/answers/${MODEL_NAME}.jsonl \ 20 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${MODEL_NAME}.json 21 | -------------------------------------------------------------------------------- /internvl_g/evaluate.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | CHECKPOINT=${1} 4 | DATASET=${2} 5 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 6 | echo "CHECKPOINT: ${CHECKPOINT}" 7 | 8 | if [ ${DATASET} == "caption" ]; then 9 | torchrun \ 10 | --nnodes=1 \ 11 | --node_rank=0 \ 12 | --master_addr=127.0.0.1 \ 13 | --nproc_per_node=8 \ 14 | --master_port=63667 \ 15 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} 16 | fi 17 | 18 | if [ ${DATASET} == "caption-coco" ]; then 19 | torchrun \ 20 | --nnodes=1 \ 21 | 
--node_rank=0 \ 22 | --master_addr=127.0.0.1 \ 23 | --nproc_per_node=8 \ 24 | --master_port=63667 \ 25 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets coco 26 | fi 27 | 28 | if [ ${DATASET} == "caption-flickr30k" ]; then 29 | torchrun \ 30 | --nnodes=1 \ 31 | --node_rank=0 \ 32 | --master_addr=127.0.0.1 \ 33 | --nproc_per_node=8 \ 34 | --master_port=63667 \ 35 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets flickr30k 36 | fi 37 | 38 | if [ ${DATASET} == "caption-nocaps" ]; then 39 | torchrun \ 40 | --nnodes=1 \ 41 | --node_rank=0 \ 42 | --master_addr=127.0.0.1 \ 43 | --nproc_per_node=8 \ 44 | --master_port=63667 \ 45 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets nocaps 46 | fi 47 | -------------------------------------------------------------------------------- /internvl_g/internvl/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_g/internvl/model/__init__.py -------------------------------------------------------------------------------- /internvl_g/internvl/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_g/internvl/train/__init__.py -------------------------------------------------------------------------------- /internvl_g/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 
1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "scheduler": { 24 | "type": "WarmupDecayLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto", 29 | "total_num_steps": "auto" 30 | } 31 | }, 32 | "gradient_accumulation_steps": "auto", 33 | "gradient_clipping": "auto", 34 | "steps_per_print": 2000, 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto", 37 | "wall_clock_breakdown": true 38 | } 39 | -------------------------------------------------------------------------------- /internvl_g/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 2e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "scheduler": { 24 | "type": "WarmupDecayLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto", 29 | "total_num_steps": "auto" 30 | } 31 | }, 32 | "gradient_accumulation_steps": "auto", 33 | "gradient_clipping": "auto", 34 | "steps_per_print": 2000, 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto", 37 | "wall_clock_breakdown": true 38 | } 39 | -------------------------------------------------------------------------------- /internvl_g/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | 
"sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "scheduler": { 27 | "type": "WarmupDecayLR", 28 | "params": { 29 | "warmup_min_lr": "auto", 30 | "warmup_max_lr": "auto", 31 | "warmup_num_steps": "auto", 32 | "total_num_steps": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/internvl_chat.txt 2 | -r requirements/streamlit_demo.txt 3 | -r requirements/classification.txt 4 | -r requirements/segmentation.txt 5 | -------------------------------------------------------------------------------- /requirements/classification.txt: -------------------------------------------------------------------------------- 1 | gdown 2 | termcolor 3 | yacs 4 | -------------------------------------------------------------------------------- /requirements/clip_benchmark.txt: -------------------------------------------------------------------------------- 1 | open_clip_torch>=0.2.1 2 | opencv-python 3 | peft>=0.6.2 4 | protobuf 5 | pycocoevalcap 6 | pyyaml 7 | scikit-learn>=1.0,<2 8 | scikit-learn 9 | scipy 10 | task_adaptation 11 | tensorflow==2.11.0 12 | termcolor 13 | tqdm>=2 14 | transformers>=4.32.0 15 | 
webdataset>=0.2.31 16 | yacs 17 | -------------------------------------------------------------------------------- /requirements/internvl_chat.txt: -------------------------------------------------------------------------------- 1 | accelerate<1 2 | bitsandbytes==0.42.0 3 | decord 4 | deepspeed>=0.13.5 5 | einops==0.6.1 6 | einops-exts==0.0.4 7 | huggingface_hub 8 | imageio 9 | numpy==1.26.4 10 | opencv-python 11 | orjson 12 | peft==0.10.0 13 | pycocoevalcap 14 | pyyaml 15 | scikit-learn>=1.2.2 16 | scipy 17 | sentencepiece==0.1.99 18 | shortuuid 19 | tensorboardX 20 | termcolor 21 | timm==0.9.12 22 | tokenizers==0.15.1 23 | torch>=2 24 | torchvision>=0.15 25 | tqdm 26 | transformers==4.37.2 27 | yacs 28 | -------------------------------------------------------------------------------- /requirements/segmentation.txt: -------------------------------------------------------------------------------- 1 | future 2 | importlib_metadata 3 | mmcv-full==1.6.2 4 | mmsegmentation==0.30.0 5 | openmim 6 | ordered-set 7 | platformdirs 8 | tensorboard 9 | tomli 10 | yapf==0.40.1 11 | -------------------------------------------------------------------------------- /requirements/streamlit_demo.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | gradio==3.35.2 3 | gradio_client==0.2.9 4 | httpx==0.24.0 5 | markdown2[all] 6 | pydantic 7 | requests 8 | streamlit 9 | streamlit-image-select 10 | uvicorn 11 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | 
-------------------------------------------------------------------------------- /segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | dict(type='TensorboardLoggerHook') 7 | # dict(type='PaviLoggerHook') # for internal services 8 | ]) 9 | # yapf:enable 10 | dist_params = dict(backend='nccl') 11 | log_level = 'INFO' 12 | load_from = None 13 | resume_from = None 14 | workflow = [('train', 1)] 15 | cudnn_benchmark = True 16 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='CGNet', 7 | norm_cfg=norm_cfg, 8 | in_channels=3, 9 | num_channels=(32, 64, 128), 10 | num_blocks=(3, 21), 11 | dilations=(2, 4), 12 | reductions=(8, 16)), 13 | decode_head=dict( 14 | type='FCNHead', 15 | in_channels=256, 16 | in_index=2, 17 | channels=256, 18 | num_convs=0, 19 | concat_input=False, 20 | dropout_ratio=0, 21 | num_classes=19, 22 | norm_cfg=norm_cfg, 23 | loss_decode=dict( 24 | type='CrossEntropyLoss', 25 | use_sigmoid=False, 26 | loss_weight=1.0, 27 | class_weight=[ 28 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 31 | 10.396974, 10.055647 32 | ])), 33 | # model training and testing settings 34 | train_cfg=dict(sampler=None), 35 | test_cfg=dict(mode='whole')) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/dpt_vit-b16.py: 
-------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | model = dict( 3 | type='EncoderDecoder', 4 | pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa 5 | backbone=dict( 6 | type='VisionTransformer', 7 | img_size=224, 8 | embed_dims=768, 9 | num_layers=12, 10 | num_heads=12, 11 | out_indices=(2, 5, 8, 11), 12 | final_norm=False, 13 | with_cls_token=True, 14 | output_cls_token=True), 15 | decode_head=dict( 16 | type='DPTHead', 17 | in_channels=(768, 768, 768, 768), 18 | channels=256, 19 | embed_dims=768, 20 | post_process_channels=[96, 192, 384, 768], 21 | num_classes=150, 22 | readout_type='project', 23 | input_transform='multiple_select', 24 | in_index=(0, 1, 2, 3), 25 | norm_cfg=norm_cfg, 26 | loss_decode=dict( 27 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 28 | auxiliary_head=None, 29 | # model training and testing settings 30 | train_cfg=dict(), 31 | test_cfg=dict(mode='whole')) # yapf: disable 32 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/erfnet_fcn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='ERFNet', 8 | in_channels=3, 9 | enc_downsample_channels=(16, 64, 128), 10 | enc_stage_non_bottlenecks=(5, 8), 11 | enc_non_bottleneck_dilations=(2, 4, 8, 16), 12 | enc_non_bottleneck_channels=(64, 128), 13 | dec_upsample_channels=(64, 16), 14 | dec_stages_non_bottleneck=(2, 2), 15 | dec_non_bottleneck_channels=(64, 16), 16 | dropout_ratio=0.1, 17 | init_cfg=None), 18 | decode_head=dict( 19 | type='FCNHead', 20 | in_channels=16, 21 | channels=128, 22 | num_convs=1, 23 | concat_input=False, 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | norm_cfg=norm_cfg, 27 | 
align_corners=False, 28 | loss_decode=dict( 29 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 30 | # model training and testing settings 31 | train_cfg=dict(), 32 | test_cfg=dict(mode='whole')) 33 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=128, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='whole')) 37 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | 
channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='MixVisionTransformer', 8 | in_channels=3, 9 | embed_dims=32, 10 | num_stages=4, 11 | num_layers=[2, 2, 2, 2], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[7, 3, 3, 3], 14 | sr_ratios=[8, 4, 2, 1], 15 | out_indices=(0, 1, 2, 3), 16 | mlp_ratio=4, 17 | qkv_bias=True, 18 | drop_rate=0.0, 19 | attn_drop_rate=0.0, 20 | drop_path_rate=0.1), 21 | decode_head=dict( 22 | type='SegformerHead', 23 | in_channels=[32, 64, 160, 256], 24 | in_index=[0, 1, 2, 3], 25 | channels=256, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole')) 35 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/segmenter_vit-b16_mask.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segmenter/vit_base_p16_384_20220308-96dfe169.pth' # noqa 2 | # model settings 3 | backbone_norm_cfg = dict(type='LN', eps=1e-6, requires_grad=True) 4 | model = dict( 
5 | type='EncoderDecoder', 6 | pretrained=checkpoint, 7 | backbone=dict( 8 | type='VisionTransformer', 9 | img_size=(512, 512), 10 | patch_size=16, 11 | in_channels=3, 12 | embed_dims=768, 13 | num_layers=12, 14 | num_heads=12, 15 | drop_path_rate=0.1, 16 | attn_drop_rate=0.0, 17 | drop_rate=0.0, 18 | final_norm=True, 19 | norm_cfg=backbone_norm_cfg, 20 | with_cls_token=True, 21 | interpolate_mode='bicubic', 22 | ), 23 | decode_head=dict( 24 | type='SegmenterMaskTransformerHead', 25 | in_channels=768, 26 | channels=768, 27 | num_classes=150, 28 | num_layers=2, 29 | num_heads=12, 30 | embed_dims=768, 31 | dropout_ratio=0.0, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 34 | ), 35 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(480, 480)), 36 | ) 37 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_10k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=10000) 8 | checkpoint_config = dict(by_epoch=False, interval=1000) 9 | evaluation = dict(interval=1000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | 
checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_320k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=320000) 8 | checkpoint_config = dict(by_epoch=False, interval=32000) 9 | evaluation = dict(interval=32000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, 
interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_5k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=5000) 8 | checkpoint_config = dict(by_epoch=False, interval=1000) 9 | evaluation = dict(interval=1000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29510} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /segmentation/dist_train.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29300} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch --deterministic ${@:3} 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .datasets import * # noqa: F401,F403 7 | from .models import * # noqa: F401,F403 8 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .ade import ADE20KDataset 7 | from .pipelines import * # noqa: F401,F403 8 | 9 | __all__ = ['ADE20KDataset'] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .transform import PadShortSide, SETR_Resize 7 | 8 | __all__ = [ 9 | 
'SETR_Resize', 'PadShortSide', 10 | ] 11 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .backbones import * # noqa: F401,F403 7 | from .decode_heads import * # noqa: F401,F403 8 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .intern_vit_6b import InternViT6B 2 | 3 | __all__ = ['InternViT6B'] 4 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .fcn_head import FCNHead 8 | 9 | __all__ = ['FCNHead'] 10 | -------------------------------------------------------------------------------- /segmentation/release.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | parser = argparse.ArgumentParser(description='Hyperparams') 6 | parser.add_argument('filename', nargs='?', type=str, default=None) 7 | 8 | args = parser.parse_args() 9 | 10 | model = torch.load(args.filename, map_location=torch.device('cpu')) 11 | model = model['module'] 12 | 13 | # new_model = {} 14 | # for k, v in model.items(): 15 
| # if "backbone.blocks" in k: 16 | # continue 17 | # if "auxiliary_head" in k: 18 | # continue 19 | # if "pos_embed" in k or "patch_embed" in k or "cls_token" in k: 20 | # continue 21 | # try: 22 | # if "bn" in k: 23 | # print("fp32:", k) 24 | # new_model[k] = v 25 | # else: 26 | # new_model[k] = v 27 | # except: 28 | # new_model[k] = v 29 | # print(new_model.keys()) 30 | 31 | # new_dict = {'state_dict': new_state_dict} 32 | torch.save(model, args.filename.replace('.pt', '_release.pt')) 33 | -------------------------------------------------------------------------------- /segmentation/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /segmentation/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | PY_ARGS=${@:4} 13 | 14 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | 
--cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | ${SRUN_ARGS} \ 23 | python -u train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} 24 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "torch_adam": true, 8 | "lr": 0.00004 9 | } 10 | }, 11 | "fp16": { 12 | "enabled": true, 13 | "auto_cast": true 14 | }, 15 | "steps_per_print": 50 16 | } 17 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_amp.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "amp": { 11 | "enabled": true, 12 | "opt_level": "O1" 13 | }, 14 | "log": { 15 | "steps_per_print": 50 16 | }, 17 | "wall_clock_breakdown": false 18 | } 19 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 1, 15 | "reduce_bucket_size": 5e8, 16 | "overlap_comm": true 17 | }, 18 | "log": { 19 | "steps_per_print": 50 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "fp16": { 11 | "enabled": true, 12 | "auto_cast": false 13 | }, 14 | "zero_optimization": { 15 | "stage": 1, 16 | "allgather_partitions": true, 17 | "allgather_bucket_size": 1e9, 18 | "overlap_comm": true, 19 | "reduce_scatter": false, 20 | "reduce_bucket_size": 1e9, 21 | "contiguous_gradients": true 22 | }, 23 | "log": { 24 | "steps_per_print": 50 25 | }, 26 | "wall_clock_breakdown": false 27 | } 28 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero2_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 1, 15 | "reduce_bucket_size": 5e8, 16 | "overlap_comm": true 17 | }, 18 | "log": { 19 | "steps_per_print": 50 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero2_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "fp16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 2, 15 | "allgather_partitions": true, 16 | "allgather_bucket_size": 1e9, 17 | "overlap_comm": true, 18 | "reduce_scatter": true, 19 | "reduce_bucket_size": 1e9, 20 | "contiguous_gradients": true 21 | }, 22 | "log": { 23 | "steps_per_print": 50 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero3_fp16.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "ZeroOneAdam", 6 | "params": { 7 | "lr": 1e-3, 8 | "weight_decay": 0.01, 9 | "bias_correction": false, 10 | "var_freeze_step": 1000, 11 | "var_update_scaler": 16, 12 | "local_step_scaler": 1000, 13 | "local_step_clipper": 16, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "fp16": { 19 | "enabled": true 20 | }, 21 | "zero_optimization": { 22 | "stage": 3, 23 | "contiguous_gradients": true, 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_prefetch_bucket_size": 1e7, 27 | "stage3_param_persistence_threshold": 1e5, 28 | "reduce_bucket_size": 1e7, 29 | "sub_group_size": 1e9 30 | }, 31 | "steps_per_print": 50 32 | } 33 | -------------------------------------------------------------------------------- /streamlit_demo/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [server] 2 | enableStaticServing = false 3 | enableXsrfProtection = false 4 | enableCORS = false 5 | 6 | [browser] # This ip and port will show in command prompt 7 | # serverAddress = "internvl.opengvlab.com" # Put your Local IP or Domain Name 8 | serverPort = 10003 9 | enableCORS = false 10 | -------------------------------------------------------------------------------- /streamlit_demo/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 8 | WORKER_HEART_BEAT_INTERVAL = 15 9 | 10 | LOGDIR = 'logs/' 11 | 12 | # Model Constants 13 | IGNORE_INDEX = -100 14 | IMAGE_TOKEN_INDEX = 
-200 15 | DEFAULT_IMAGE_TOKEN = '' 16 | DEFAULT_IMAGE_PATCH_TOKEN = '' 17 | DEFAULT_IM_START_TOKEN = '' 18 | DEFAULT_IM_END_TOKEN = '' 19 | IMAGE_PLACEHOLDER = '' 20 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 21 | IMAGENET_STD = (0.229, 0.224, 0.225) 22 | 23 | server_error_msg = '**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**' 24 | -------------------------------------------------------------------------------- /streamlit_demo/gallery/astro_on_unicorn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/astro_on_unicorn.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/cheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/cheetah.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_1.jpeg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_11.jpg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_12.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_12.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_4.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_9.jpg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_en_17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_en_17.png -------------------------------------------------------------------------------- /streamlit_demo/static/SimHei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/static/SimHei.ttf --------------------------------------------------------------------------------