├── .flake8 ├── .github ├── CONTRIBUTING.md └── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ └── 3-documentation.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── INSTALLATION.md ├── LICENSE ├── README.md ├── README_zh.md ├── classification ├── README.md ├── config.py ├── configs │ ├── attn_pooling_probing │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml │ 
│ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ └── attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml │ ├── intern_vit_6b_1k_224.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_a.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_r.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_real.yaml │ ├── intern_vit_6b_1k_224_test_imagenet_sketch.yaml │ ├── intern_vit_6b_1k_224_test_imagenetv2.yaml │ └── linear_probing │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml │ │ ├── 
linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ ├── 
linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml │ │ ├── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml │ │ └── linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml ├── dataset │ ├── __init__.py │ ├── build.py │ ├── cached_image_folder.py │ ├── imagenet_a_r_indices.py │ ├── imagenet_real.py │ ├── imagenetv2.py │ ├── samplers.py │ └── zipreader.py ├── ddp_hooks.py ├── gflops.py ├── hf2pytorch.py ├── logger.py ├── lr_scheduler.py ├── main.py ├── meta_data │ ├── 22k_class_to_idx.json │ ├── imagenet_classes.json │ ├── map22kto1k.txt │ ├── real.json │ ├── train.txt.zip │ └── val.txt.zip ├── models │ ├── __init__.py │ ├── build.py │ ├── clip_vit.py │ ├── flash_attention.py │ └── intern_vit_6b.py ├── optimizer.py ├── train_in1k.sh ├── utils.py └── work_dirs │ └── intern_vit_6b_1k_224 │ └── log_rank0.txt ├── clip_benchmark ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── benchmark.png ├── benchmark │ ├── README.md │ ├── benchmark.csv │ ├── dataset_type.csv │ ├── datasets.txt │ ├── datasets_multilingual.txt │ ├── models.txt │ ├── results.ipynb │ └── webdatasets.txt ├── clip_benchmark │ ├── __init__.py │ ├── cli.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ar_classnames.json │ │ ├── ar_zeroshot_classification_templates.json │ │ ├── birdsnap.py │ │ ├── builder.py │ │ ├── caltech101.py │ │ ├── cn_classnames.json │ │ ├── cn_zeroshot_classification_templates.json │ │ ├── cupl_prompts.json │ │ ├── en_classnames.json │ │ ├── en_zeroshot_classification_templates.json │ │ ├── flickr.py │ │ ├── imagenetv2.py │ │ ├── it_classnames.json │ │ ├── 
it_zeroshot_classification_templates.json │ │ ├── jp_classnames.json │ │ ├── jp_zeroshot_classification_templates.json │ │ ├── kitti.py │ │ ├── multilingual_mscoco.py │ │ ├── objectnet.py │ │ ├── tfds.py │ │ ├── tools.py │ │ └── voc2007.py │ ├── metrics │ │ ├── __init__.py │ │ ├── linear_probe.py │ │ ├── mscoco_generative.py │ │ ├── zeroshot_classification.py │ │ └── zeroshot_retrieval.py │ ├── model_collection.py │ ├── models │ │ ├── __init__.py │ │ ├── intern_vit_6b │ │ │ ├── configuration_intern_vit.py │ │ │ ├── flash_attention.py │ │ │ └── modeling_intern_vit.py │ │ ├── internvl.py │ │ ├── internvl_c_pytorch │ │ │ ├── __init__.py │ │ │ ├── chinese_alpaca_lora_7b │ │ │ │ ├── config.json │ │ │ │ ├── generation_config.json │ │ │ │ ├── pytorch_model.bin.index.json │ │ │ │ ├── special_tokens_map.json │ │ │ │ ├── tokenizer.model │ │ │ │ └── tokenizer_config.json │ │ │ ├── flash_attention.py │ │ │ └── internvl_c.py │ │ ├── internvl_huggingface │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ │ ├── japanese_clip.py │ │ └── open_clip.py │ └── webdataset_builder.py ├── data │ ├── birdsnap │ │ └── test_images_valid.txt │ ├── flickr30k │ │ └── flickr30k_cn_test.txt │ └── mscoco_captions │ │ └── coco-cn_test.json ├── probe_benchmark │ ├── PROBES.md │ ├── build_df_scaling_experiments.py │ ├── clip_table_2.csv │ ├── generate_table.py │ ├── gmacs_vs_perf_retrieval.pdf │ ├── imagenet_cifar_lp.pdf │ ├── imagenet_cifar_lp_vtab.pdf │ ├── laion5b_fewshot_experiments.py │ ├── openclip_results.csv │ ├── process_vtab.py │ ├── scaling_experiment_data2.json │ ├── scaling_experiment_data_vtab.json │ ├── scaling_experiments.py │ └── scaling_plot.ipynb ├── requirements-test.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_internvl_c_classification.sh ├── test_internvl_c_imagenet.sh ├── 
test_internvl_c_retrieval.sh ├── test_internvl_c_xtd.sh ├── test_internvl_g_classification.sh ├── test_internvl_g_imagenet.sh ├── test_internvl_g_retrieval.sh ├── test_internvl_g_retrieval_finetune.sh ├── test_internvl_g_xtd.sh ├── tests │ └── test_clip_benchmark.py └── tox.ini ├── internvl_chat ├── README.md ├── eval │ ├── README.md │ ├── caption │ │ ├── README.md │ │ └── evaluate_caption.py │ ├── domain_specific │ │ ├── drivelm │ │ │ └── evaluate.py │ │ ├── mme_rw │ │ │ └── evaluate.py │ │ ├── rs_det │ │ │ ├── caculate.py │ │ │ └── evaluate.py │ │ └── rs_vqa │ │ │ ├── evaluate.py │ │ │ └── score.py │ ├── llava_bench │ │ ├── README.md │ │ ├── eval_gpt_review_bench.py │ │ ├── evaluate_llava_bench.py │ │ ├── rule.json │ │ └── summarize_gpt_review.py │ ├── mantis_eval │ │ ├── README.md │ │ └── evaluate_mantis.py │ ├── mathvista │ │ ├── README.md │ │ ├── calculate_score.py │ │ ├── evaluate_mathvista.py │ │ ├── extract_answer.py │ │ ├── prompts │ │ │ └── ext_ans.py │ │ └── utilities.py │ ├── mirb │ │ ├── README.md │ │ └── evaluate_mirb.py │ ├── mmbench │ │ ├── README.md │ │ └── evaluate_mmbench.py │ ├── mme │ │ ├── README.md │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ ├── calculation.py │ │ └── eval.py │ ├── mmhal │ │ ├── README.md │ │ ├── eval_gpt_mmhal.py │ │ └── evaluate_mmhal.py │ ├── mmiu │ │ ├── README.md │ │ ├── evaluate_mmiu.py │ │ └── mmiu.jsonl │ ├── mmmu │ │ ├── README.md │ │ ├── answer_dict_val.json │ │ ├── data_utils.py │ │ ├── eval_utils.py │ │ ├── evaluate_mmmu.py │ │ └── main_eval_only.py │ ├── mmmu_pro │ │ ├── README.md │ │ ├── evaluate.py │ │ ├── evaluate_mmmu_pro.py │ │ └── prompts.yaml │ ├── mmvet │ │ ├── README.md │ │ └── 
evaluate_mmvet.py │ ├── mmvetv2 │ │ ├── README.md │ │ └── evaluate_mmvet_v2.py │ ├── mmvp │ │ ├── README.md │ │ └── evaluate_mmvp.py │ ├── mpdocvqa │ │ ├── README.md │ │ ├── evaluate_vqa.py │ │ └── infographicsvqa_eval.py │ ├── mvbench │ │ ├── README.md │ │ └── evaluate_mvbench.py │ ├── pope │ │ ├── README.md │ │ ├── eval_pope.py │ │ └── evaluate_pope.py │ ├── refcoco │ │ ├── README.md │ │ └── evaluate_grounding.py │ ├── scienceqa │ │ ├── README.md │ │ └── evaluate_scienceqa.py │ ├── seed │ │ ├── README.md │ │ ├── calculation.py │ │ └── evaluate_seed.py │ ├── tiny_lvlm │ │ ├── README.md │ │ ├── calculate_score.py │ │ ├── evaluate_lvlm.py │ │ └── tools.py │ └── vqa │ │ ├── README.md │ │ ├── convert_gqa_for_eval.py │ │ ├── evaluate_vqa.py │ │ ├── infographicsvqa_eval.py │ │ └── textvqa_eval.py ├── evaluate.sh ├── examples │ ├── image1.jpg │ ├── image2.jpg │ ├── image3.jpg │ ├── image4.jpg │ └── image5.jpg ├── internvl │ ├── conversation.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internlm2 │ │ │ ├── configuration_internlm2.py │ │ │ ├── modeling_internlm2.py │ │ │ ├── tokenization_internlm2.py │ │ │ └── tokenization_internlm2_fast.py │ │ ├── internvl_chat │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl_chat.py │ │ │ ├── modeling_intern_vit.py │ │ │ └── modeling_internvl_chat.py │ │ └── phi3 │ │ │ ├── configuration_phi3.py │ │ │ └── modeling_phi3.py │ ├── patch │ │ ├── __init__.py │ │ ├── internlm2_packed_training_patch.py │ │ ├── internvit_liger_monkey_patch.py │ │ ├── llama2_flash_attn_monkey_patch.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_packed_training_patch.py │ │ ├── llama_rmsnorm_monkey_patch.py │ │ ├── pad_data_collator.py │ │ ├── phi3_packed_training_patch.py │ │ ├── qwen2_packed_training_patch.py │ │ ├── train_dataloader_patch.py │ │ └── train_sampler_patch.py │ └── train │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── dataset.py │ │ ├── dataset_packed.py │ │ ├── 
internvl_chat_finetune.py │ │ ├── internvl_chat_mpo.py │ │ ├── internvl_chat_pretrain.py │ │ └── trainer_dpo.py ├── pyproject.toml ├── shell │ ├── data │ │ ├── coco_caption.json │ │ ├── internvl_1_2_finetune.json │ │ └── internvl_1_2_finetune_custom.json │ ├── internvl1.2 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh │ │ └── hermes2_yi34b │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh │ ├── internvl1.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── hermes2_yi34b │ │ │ ├── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh │ │ ├── internlm2_1_8b │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh │ │ ├── internlm2_20b │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh │ │ └── phi3_3_8b │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh │ ├── internvl2.0 │ │ └── 2nd_finetune │ │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── 
internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh │ │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh │ ├── internvl2.0_mpo │ │ ├── README.md │ │ └── preference_optimization │ │ │ └── internvl2_8b_internlm2_7b_dynamic_res_mpo_full.sh │ ├── internvl2.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl2_5_1b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_1b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_26b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_26b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco.sh │ │ │ ├── internvl2_5_38b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_38b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_4b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_4b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_78b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl2_5_78b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl2_5_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl2_5_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── stage1.5 │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5.sh │ │ ├── stage1 │ │ │ ├── 
internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1.sh │ │ └── stage2 │ │ │ ├── internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_stage2.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_stage2.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2.sh │ ├── internvl2.5_mpo │ │ └── preference_optimization │ │ │ ├── internvl2_5_1b_qwen2_5_0_5b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_26b_internlm2_5_20b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_2b_internlm2_5_1_8b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_38b_qwen2_5_32b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_4b_qwen2_5_3b_dynamic_res_mpo.sh │ │ │ ├── internvl2_5_78b_qwen2_5_72b_dynamic_res_mpo.sh │ │ │ └── internvl2_5_8b_internlm2_5_7b_dynamic_res_mpo.sh │ ├── internvl3.0 │ │ ├── 2nd_finetune │ │ │ ├── internvl3_14b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_1b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_2b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_38b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_78b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl3_9b_dynamic_res_2nd_finetune_full.sh │ │ ├── mpo │ │ │ ├── internvl3_14b_mpo.sh │ │ │ ├── internvl3_1b_mpo.sh │ │ │ ├── internvl3_2b_mpo.sh │ │ │ ├── internvl3_38b_mpo.sh │ │ │ ├── internvl3_78b_mpo.sh │ │ │ ├── internvl3_8b_mpo.sh │ │ │ └── internvl3_9b_mpo.sh │ │ ├── 
mpo_data_construction │ │ │ ├── correctness_build_data.sh │ │ │ └── correctness_mmpr_8b.sh │ │ └── visualprm_data_construction │ │ │ ├── visualprm_build_data.sh │ │ │ └── visualprm_mmpr_8b.sh │ └── mini_internvl │ │ ├── README.md │ │ └── domain_adaptation │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_medical.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_finetune_remote.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_medical.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_finetune_remote.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_bdd.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_drivelm.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_finetune_medical.sh │ │ └── internvl2_4b_phi3_3_8b_dynamic_res_finetune_remote.sh ├── tools │ ├── convert_to_int8.py │ ├── extract_mlp.py │ ├── extract_video_frames.py │ ├── extract_vit.py │ ├── images_stitching.py │ ├── json2jsonl.py │ ├── jsonl2jsonl.py │ ├── merge_lora.py │ ├── reasoning_data_pipeline │ │ ├── mmpr_data_pipeline_correctness.py │ │ ├── mmpr_data_pipeline_correctness_postprocess.py │ │ ├── mmpr_data_pipeline_dropout_ntp.py │ │ ├── utils │ │ │ ├── accuracy_reward.py │ │ │ ├── constants.py │ │ │ └── utils.py │ │ ├── visualprm_data_pieline.py │ │ └── visualprm_data_pipeline_postprocess.py │ ├── replace_llm.py │ └── resize_pos_embed.py ├── zero_stage1_config.json ├── zero_stage2_config.json ├── zero_stage3_config.json ├── zero_stage3_config_100b.json ├── zero_stage3_config_100b_1e7_offload.json ├── zero_stage3_config_100b_1e8.json ├── zero_stage3_config_34b.json └── zero_stage3_config_70b.json ├── internvl_chat_llava ├── LICENSE ├── README.md ├── docs │ ├── Customize_Component.md │ ├── Data.md │ ├── Evaluation.md │ ├── 
LLaVA_Bench.md │ ├── LLaVA_from_LLaMA2.md │ ├── LoRA.md │ ├── MODEL_ZOO.md │ └── ScienceQA.md ├── images │ ├── demo_cli.gif │ ├── llava_example_cmp.png │ ├── llava_logo.png │ └── llava_v1_5_radar.jpg ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ ├── table │ │ │ ├── answer │ │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ │ ├── answer_bard.jsonl │ │ │ │ ├── answer_gpt35.jsonl │ │ │ │ ├── answer_llama-13b.jsonl │ │ │ │ └── answer_vicuna-13b.jsonl │ │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ │ ├── model.jsonl │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ ├── results │ │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ │ ├── review │ │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ │ ├── reviewer.jsonl │ │ │ └── rule.json │ │ └── webpage │ │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── chatgpt.svg │ │ │ ├── llama.jpg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── vicuna.jpeg │ │ │ ├── index.html │ │ │ ├── script.js │ │ │ └── styles.css │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ 
│ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ ├── eva_clip │ │ │ │ ├── configuration_evaclip.py │ │ │ │ └── modeling_evaclip.py │ │ │ ├── intern_vit_6b │ │ │ │ ├── configuration_intern_vit.py │ │ │ │ ├── flash_attention.py │ │ │ │ └── modeling_intern_vit.py │ │ │ └── internvl_14b │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_intern_vit.py │ │ │ │ ├── configuration_internvl.py │ │ │ │ ├── flash_attention.py │ │ │ │ ├── modeling_intern_vit.py │ │ │ │ ├── modeling_internvl.py │ │ │ │ └── modeling_qllama.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ ├── img1.jpg │ │ │ ├── img2.jpg │ │ │ ├── img3.jpg │ │ │ ├── img4.jpg │ │ │ ├── img5.jpg │ │ │ ├── img6.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ └── test_message.py │ ├── train │ │ ├── dist_utils.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_custom.py │ │ ├── train_mem.py │ │ └── train_mem_custom.py │ └── utils.py ├── pyproject.toml ├── scripts │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── finetune.sh │ ├── finetune_full_schedule.sh │ ├── finetune_lora.sh │ ├── finetune_qlora.sh │ ├── finetune_sqa.sh │ ├── merge_lora_weights.py │ ├── pretrain.sh │ ├── sqa_eval_batch.sh │ ├── sqa_eval_gather.sh │ ├── 
v1_5 │ │ ├── eval │ │ │ ├── gqa.sh │ │ │ ├── llavabench.sh │ │ │ ├── mmbench.sh │ │ │ ├── mmbench_cn.sh │ │ │ ├── mme.sh │ │ │ ├── mmvet.sh │ │ │ ├── pope.sh │ │ │ ├── seed.sh │ │ │ ├── sqa.sh │ │ │ ├── textvqa.sh │ │ │ ├── vizwiz.sh │ │ │ └── vqav2.sh │ │ ├── finetune.sh │ │ └── pretrain.sh │ ├── zero1.json │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json └── scripts_internvl │ ├── eval │ ├── gqa.sh │ ├── llavabench.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh │ ├── finetune_internvit6b_224to336_vicuna13b.sh │ ├── finetune_internvit6b_224to336_vicuna13b_custom_data.sh │ ├── finetune_internvit6b_224to336_vicuna7b.sh │ ├── finetune_internvit6b_448_v1_2_vicuna13b.sh │ ├── finetune_internvit6b_448_v1_5_vicuna13b.sh │ ├── finetune_internvit6b_448_vicuna13b.sh │ ├── finetune_internvit6b_448_vicuna7b.sh │ ├── meta │ └── custom_data.json │ ├── pretrain_internvit6b_224to336_vicuna13b.sh │ ├── pretrain_internvit6b_224to336_vicuna7b.sh │ ├── pretrain_internvit6b_448_v1_2_vicuna13b.sh │ ├── pretrain_internvit6b_448_v1_5_vicuna13b.sh │ ├── pretrain_internvit6b_448_vicuna13b.sh │ └── pretrain_internvit6b_448_vicuna7b.sh ├── internvl_g ├── README.md ├── eval │ └── evaluate_caption.py ├── evaluate.sh ├── internvl │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internvl_stage2 │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ │ └── internvl_stage2_retrieval │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ ├── modeling_internvl.py │ │ │ └── modeling_qllama.py │ └── train │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── internvl_stage2_finetune.py │ │ └── trainer_monkey_patch.py ├── shell │ ├── finetune │ │ 
├── internvl_stage2_finetune_coco_364_bs1024_ep5.sh │ │ ├── internvl_stage2_finetune_flickr_364_bs1024_ep10.sh │ │ └── internvl_stage2_finetune_flickrcn_364_bs1024_ep10.sh │ ├── head_finetune │ │ ├── internvl_stage2_finetune_coco_224_bs1024_ep5_head_4gpu.sh │ │ ├── internvl_stage2_finetune_flickr_224_bs1024_ep10_head_4gpu.sh │ │ └── internvl_stage2_finetune_flickrcn_224_bs1024_ep10_head_4gpu.sh │ └── lora_finetune │ │ ├── internvl_stage2_finetune_coco_224_bs1024_ep5_lora16_4gpu.sh │ │ ├── internvl_stage2_finetune_flickr_224_bs1024_ep10_lora16_4gpu.sh │ │ └── internvl_stage2_finetune_flickrcn_224_bs1024_ep10_lora16_4gpu.sh ├── zero_stage1_config.json ├── zero_stage2_config.json └── zero_stage3_config.json ├── requirements.txt ├── requirements ├── classification.txt ├── clip_benchmark.txt ├── internvl_chat.txt ├── segmentation.txt └── streamlit_demo.txt ├── segmentation ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── ade20k.py │ │ │ ├── ade20k_504x504.py │ │ │ ├── ade20k_504x504_1of16.py │ │ │ ├── ade20k_504x504_1of2.py │ │ │ ├── ade20k_504x504_1of4.py │ │ │ ├── ade20k_504x504_1of8.py │ │ │ ├── ade20k_640x640.py │ │ │ ├── ade20k_896x896.py │ │ │ ├── chase_db1.py │ │ │ ├── cityscapes.py │ │ │ ├── cityscapes_1024x1024.py │ │ │ ├── cityscapes_768x768.py │ │ │ ├── cityscapes_769x769.py │ │ │ ├── cityscapes_832x832.py │ │ │ ├── coco-stuff10k.py │ │ │ ├── coco-stuff164k.py │ │ │ ├── coco-stuff164k_896x896.py │ │ │ ├── drive.py │ │ │ ├── hrf.py │ │ │ ├── isaid.py │ │ │ ├── loveda.py │ │ │ ├── pascal_context.py │ │ │ ├── pascal_context_59.py │ │ │ ├── pascal_voc12.py │ │ │ ├── pascal_voc12_aug.py │ │ │ ├── potsdam.py │ │ │ ├── stare.py │ │ │ └── vaihingen.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── ann_r50-d8.py │ │ │ ├── apcnet_r50-d8.py │ │ │ ├── bisenetv1_r18-d32.py │ │ │ ├── bisenetv2.py │ │ │ ├── ccnet_r50-d8.py │ │ │ ├── cgnet.py │ │ │ ├── danet_r50-d8.py │ │ │ ├── deeplabv3_r50-d8.py │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ ├── 
deeplabv3plus_r50-d8.py │ │ │ ├── dmnet_r50-d8.py │ │ │ ├── dnl_r50-d8.py │ │ │ ├── dpt_vit-b16.py │ │ │ ├── emanet_r50-d8.py │ │ │ ├── encnet_r50-d8.py │ │ │ ├── erfnet_fcn.py │ │ │ ├── fast_scnn.py │ │ │ ├── fastfcn_r50-d32_jpu_psp.py │ │ │ ├── fcn_hr18.py │ │ │ ├── fcn_r50-d8.py │ │ │ ├── fcn_unet_s5-d16.py │ │ │ ├── fpn_r50.py │ │ │ ├── gcnet_r50-d8.py │ │ │ ├── icnet_r50-d8.py │ │ │ ├── isanet_r50-d8.py │ │ │ ├── lraspp_m-v3-d8.py │ │ │ ├── mask2former_beit.py │ │ │ ├── nonlocal_r50-d8.py │ │ │ ├── ocrnet_hr18.py │ │ │ ├── ocrnet_r50-d8.py │ │ │ ├── pointrend_r50.py │ │ │ ├── psanet_r50-d8.py │ │ │ ├── pspnet_r50-d8.py │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ ├── segformer_mit-b0.py │ │ │ ├── segmenter_vit-b16_mask.py │ │ │ ├── setr_mla.py │ │ │ ├── setr_naive.py │ │ │ ├── setr_pup.py │ │ │ ├── stdc.py │ │ │ ├── twins_pcpvt-s_fpn.py │ │ │ ├── twins_pcpvt-s_upernet.py │ │ │ ├── upernet_beit.py │ │ │ ├── upernet_convnext.py │ │ │ ├── upernet_mae.py │ │ │ ├── upernet_r50.py │ │ │ ├── upernet_swin.py │ │ │ └── upernet_vit-b16_ln_mln.py │ │ └── schedules │ │ │ ├── schedule_10k.py │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_320k.py │ │ │ ├── schedule_40k.py │ │ │ ├── schedule_5k.py │ │ │ └── schedule_80k.py │ └── intern_vit_6b │ │ ├── few_shot │ │ ├── linear_intern_vit_6b_504_10k_ade20k_bs16_lr4e-5_1of8.py │ │ ├── linear_intern_vit_6b_504_20k_ade20k_bs16_lr4e-5_1of4.py │ │ ├── linear_intern_vit_6b_504_40k_ade20k_bs16_lr4e-5_1of2.py │ │ ├── linear_intern_vit_6b_504_5k_ade20k_bs16_lr4e-5_1of16.py │ │ └── linear_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_1of1.py │ │ ├── full_tuning │ │ └── upernet_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5.py │ │ ├── head_tuning │ │ └── upernet_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_frozen.py │ │ └── linear_probing │ │ └── linear_intern_vit_6b_504_80k_ade20k_bs16_lr4e-5_frozen.py ├── dist_test.sh ├── dist_train.sh ├── mmcv_custom │ ├── __init__.py │ ├── ddp_hooks.py │ └── layer_decay_optimizer_constructor.py ├── 
mmseg_custom │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ade.py │ │ └── pipelines │ │ │ ├── __init__.py │ │ │ └── transform.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ ├── flash_attention.py │ │ └── intern_vit_6b.py │ │ └── decode_heads │ │ ├── __init__.py │ │ └── fcn_head.py ├── release.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── train.py └── zero_configs │ ├── adam_fp16.json │ ├── adam_zero1_amp.json │ ├── adam_zero1_bf16.json │ ├── adam_zero1_fp16.json │ ├── adam_zero2_bf16.json │ ├── adam_zero2_fp16.json │ └── adam_zero3_fp16.json ├── streamlit_demo ├── .streamlit │ └── config.toml ├── api.py ├── app.py ├── constants.py ├── controller.py ├── gallery │ ├── astro_on_unicorn.png │ ├── cheetah.png │ ├── prod_1.jpeg │ ├── prod_11.jpg │ ├── prod_12.png │ ├── prod_4.png │ ├── prod_9.jpg │ └── prod_en_17.png ├── library.py ├── model_worker.py ├── sd_worker.py ├── static │ └── SimHei.ttf └── utils.py └── video_retrieval └── test_msrvtt.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F403, C901, W504, W605, E251, E122, E126, E127, E722, W503, E128, E741, E731, E701 3 | select = E1, E3, E502, E7, E9, W1, W5, W6 4 | max-line-length = 180 5 | exclude=*.egg/*,build,dist,detection/configs/* 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[Feature] " 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | We strongly appreciate you creating a PR to implement this feature [here](https://github.com/OpenGVLab/InternVL/pulls)! 10 | If you need our help, please fill in as much of the following form as you're able to. 
11 | 12 | **The less clear the description, the longer it will take to solve it.** 13 | - type: textarea 14 | attributes: 15 | label: Motivation 16 | description: | 17 | A clear and concise description of the motivation of the feature. 18 | Ex1. It is inconvenient when \[....\]. 19 | validations: 20 | required: true 21 | - type: textarea 22 | attributes: 23 | label: Related resources 24 | description: | 25 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 26 | - type: textarea 27 | attributes: 28 | label: Additional context 29 | description: | 30 | Add any other context or screenshots about the feature request here. 31 | If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to the documentation. 3 | labels: "kind/doc,status/unconfirmed" 4 | title: "[Docs] " 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of the issue. 12 | validations: 13 | required: true 14 | 15 | - type: textarea 16 | attributes: 17 | label: Suggest a potential alternative/fix 18 | description: > 19 | Tell us how we could improve the documentation in this regard. 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! 
24 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 180 3 | multi_line_output = 0 4 | extra_standard_library = setuptools 5 | known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml 6 | no_lines_before = STDLIB,LOCALFOLDER 7 | default_section = THIRDPARTY 8 | 9 | [yapf] 10 | BASED_ON_STYLE = pep8 11 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 12 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 13 | 14 | [codespell] 15 | skip = *.ipynb 16 | quiet-level = 3 17 | ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood 18 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^internvl_chat_llava/ 2 | repos: 3 | - repo: https://github.com/PyCQA/flake8 4 | rev: 5.0.4 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v4.3.0 13 | hooks: 14 | - id: trailing-whitespace 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: requirements-txt-fixer 18 | - id: double-quote-string-fixer 19 | - id: check-merge-conflict 20 | - id: fix-encoding-pragma 21 | args: ["--remove"] 22 | - id: mixed-line-ending 23 | args: ["--fix=lf"] 24 | - repo: https://github.com/executablebooks/mdformat 25 | rev: 0.7.9 26 | hooks: 27 | - id: mdformat 28 | args: ["--number"] 29 | additional_dependencies: 30 | - mdformat-openmmlab 31 | - 
mdformat_frontmatter 32 | - linkify-it-py 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenGVLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'attention_pooling' 23 | TRAIN: 24 | EMA: 25 | ENABLE: True 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: 
True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | 
NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 
'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- 
/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: 
True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | 
PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # 
single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | 
-------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | 
TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | 
INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: 
'./data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- 
/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | 
EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | 
PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 23 | CLS_TARGET: 'attention_pooling' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 
16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | 
-------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'attention_pooling' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | 
EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'cls_patch_concat' 23 | TRAIN: 24 | EMA: 25 | ENABLE: False 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 
11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | 
DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/intern_vit_6b_1k_224_test_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 128 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: False 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 
2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | MODEL: 7 | TYPE: intern_vit_6b 8 | DROP_PATH_RATE: 0.0 9 | INTERN_VIT_6B: 10 | FREEZE_VIT: True 11 | PATCH_SIZE: 14 12 | PRETRAIN_SIZE: 224 13 | QKV_BIAS: False 14 | EMBED_DIM: 3200 15 | NUM_HEADS: 25 16 | MLP_RATIO: 4 17 | INIT_VALUES: 0.1 18 | QK_NORMALIZATION: True 19 | DEPTH: 48 20 | USE_FLASH_ATTN: True 21 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 22 | CLS_TARGET: 'cls_patch_concat' 23 | TRAIN: 24 | EMA: 25 | ENABLE: True 26 | DECAY: 0.998 27 | EPOCHS: 10 28 | WARMUP_EPOCHS: 1 29 | WEIGHT_DECAY: 0.0 30 | BASE_LR: 0.1 # 512 31 | WARMUP_LR: .0 32 | MIN_LR: .0 33 | LR_LAYER_DECAY: false 34 | OPTIMIZER: 35 | NAME: 'sgd' 36 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- 
/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | 
WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | 
DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 224 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 48 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 224 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 48 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | 
PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | 
WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | 
PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | 
DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: 
false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: 
"./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 
| DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml: 
-------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | TRANSFORM: 'build_transform_for_linear_probe' 5 | DATA_PATH: './data/imagenet-1k' 6 | IMG_SIZE: 448 7 | MODEL: 8 | TYPE: intern_vit_6b 9 | DROP_PATH_RATE: 0.0 10 | INTERN_VIT_6B: 11 | FREEZE_VIT: True 12 | PATCH_SIZE: 14 13 | PRETRAIN_SIZE: 448 14 | QKV_BIAS: False 15 | EMBED_DIM: 3200 16 | NUM_HEADS: 25 17 | MLP_RATIO: 4 18 | INIT_VALUES: 0.1 19 | QK_NORMALIZATION: True 20 | DEPTH: 45 21 | USE_FLASH_ATTN: True 22 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 23 | CLS_TARGET: 'cls_patch_concat' 24 | TRAIN: 25 | EMA: 26 | ENABLE: True 27 | DECAY: 0.998 28 | EPOCHS: 10 29 | WARMUP_EPOCHS: 1 30 | WEIGHT_DECAY: 0.0 31 | BASE_LR: 0.1 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: false 35 | OPTIMIZER: 36 | NAME: 'sgd' 37 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_a' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-a' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | 
LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_r' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-r' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet-real' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-1k' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 
23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenet_sketch' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenet-sketch' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | BATCH_SIZE: 16 # single GPU batch size 4 | DATASET: 'imagenetv2' 5 | TRANSFORM: 'build_transform_for_linear_probe' 6 | DATA_PATH: './data/imagenetv2' 7 | IMG_SIZE: 448 8 | MODEL: 9 | TYPE: 
intern_vit_6b 10 | DROP_PATH_RATE: 0.0 11 | INTERN_VIT_6B: 12 | FREEZE_VIT: True 13 | PATCH_SIZE: 14 14 | PRETRAIN_SIZE: 448 15 | QKV_BIAS: False 16 | EMBED_DIM: 3200 17 | NUM_HEADS: 25 18 | MLP_RATIO: 4 19 | INIT_VALUES: 0.1 20 | QK_NORMALIZATION: True 21 | DEPTH: 45 22 | USE_FLASH_ATTN: True 23 | PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" 24 | CLS_TARGET: 'cls_patch_concat' 25 | TRAIN: 26 | EMA: 27 | ENABLE: True 28 | DECAY: 0.998 29 | EPOCHS: 10 30 | WARMUP_EPOCHS: 1 31 | WEIGHT_DECAY: 0.0 32 | BASE_LR: 0.1 # 512 33 | WARMUP_LR: .0 34 | MIN_LR: .0 35 | LR_LAYER_DECAY: false 36 | OPTIMIZER: 37 | NAME: 'sgd' 38 | -------------------------------------------------------------------------------- /classification/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_loader, build_loader2 8 | -------------------------------------------------------------------------------- /classification/meta_data/train.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/classification/meta_data/train.txt.zip -------------------------------------------------------------------------------- /classification/meta_data/val.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/classification/meta_data/val.txt.zip -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 
| # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_model 8 | -------------------------------------------------------------------------------- /classification/train_in1k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | 13 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 14 | srun -p ${PARTITION} \ 15 | --job-name=${JOB_NAME} \ 16 | --gres=gpu:${GPUS_PER_NODE} \ 17 | --ntasks=${GPUS} \ 18 | --ntasks-per-node=${GPUS_PER_NODE} \ 19 | --cpus-per-task=${CPUS_PER_TASK} \ 20 | --kill-on-bad-exit=1 \ 21 | --quotatype=reserved \ 22 | ${SRUN_ARGS} \ 23 | python -u main.py \ 24 | --cfg ${CONFIG} \ 25 | --accumulation-steps 1 \ 26 | --local-rank 0 \ 27 | --output work_dirs ${@:4} 28 | -------------------------------------------------------------------------------- /clip_benchmark/AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | * `Mehdi Cherti `_ 6 | * `Romain Beaumont `_ 7 | -------------------------------------------------------------------------------- /clip_benchmark/HISTORY.rst: -------------------------------------------------------------------------------- 1 | ## History 2 | 3 | ### 1.4.0 4 | 5 | * Fix silent webdataset error-handling 6 | * Added support for wds/voc2007_multilabel 7 | * default to float32 8 | * add mscoco generative benchmark 9 | 10 | ### 1.3.0 11 | 12 | * update flickr8k results, solve issue #48, thanks to @orchidmajumder 13 | * Evaluate multiple models/datasets/languages using the CLI directly 14 
| * Support Japanese CLIP by rinna 15 | * Add arabic imagenet 16 | * updating CuPL prompts with more generated sentences + ensembled with openAI prompts 17 | * put model in eval mode before evaluation 18 | * Webdataset updates 19 | * Make verbose the default 20 | 21 | ### 1.2.0 22 | 23 | * Added support for loading webdatasets 24 | 25 | ### 1.1.0 26 | 27 | * Added better support for multilingual eval 28 | * Added better support for linear probing 29 | * Added support for CuPL prompts 30 | 31 | ### 1.0.1 32 | 33 | * pypi description as markdown 34 | 35 | ### 1.0.0 36 | 37 | * Actual first release on PyPI. 38 | 39 | 40 | ### 0.1.0 41 | 42 | * First release on PyPI. 43 | -------------------------------------------------------------------------------- /clip_benchmark/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022, Mehdi Cherti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /clip_benchmark/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include * *.json 12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 13 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/benchmark.png -------------------------------------------------------------------------------- /clip_benchmark/benchmark/dataset_type.csv: -------------------------------------------------------------------------------- 1 | dataset,type 2 | imagenet1k,natural 3 | imagenetv2,natural 4 | imagenet-r,natural 5 | imagenet_sketch,specialized 6 | objectnet,natural 7 | imagenet-a,natural 8 | imagenet-o,natural 9 | vtab/cifar10,natural 10 | vtab/cifar100,natural 11 | mnist,specialized 12 | vtab/flowers,natural 13 | cars,natural 14 | vtab/svhn,natural 15 | fer2013,natural 16 | renderedsst2,specialized 17 | vtab/pets,natural 18 | vtab/caltech101,natural 19 | voc2007_multilabel,natural 20 | voc2007,natural 21 | sun397,natural 22 | fgvc_aircraft,natural 23 | country211,natural 24 | vtab/dtd,natural 25 | gtsrb,natural 26 | stl10,natural 27 | vtab/diabetic_retinopathy,specialized 
28 | vtab/eurosat,specialized 29 | vtab/resisc45,specialized 30 | vtab/pcam,specialized 31 | vtab/clevr_count_all,structured 32 | vtab/clevr_closest_object_distance,structured 33 | vtab/dsprites_label_orientation,structured 34 | vtab/dsprites_label_x_position,structured 35 | vtab/dsprites_label_y_position,structured 36 | vtab/smallnorb_label_elevation,structured 37 | vtab/smallnorb_label_azimuth,structured 38 | vtab/dmlab,structured 39 | vtab/kitti_closest_vehicle_distance,structured 40 | mscoco_captions,retrieval 41 | flickr8k,retrieval 42 | flickr30k,retrieval 43 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/datasets.txt: -------------------------------------------------------------------------------- 1 | mscoco_captions 2 | flickr8k 3 | flickr30k 4 | imagenet1k 5 | imagenetv2 6 | imagenet_sketch 7 | imagenet-a 8 | imagenet-r 9 | objectnet 10 | fer2013 11 | voc2007 12 | voc2007_multilabel 13 | sun397 14 | cars 15 | fgvc_aircraft 16 | mnist 17 | stl10 18 | gtsrb 19 | country211 20 | renderedsst2 21 | vtab/caltech101 22 | vtab/cifar10 23 | vtab/cifar100 24 | vtab/clevr_count_all 25 | vtab/clevr_closest_object_distance 26 | vtab/diabetic_retinopathy 27 | vtab/dmlab 28 | vtab/dsprites_label_orientation 29 | vtab/dsprites_label_x_position 30 | vtab/dtd 31 | vtab/eurosat 32 | vtab/kitti_closest_vehicle_distance 33 | vtab/flowers 34 | vtab/pets 35 | vtab/pcam 36 | vtab/resisc45 37 | vtab/smallnorb_label_azimuth 38 | vtab/smallnorb_label_elevation 39 | vtab/svhn 40 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/datasets_multilingual.txt: -------------------------------------------------------------------------------- 1 | multilingual_mscoco_captions,es 2 | multilingual_mscoco_captions,it 3 | multilingual_mscoco_captions,ko 4 | multilingual_mscoco_captions,pl 5 | multilingual_mscoco_captions,ru 6 | multilingual_mscoco_captions,tr 7 | 
multilingual_mscoco_captions,zh 8 | multilingual_mscoco_captions,en 9 | imagenet1k,zh 10 | imagenet1k,it 11 | imagenet1k,jp 12 | imagenet1k,en 13 | imagenet1k,ar 14 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/models.txt: -------------------------------------------------------------------------------- 1 | ViT-B-32,openai 2 | ViT-B-16,openai 3 | ViT-L-14,openai 4 | ViT-L-14-336,openai 5 | ViT-B-32-quickgelu,laion400m_e32 6 | ViT-B-32,laion2b_e16 7 | ViT-B-32,laion2b_s34b_b79k 8 | ViT-B-16,laion400m_e32 9 | ViT-B-16-plus-240,laion400m_e32 10 | ViT-L-14,laion400m_e32 11 | ViT-L-14,laion2b_s32b_b82k 12 | ViT-H-14,laion2b_s32b_b79k 13 | ViT-g-14,laion2b_s12b_b42k 14 | -------------------------------------------------------------------------------- /clip_benchmark/benchmark/webdatasets.txt: -------------------------------------------------------------------------------- 1 | wds/mscoco_captions 2 | wds/flickr8k 3 | wds/flickr30k 4 | wds/imagenet1k 5 | wds/imagenetv2 6 | wds/imagenet_sketch 7 | wds/imagenet-a 8 | wds/imagenet-r 9 | wds/imagenet-o 10 | wds/objectnet 11 | wds/fer2013 12 | wds/voc2007 13 | wds/voc2007_multilabel 14 | wds/sun397 15 | wds/cars 16 | wds/fgvc_aircraft 17 | wds/mnist 18 | wds/stl10 19 | wds/gtsrb 20 | wds/country211 21 | wds/renderedsst2 22 | wds/vtab/caltech101 23 | wds/vtab/cifar10 24 | wds/vtab/cifar100 25 | wds/vtab/clevr_count_all 26 | wds/vtab/clevr_closest_object_distance 27 | wds/vtab/diabetic_retinopathy 28 | wds/vtab/dmlab 29 | wds/vtab/dsprites_label_orientation 30 | wds/vtab/dsprites_label_x_position 31 | wds/vtab/dsprites_label_y_position 32 | wds/vtab/dtd 33 | wds/vtab/eurosat 34 | wds/vtab/kitti_closest_vehicle_distance 35 | wds/vtab/flowers 36 | wds/vtab/pets 37 | wds/vtab/pcam 38 | wds/vtab/resisc45 39 | wds/vtab/smallnorb_label_azimuth 40 | wds/vtab/smallnorb_label_elevation 41 | wds/vtab/svhn 42 | 
-------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for CLIP Benchmark.""" 2 | 3 | __author__ = """Mehdi Cherti""" 4 | __email__ = 'mehdicherti@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/datasets/__init__.py -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/datasets/tools.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def process_single_caption(caption, max_words=50): 5 | caption = re.sub(r"([.!\"()*#:;~])", ' ', caption.lower()) 6 | caption = re.sub(r'\s{2,}', ' ', caption) 7 | caption = caption.rstrip('\n') 8 | caption = caption.strip(' ') 9 | 10 | # truncate caption 11 | caption_words = caption.split(' ') 12 | if len(caption_words) > max_words: 13 | caption = ' '.join(caption_words[: max_words]) 14 | return caption 15 | 16 | 17 | def pre_caption(caption, max_words=50): 18 | if type(caption) == str: 19 | caption = process_single_caption(caption, max_words) 20 | else: 21 | caption = [process_single_caption(c, max_words) for c in caption] 22 | return caption 23 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/metrics/__init__.py 
-------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/model_collection.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | 3 | 4 | def get_model_collection_from_file(path): 5 | return [l.strip().split(',') for l in open(path).readlines()] 6 | 7 | 8 | model_collection = { 9 | 'openclip_base': [ 10 | ('ViT-B-32-quickgelu', 'laion400m_e32'), 11 | ('ViT-B-32', 'laion2b_e16'), 12 | ('ViT-B-32', 'laion2b_s34b_b79k'), 13 | ('ViT-B-16', 'laion400m_e32'), 14 | ('ViT-B-16-plus-240', 'laion400m_e32'), 15 | ('ViT-L-14', 'laion400m_e32'), 16 | ('ViT-L-14', 'laion2b_s32b_b82k'), 17 | ('ViT-H-14', 'laion2b_s32b_b79k'), 18 | ('ViT-g-14', 'laion2b_s12b_b42k'), 19 | ], 20 | 'openclip_multilingual': [ 21 | ('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), 22 | ('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), 23 | ], 24 | 'openclip_all': open_clip.list_pretrained(), 25 | 'openai': [ 26 | ('ViT-B-32', 'openai'), 27 | ('ViT-B-16', 'openai'), 28 | ('ViT-L-14', 'openai'), 29 | ('ViT-L-14-336', 'openai'), 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | 5 | from .internvl import load_internvl 6 | from .japanese_clip import load_japanese_clip 7 | from .open_clip import load_open_clip 8 | 9 | # loading function must return (model, transform, tokenizer) 10 | TYPE2FUNC = { 11 | 'open_clip': load_open_clip, 12 | 'ja_clip': load_japanese_clip, 13 | 'internvl': load_internvl, 14 | } 15 | MODEL_TYPES = list(TYPE2FUNC.keys()) 16 | 17 | 18 | def load_clip( 19 | model_type: str, 20 | model_name: str, 21 | pretrained: str, 22 | cache_dir: str, 23 | device: Union[str, torch.device] = 'cuda' 24 | ): 25 | assert model_type in MODEL_TYPES, 
f'model_type={model_type} is invalid!' 26 | load_func = TYPE2FUNC[model_type] 27 | return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device) 28 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .internvl_c_pytorch import load_internvl_c_pytorch 8 | from .internvl_huggingface import (load_internvl_c_huggingface, 9 | load_internvl_g_huggingface) 10 | 11 | 12 | def load_internvl(model_name, pretrained, cache_dir, device): 13 | if model_name == 'internvl_c_classification': 14 | return load_internvl_c_pytorch(pretrained, device, 'classification') 15 | elif model_name == 'internvl_c_retrieval': 16 | return load_internvl_c_pytorch(pretrained, device, 'retrieval') 17 | elif model_name == 'internvl_c_classification_hf': 18 | return load_internvl_c_huggingface(pretrained, device, 'classification') 19 | elif model_name == 'internvl_c_retrieval_hf': 20 | return load_internvl_c_huggingface(pretrained, device, 'retrieval') 21 | elif model_name == 'internvl_g_classification_hf': 22 | return load_internvl_g_huggingface(pretrained, device, 'classification') 23 | elif model_name == 'internvl_g_retrieval_hf': 24 | return load_internvl_g_huggingface(pretrained, device, 'retrieval') 25 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "LlamaForCausalLM" 4 | ], 5 | "bos_token_id": 1, 6 | 
"eos_token_id": 2, 7 | "hidden_act": "silu", 8 | "hidden_size": 4096, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 11008, 11 | "max_position_embeddings": 2048, 12 | "max_sequence_length": 2048, 13 | "model_type": "llama", 14 | "num_attention_heads": 32, 15 | "num_hidden_layers": 32, 16 | "pad_token_id": 0, 17 | "rms_norm_eps": 1e-06, 18 | "tie_word_embeddings": false, 19 | "torch_dtype": "float16", 20 | "transformers_version": "4.28.0.dev0", 21 | "use_cache": true, 22 | "vocab_size": 49954 23 | } 24 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "bos_token_id": 1, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.28.0.dev0" 7 | } 8 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "", 3 | "eos_token": "", 4 | "pad_token": "[PAD]", 5 | "unk_token": "" 6 | } 7 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer.model -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/internvl_c_pytorch/chinese_alpaca_lora_7b/tokenizer_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": true, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "clean_up_tokenization_spaces": false, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "model_max_length": 1000000000000000019884624838656, 22 | "pad_token": null, 23 | "sp_model_kwargs": {}, 24 | "special_tokens_map_file": "chinese_alpaca_lora_7b/special_tokens_map.json", 25 | "tokenizer_class": "LlamaTokenizer", 26 | "unk_token": { 27 | "__type": "AddedToken", 28 | "content": "", 29 | "lstrip": false, 30 | "normalized": true, 31 | "rstrip": false, 32 | "single_word": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /clip_benchmark/clip_benchmark/models/open_clip.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | 3 | 4 | def load_open_clip(model_name: str = 'ViT-B-32-quickgelu', pretrained: str = 'laion400m_e32', cache_dir: str = None, 5 | device='cpu'): 6 | model, _, transform = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, cache_dir=cache_dir) 7 | model = model.to(device) 8 | tokenizer = open_clip.get_tokenizer(model_name) 9 | return model, transform, tokenizer 10 | -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/PROBES.md: -------------------------------------------------------------------------------- 1 | Steps to run. 2 | 3 | 1. Navigate to `CLIP_benchmark`. 4 | 2. Run `export PYTHONPATH=$PWD`. 5 | 3. (Optional) To re-run the experiments, run `python probe_benchmark/scaling_experiments.py`. 
You'll have to change line 6 | 51 to point to your data. 7 | 4. (Optional) To generate the results, run `python probe_benchmark/build_df_scaling_experiments.py`. 8 | 5. (Optional) VTAB requires post-processing to average. Run `python probe_benchmark/process_vtab.py`. 9 | 6. Generate plots with `python probe_benchmark/scaling_plot.py`. 10 | 7. Generate table with `python probe_benchmark/generate_table.py`. 11 | -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/gmacs_vs_perf_retrieval.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/gmacs_vs_perf_retrieval.pdf -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/imagenet_cifar_lp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/imagenet_cifar_lp.pdf -------------------------------------------------------------------------------- /clip_benchmark/probe_benchmark/imagenet_cifar_lp_vtab.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/clip_benchmark/probe_benchmark/imagenet_cifar_lp_vtab.pdf -------------------------------------------------------------------------------- /clip_benchmark/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /clip_benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | open_clip_torch>=0.2.1 2 | opencv-python 3 | peft>=0.6.2
4 | protobuf==3.20.3 5 | pycocoevalcap 6 | pyyaml 7 | scikit-learn>=1.0,<2 8 | scikit-learn 9 | scipy 10 | task_adaptation 11 | tensorflow==2.11.0 12 | termcolor 13 | tqdm>=2 14 | transformers>=4.32.0 15 | webdataset>=0.2.31 16 | yacs 17 | -------------------------------------------------------------------------------- /clip_benchmark/setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:clip_benchmark/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | -------------------------------------------------------------------------------- /clip_benchmark/tests/test_clip_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Tests for `clip_benchmark` package.""" 4 | 5 | import os 6 | 7 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 8 | from clip_benchmark.cli import run 9 | 10 | 11 | class base_args: 12 | dataset = 'dummy' 13 | split = 'test' 14 | model = 'ViT-B-32-quickgelu' 15 | pretrained = 'laion400m_e32' 16 | task = 'zeroshot_classification' 17 | amp = False 18 | num_workers = 4 19 | batch_size = 64 20 | dataset_root = 'root' 21 | output = 'result.json' 22 | verbose = True 23 | root = 'root' 24 | annotation_file = '' 25 | seed = 0 26 | skip_load = False 27 | language = 'en' 28 | model_cache_dir = None 29 | cupl = False 30 | save_clf = None 31 | load_clfs = [] 32 | model_type = 'open_clip' 33 | wds_cache_dir = None 34 | which = 'eval' 35 | skip_existing = False 36 | 37 | 38 | def test_base(): 39 | run(base_args) 40 | -------------------------------------------------------------------------------- 
/clip_benchmark/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, flake8 3 | 4 | [travis] 5 | python = 6 | 3.8: py38 7 | 3.7: py37 8 | 3.6: py36 9 | 10 | [testenv:flake8] 11 | basepython = python 12 | deps = flake8 13 | commands = flake8 clip_benchmark tests 14 | 15 | [testenv] 16 | setenv = 17 | PYTHONPATH = {toxinidir} 18 | 19 | commands = python setup.py test 20 | -------------------------------------------------------------------------------- /internvl_chat/eval/mmmu_pro/prompts.yaml: -------------------------------------------------------------------------------- 1 | cot: 2 | vision: "Write out the multiple-choice question in the image and then solve it. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options. Think step by step before answering." 3 | standard: "Answer the preceding multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options. Think step by step before answering." 4 | direct: 5 | vision: "Answer with the option letter from the given choices directly. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options." 6 | standard: "Answer with the option letter from the given choices directly." 
7 | -------------------------------------------------------------------------------- /internvl_chat/eval/vqa/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--src', type=str) 6 | parser.add_argument('--dst', type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | data = json.load(open(args.src)) 11 | for res in data: 12 | question_id = res['questionId'] 13 | answer = res['answer'].rstrip('.').lower() 14 | all_answers.append({'questionId': question_id, 'prediction': answer}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | -------------------------------------------------------------------------------- /internvl_chat/examples/image1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image1.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image2.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image3.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image4.jpg -------------------------------------------------------------------------------- /internvl_chat/examples/image5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat/examples/image5.jpg -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .configuration_intern_vit import InternVisionConfig 8 | from .configuration_internvl_chat import InternVLChatConfig 9 | from .modeling_intern_vit import InternVisionModel 10 | from .modeling_internvl_chat import InternVLChatModel 11 | 12 | __all__ = ['InternVisionConfig', 'InternVisionModel', 13 | 'InternVLChatConfig', 'InternVLChatModel'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/internvit_liger_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | def apply_liger_kernel_to_internvit() -> None: 8 | from internvl.model.internvl_chat import modeling_intern_vit 9 | from liger_kernel.transformers.layer_norm import LigerLayerNorm 10 | from liger_kernel.transformers.rms_norm import LigerRMSNorm 11 | 
modeling_intern_vit.NORM2FN['rms_norm'] = LigerRMSNorm 12 | modeling_intern_vit.NORM2FN['layer_norm'] = LigerLayerNorm 13 | print('Liger kernel applied to InternViT') 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import transformers 8 | 9 | 10 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 11 | try: 12 | from functools import partial 13 | 14 | from apex.normalization import FusedRMSNorm 15 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 16 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 17 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 18 | except ImportError: 19 | # using the normal LlamaRMSNorm 20 | pass 21 | except Exception: 22 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 23 | pass 24 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 
OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | IMG_CONTEXT_TOKEN = '' 8 | IMG_START_TOKEN = '' 9 | IMG_END_TOKEN = '' 10 | QUAD_START_TOKEN = '' 11 | QUAD_END_TOKEN = '' 12 | REF_START_TOKEN = '' 13 | REF_END_TOKEN = '' 14 | BOX_START_TOKEN = '' 15 | BOX_END_TOKEN = '' 16 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 17 | IMAGENET_STD = (0.229, 0.224, 0.225) 18 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 19 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 20 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 21 | SIGLIP_STD = (0.5, 0.5, 0.5) 22 | -------------------------------------------------------------------------------- /internvl_chat/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "internvl_chat" 7 | version = "2.0.0" 8 | description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch>=2", "torchvision>=0.15", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate", "peft>=0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn>=1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "deepspeed==0.13.5", "einops", "einops-exts", "timm==0.9.12", 23 | ] 24 | 25 | [project.urls] 26 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 27 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 28 | 29 | [tool.setuptools.packages.find] 30 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 31 | 32 | [tool.wheel] 33 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 34 | -------------------------------------------------------------------------------- /internvl_chat/shell/data/coco_caption.json: -------------------------------------------------------------------------------- 1 | { 2 | "coco_karpathy_train_567k": { 3 | "root": "data/coco/", 4 | "annotation": "data/coco/annotations/coco_karpathy_train_567k.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 566747 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl3.0/mpo_data_construction/correctness_build_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 4 | 5 | PROMPT_VERSION="en_v2" 6 | data_dir="outputs_mpo/correctness_mmpr_v1_2_${PROMPT_VERSION}" 7 | save_dir="outputs_mpo/correctness_mmpr_v1_2_${PROMPT_VERSION}_pairs" 8 | 9 | model="OpenGVLab_InternVL3-8B" 
10 | 11 | declare -a max_tiles=( \ 12 | "1" \ 13 | "6" \ 14 | "12" \ 15 | "18" \ 16 | "24" \ 17 | ) 18 | 19 | for ((j=0; j<${#max_tiles[@]}; j++)); do 20 | curr_max_tiles=${max_tiles[j]} 21 | echo "$(date) ${model} ${curr_max_tiles}" 22 | 23 | srun \ 24 | -p Intern5 \ 25 | --gres=gpu:0 \ 26 | python -u tools/reasoning_data_pipeline/mmpr_data_pipeline_correctness_postprocess.py \ 27 | --data-dir "${data_dir}/${model}/max_tiles_${curr_max_tiles}" \ 28 | --save-dir "${save_dir}/${model}" \ 29 | --answer-fix \ 30 | --force \ 31 | --num-pairs-per-key 15 \ 32 | --max-lines 1200000 33 | 34 | done 35 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl3.0/visualprm_data_construction/visualprm_build_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 4 | 5 | PROMPT_VERSION="en_v2" 6 | data_dir="outputs_prm/visualprm_v1_1_${PROMPT_VERSION}_raw" 7 | save_dir="outputs_prm/visualprm_v1_1_${PROMPT_VERSION}_conv" 8 | 9 | model="OpenGVLab_InternVL3-8B" 10 | 11 | declare -a max_tiles=( \ 12 | "1" \ 13 | "6" \ 14 | "12" \ 15 | "18" \ 16 | "24" \ 17 | ) 18 | 19 | for ((j=0; j<${#max_tiles[@]}; j++)); do 20 | curr_max_tiles=${max_tiles[j]} 21 | echo "$(date) ${model} ${curr_max_tiles}" 22 | 23 | srun \ 24 | -p Intern5 \ 25 | --gres=gpu:0 \ 26 | python -u tools/reasoning_data_pipeline/visualprm_data_pipeline_postprocess.py \ 27 | --data-dir "${data_dir}/${model}/max_tiles_${curr_max_tiles}" \ 28 | --save-dir "${save_dir}/${model}" \ 29 | --mc-threshold 0.0 30 | 31 | done 32 | -------------------------------------------------------------------------------- /internvl_chat/tools/convert_to_int8.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModel, AutoTokenizer 3 | 4 | path = 'OpenGVLab/InternVL-Chat-V1-5' 5 | model = 
AutoModel.from_pretrained( 6 | path, 7 | torch_dtype=torch.bfloat16, 8 | low_cpu_mem_usage=True, 9 | trust_remote_code=True, 10 | load_in_8bit=True).eval() 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 13 | 14 | model.save_pretrained('release/InternVL-Chat-V1-5-Int8') 15 | tokenizer.save_pretrained('release/InternVL-Chat-V1-5-Int8') 16 | print('finished') 17 | -------------------------------------------------------------------------------- /internvl_chat/tools/extract_mlp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import torch 5 | from internvl.model.internvl_chat import InternVLChatModel 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('output_path', type=str, default='') 10 | 11 | args = argparse.parse_args() 12 | 13 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 14 | model = model.mlp1.to(torch.bfloat16) 15 | 16 | ckpt = model.state_dict() 17 | output_path = os.path.join(args.output_path, 'mlp_projector.pth') 18 | torch.save(ckpt, output_path) 19 | print('finished') 20 | -------------------------------------------------------------------------------- /internvl_chat/tools/extract_vit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | 6 | argparse = argparse.ArgumentParser() 7 | argparse.add_argument('model_path', type=str, default='') 8 | argparse.add_argument('output_path', type=str, default='') 9 | 10 | args = argparse.parse_args() 11 | 12 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 13 | model = model.vision_model.to(torch.bfloat16) 14 | 15 | model.save_pretrained(args.output_path) 16 | print('finished') 17 | 
-------------------------------------------------------------------------------- /internvl_chat/tools/json2jsonl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | argparse = argparse.ArgumentParser() 5 | argparse.add_argument('path', type=str) 6 | 7 | args = argparse.parse_args() 8 | 9 | assert args.path.endswith('.json') 10 | 11 | data = json.load(open(args.path)) 12 | writer = open(args.path.replace('.json', '.jsonl'), 'w') 13 | for idx, item in enumerate(data): 14 | conversations = item['conversations'] 15 | if conversations[0]['from'] == 'system': 16 | item['conversations'] = item['conversations'][1:] 17 | item['id'] = idx 18 | writer.write(json.dumps(item, ensure_ascii=False) + '\n') 19 | 20 | writer.close() 21 | -------------------------------------------------------------------------------- /internvl_chat/tools/jsonl2jsonl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | argparse = argparse.ArgumentParser() 6 | argparse.add_argument('path', type=str) 7 | 8 | args = argparse.parse_args() 9 | 10 | assert args.path.endswith('.jsonl') 11 | 12 | f = open(args.path) 13 | data = [json.loads(line) for line in f.readlines()] 14 | writer = open(args.path.replace('.jsonl', '_new.jsonl'), 'w') 15 | for idx, item in enumerate(data): 16 | item['id'] = idx 17 | conversations = item['conversations'] 18 | if conversations[0]['from'] == 'system': 19 | item['conversations'] = item['conversations'][1:] 20 | writer.write(json.dumps(item, ensure_ascii=False) + '\n') 21 | 22 | writer.close() 23 | -------------------------------------------------------------------------------- /internvl_chat/tools/merge_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from 
transformers import AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('input_path', type=str, help='Path to the input model') 9 | argparse.add_argument('output_path', type=str, help='Path to the output model') 10 | args = argparse.parse_args() 11 | 12 | print('Loading model...') 13 | model = InternVLChatModel.from_pretrained( 14 | args.input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).eval() 15 | print('Loading tokenizer...') 16 | tokenizer = AutoTokenizer.from_pretrained(args.input_path, trust_remote_code=True) 17 | 18 | if model.config.use_backbone_lora: 19 | model.vision_model.merge_and_unload() 20 | model.vision_model = model.vision_model.model 21 | model.config.use_backbone_lora = 0 22 | if model.config.use_llm_lora: 23 | model.language_model.merge_and_unload() 24 | model.language_model = model.language_model.model 25 | model.config.use_llm_lora = 0 26 | 27 | print('Saving model...') 28 | model.save_pretrained(args.output_path) 29 | print('Saving tokenizer...') 30 | tokenizer.save_pretrained(args.output_path) 31 | print('Done!') 32 | -------------------------------------------------------------------------------- /internvl_chat/tools/replace_llm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from transformers import AutoModel, AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('llm_path', type=str, default='') 10 | 11 | args = argparse.parse_args() 12 | 13 | if args.model_path[-1] == '/': 14 | args.model_path = args.model_path[:-1] 15 | 16 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 17 | 18 | llm = AutoModel.from_pretrained( 19 | args.llm_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 20 | tokenizer = 
AutoTokenizer.from_pretrained( 21 | args.llm_path, trust_remote_code=True) 22 | model.language_model = llm 23 | model.config.llm_config = llm.config 24 | model.to(torch.bfloat16) 25 | 26 | output_path = args.model_path + '_replace_llm' 27 | model.save_pretrained(output_path) 28 | tokenizer.save_pretrained(output_path) 29 | print('finished') 30 | -------------------------------------------------------------------------------- /internvl_chat/tools/resize_pos_embed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatModel 5 | from transformers import AutoTokenizer 6 | 7 | argparse = argparse.ArgumentParser() 8 | argparse.add_argument('model_path', type=str, default='') 9 | argparse.add_argument('output_path', type=str, default='') 10 | argparse.add_argument('force_image_size', type=int, default=448) 11 | 12 | args = argparse.parse_args() 13 | 14 | model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16) 15 | model.vision_model.resize_pos_embeddings(old_size=model.config.vision_config.image_size, 16 | new_size=args.force_image_size, 17 | patch_size=14) 18 | model.config.vision_config.image_size = args.force_image_size 19 | model.config.force_image_size = args.force_image_size 20 | 21 | model.save_pretrained(args.output_path) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 24 | tokenizer.save_pretrained(args.output_path) 25 | print('finished') 26 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | 
"fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": 
[ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b_1e8.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e8, 7 | "reduce_bucket_size": 1e8, 8 | "stage3_prefetch_bucket_size": 1e8, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_34b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | 
"stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_70b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": 
"auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat_llava/images/demo_cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/demo_cli.gif -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_example_cmp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_example_cmp.png -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_logo.png -------------------------------------------------------------------------------- /internvl_chat_llava/images/llava_v1_5_radar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/images/llava_v1_5_radar.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/constants.py: 
-------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | 
{"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- 
/internvl_chat_llava/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 3 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return 
F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") \ 9 | or "intern" in vision_tower.lower(): 10 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 11 | 12 | raise ValueError(f'Unknown vision tower: {vision_tower}') 13 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/__init__.py -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img1.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img2.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img3.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img3.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img4.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img5.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/img6.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_chat_llava/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /internvl_chat_llava/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | # from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | # replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train(attn_implementation="flash_attention_2") 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/llava/train/train_mem_custom.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. 
Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | # from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | # replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train_custom import train 11 | from llava.train.dist_utils import init_dist 12 | 13 | if __name__ == "__main__": 14 | try: 15 | init_dist(launcher='slurm', backend='nccl') 16 | print("slurm environment detected") 17 | except: 18 | init_dist(launcher='pytorch', backend='nccl') 19 | train(attn_implementation="flash_attention_2") 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.1.1" 8 | description = "Towards GPT-4 like large language and visual assistant." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch>=2", "torchvision>=0.15", 17 | "transformers>=4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate", "peft>=0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn>=1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "deepspeed==0.13.5", "einops", "einops-exts", "timm==0.9.12", 23 | ] 24 | 25 | [project.urls] 26 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 27 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 28 | 29 | [tool.setuptools.packages.find] 30 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 31 | 32 | [tool.wheel] 33 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 34 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_mmbench_for_submission.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/merge_lora_weights.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 
7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmbench.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mme.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file 
./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 
| --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | 
output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 1, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | 
"loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 12 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 13 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 18 | 19 | python llava/eval/eval_gpt_review_bench.py \ 20 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 21 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 22 | --rule llava/eval/table/rule.json \ 23 | --answer-list \ 24 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 25 | playground/data/eval/llava-bench-in-the-wild/answers/${MODEL_NAME}.jsonl \ 26 | --output \ 27 | 
playground/data/eval/llava-bench-in-the-wild/reviews/${MODEL_NAME}.jsonl 28 | 29 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/${MODEL_NAME}.jsonl 30 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | SPLIT="mmbench_dev_20230712" # "mmbench_dev_20230712" or "mmbench_test_en_20231003" 10 | 11 | python -m llava.eval.model_vqa_mmbench \ 12 | --model-path ${OUTPUT_DIR} \ 13 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 14 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${MODEL_NAME}.jsonl \ 15 | --single-pred-prompt \ 16 | --temperature 0 \ 17 | --conv-mode vicuna_v1 18 | 19 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 20 | 21 | python scripts/convert_mmbench_for_submission.py \ 22 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 23 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 24 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 25 | --experiment ${MODEL_NAME} 26 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 12 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 13 | --answers-file 
./playground/data/eval/MME/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | cd ./playground/data/eval/MME 18 | 19 | python convert_answer_to_mme.py --experiment ${MODEL_NAME} 20 | 21 | cd eval_tool 22 | 23 | python calculation.py --results_dir answers/${MODEL_NAME} 24 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 12 | --image-folder ./playground/data/eval/mm-vet/images \ 13 | --answers-file ./playground/data/eval/mm-vet/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p ./playground/data/eval/mm-vet/results 18 | 19 | python scripts/convert_mmvet_for_eval.py \ 20 | --src ./playground/data/eval/mm-vet/answers/${MODEL_NAME}.jsonl \ 21 | --dst ./playground/data/eval/mm-vet/results/${MODEL_NAME}.json 22 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 12 | --image-folder ./playground/data/eval/pope/val2014 \ 13 | --answers-file ./playground/data/eval/pope/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python 
llava/eval/eval_pope.py \ 18 | --annotation-dir ./playground/data/eval/pope/coco \ 19 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 20 | --result-file ./playground/data/eval/pope/answers/${MODEL_NAME}.jsonl 21 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_science \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 12 | --image-folder ./playground/data/eval/scienceqa/images/test \ 13 | --answers-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}.jsonl \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode vicuna_v1 17 | 18 | python llava/eval/eval_science_qa.py \ 19 | --base-dir ./playground/data/eval/scienceqa \ 20 | --result-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}.jsonl \ 21 | --output-file ./playground/data/eval/scienceqa/answers/${MODEL_NAME}_output.jsonl \ 22 | --output-result ./playground/data/eval/scienceqa/answers/${MODEL_NAME}_result.json 23 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 12 | --image-folder ./playground/data/eval/textvqa/train_images \ 13 | --answers-file 
./playground/data/eval/textvqa/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python -m llava.eval.eval_textvqa \ 18 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 19 | --result-file ./playground/data/eval/textvqa/answers/${MODEL_NAME}.jsonl 20 | -------------------------------------------------------------------------------- /internvl_chat_llava/scripts_internvl/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 5 | 6 | OUTPUT_DIR=$1 7 | MODEL_NAME=$(basename ${OUTPUT_DIR}) 8 | 9 | python -m llava.eval.model_vqa_loader \ 10 | --model-path ${OUTPUT_DIR} \ 11 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 12 | --image-folder ./playground/data/eval/vizwiz/test \ 13 | --answers-file ./playground/data/eval/vizwiz/answers/${MODEL_NAME}.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python scripts/convert_vizwiz_for_submission.py \ 18 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 19 | --result-file ./playground/data/eval/vizwiz/answers/${MODEL_NAME}.jsonl \ 20 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${MODEL_NAME}.json 21 | -------------------------------------------------------------------------------- /internvl_g/evaluate.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | CHECKPOINT=${1} 4 | DATASET=${2} 5 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 6 | echo "CHECKPOINT: ${CHECKPOINT}" 7 | 8 | if [ ${DATASET} == "caption" ]; then 9 | torchrun \ 10 | --nnodes=1 \ 11 | --node_rank=0 \ 12 | --master_addr=127.0.0.1 \ 13 | --nproc_per_node=8 \ 14 | --master_port=63667 \ 15 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} 16 | fi 17 | 18 | if [ ${DATASET} == "caption-coco" ]; then 19 | torchrun \ 20 | --nnodes=1 \ 21 | 
--node_rank=0 \ 22 | --master_addr=127.0.0.1 \ 23 | --nproc_per_node=8 \ 24 | --master_port=63667 \ 25 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets coco 26 | fi 27 | 28 | if [ ${DATASET} == "caption-flickr30k" ]; then 29 | torchrun \ 30 | --nnodes=1 \ 31 | --node_rank=0 \ 32 | --master_addr=127.0.0.1 \ 33 | --nproc_per_node=8 \ 34 | --master_port=63667 \ 35 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets flickr30k 36 | fi 37 | 38 | if [ ${DATASET} == "caption-nocaps" ]; then 39 | torchrun \ 40 | --nnodes=1 \ 41 | --node_rank=0 \ 42 | --master_addr=127.0.0.1 \ 43 | --nproc_per_node=8 \ 44 | --master_port=63667 \ 45 | eval/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets nocaps 46 | fi 47 | -------------------------------------------------------------------------------- /internvl_g/internvl/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_g/internvl/model/__init__.py -------------------------------------------------------------------------------- /internvl_g/internvl/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/internvl_g/internvl/train/__init__.py -------------------------------------------------------------------------------- /internvl_g/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 
1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "scheduler": { 24 | "type": "WarmupDecayLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto", 29 | "total_num_steps": "auto" 30 | } 31 | }, 32 | "gradient_accumulation_steps": "auto", 33 | "gradient_clipping": "auto", 34 | "steps_per_print": 2000, 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto", 37 | "wall_clock_breakdown": true 38 | } 39 | -------------------------------------------------------------------------------- /internvl_g/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 2e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "scheduler": { 24 | "type": "WarmupDecayLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto", 29 | "total_num_steps": "auto" 30 | } 31 | }, 32 | "gradient_accumulation_steps": "auto", 33 | "gradient_clipping": "auto", 34 | "steps_per_print": 2000, 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto", 37 | "wall_clock_breakdown": true 38 | } 39 | -------------------------------------------------------------------------------- /internvl_g/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | 
"sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "scheduler": { 27 | "type": "WarmupDecayLR", 28 | "params": { 29 | "warmup_min_lr": "auto", 30 | "warmup_max_lr": "auto", 31 | "warmup_num_steps": "auto", 32 | "total_num_steps": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/internvl_chat.txt 2 | -r requirements/streamlit_demo.txt 3 | -r requirements/classification.txt 4 | -r requirements/segmentation.txt 5 | -------------------------------------------------------------------------------- /requirements/classification.txt: -------------------------------------------------------------------------------- 1 | gdown 2 | termcolor 3 | yacs 4 | -------------------------------------------------------------------------------- /requirements/clip_benchmark.txt: -------------------------------------------------------------------------------- 1 | open_clip_torch>=0.2.1 2 | opencv-python 3 | peft>=0.6.2 4 | protobuf 5 | pycocoevalcap 6 | pyyaml 7 | scikit-learn>=1.0,<2 8 | scikit-learn 9 | scipy 10 | task_adaptation 11 | tensorflow==2.11.0 12 | termcolor 13 | tqdm>=2 14 | transformers>=4.32.0 15 | 
webdataset>=0.2.31 16 | yacs 17 | -------------------------------------------------------------------------------- /requirements/internvl_chat.txt: -------------------------------------------------------------------------------- 1 | accelerate<1 2 | bitsandbytes==0.42.0 3 | decord 4 | deepspeed>=0.13.5 5 | einops==0.6.1 6 | einops-exts==0.0.4 7 | huggingface_hub 8 | imageio 9 | numpy==1.26.4 10 | opencv-python 11 | orjson 12 | peft==0.10.0 13 | pycocoevalcap 14 | pyyaml 15 | scikit-learn>=1.2.2 16 | scipy 17 | sentencepiece==0.1.99 18 | shortuuid 19 | tensorboardX 20 | termcolor 21 | timm==0.9.12 22 | tokenizers==0.15.1 23 | torch>=2 24 | torchvision>=0.15 25 | tqdm 26 | transformers==4.37.2 27 | yacs 28 | -------------------------------------------------------------------------------- /requirements/segmentation.txt: -------------------------------------------------------------------------------- 1 | future 2 | importlib_metadata 3 | mmcv-full==1.6.2 4 | mmsegmentation==0.30.0 5 | openmim 6 | ordered-set 7 | platformdirs 8 | tensorboard 9 | tomli 10 | yapf==0.40.1 11 | -------------------------------------------------------------------------------- /requirements/streamlit_demo.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | gradio==3.35.2 3 | gradio_client==0.2.9 4 | httpx==0.24.0 5 | markdown2[all] 6 | pydantic 7 | requests 8 | streamlit 9 | streamlit-image-select 10 | uvicorn 11 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | 
-------------------------------------------------------------------------------- /segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | dict(type='TensorboardLoggerHook') 7 | # dict(type='PaviLoggerHook') # for internal services 8 | ]) 9 | # yapf:enable 10 | dist_params = dict(backend='nccl') 11 | log_level = 'INFO' 12 | load_from = None 13 | resume_from = None 14 | workflow = [('train', 1)] 15 | cudnn_benchmark = True 16 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='CGNet', 7 | norm_cfg=norm_cfg, 8 | in_channels=3, 9 | num_channels=(32, 64, 128), 10 | num_blocks=(3, 21), 11 | dilations=(2, 4), 12 | reductions=(8, 16)), 13 | decode_head=dict( 14 | type='FCNHead', 15 | in_channels=256, 16 | in_index=2, 17 | channels=256, 18 | num_convs=0, 19 | concat_input=False, 20 | dropout_ratio=0, 21 | num_classes=19, 22 | norm_cfg=norm_cfg, 23 | loss_decode=dict( 24 | type='CrossEntropyLoss', 25 | use_sigmoid=False, 26 | loss_weight=1.0, 27 | class_weight=[ 28 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 31 | 10.396974, 10.055647 32 | ])), 33 | # model training and testing settings 34 | train_cfg=dict(sampler=None), 35 | test_cfg=dict(mode='whole')) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/dpt_vit-b16.py: 
-------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | model = dict( 3 | type='EncoderDecoder', 4 | pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa 5 | backbone=dict( 6 | type='VisionTransformer', 7 | img_size=224, 8 | embed_dims=768, 9 | num_layers=12, 10 | num_heads=12, 11 | out_indices=(2, 5, 8, 11), 12 | final_norm=False, 13 | with_cls_token=True, 14 | output_cls_token=True), 15 | decode_head=dict( 16 | type='DPTHead', 17 | in_channels=(768, 768, 768, 768), 18 | channels=256, 19 | embed_dims=768, 20 | post_process_channels=[96, 192, 384, 768], 21 | num_classes=150, 22 | readout_type='project', 23 | input_transform='multiple_select', 24 | in_index=(0, 1, 2, 3), 25 | norm_cfg=norm_cfg, 26 | loss_decode=dict( 27 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 28 | auxiliary_head=None, 29 | # model training and testing settings 30 | train_cfg=dict(), 31 | test_cfg=dict(mode='whole')) # yapf: disable 32 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/erfnet_fcn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='ERFNet', 8 | in_channels=3, 9 | enc_downsample_channels=(16, 64, 128), 10 | enc_stage_non_bottlenecks=(5, 8), 11 | enc_non_bottleneck_dilations=(2, 4, 8, 16), 12 | enc_non_bottleneck_channels=(64, 128), 13 | dec_upsample_channels=(64, 16), 14 | dec_stages_non_bottleneck=(2, 2), 15 | dec_non_bottleneck_channels=(64, 16), 16 | dropout_ratio=0.1, 17 | init_cfg=None), 18 | decode_head=dict( 19 | type='FCNHead', 20 | in_channels=16, 21 | channels=128, 22 | num_convs=1, 23 | concat_input=False, 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | norm_cfg=norm_cfg, 27 | 
align_corners=False, 28 | loss_decode=dict( 29 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 30 | # model training and testing settings 31 | train_cfg=dict(), 32 | test_cfg=dict(mode='whole')) 33 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=128, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='whole')) 37 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | 
channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='MixVisionTransformer', 8 | in_channels=3, 9 | embed_dims=32, 10 | num_stages=4, 11 | num_layers=[2, 2, 2, 2], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[7, 3, 3, 3], 14 | sr_ratios=[8, 4, 2, 1], 15 | out_indices=(0, 1, 2, 3), 16 | mlp_ratio=4, 17 | qkv_bias=True, 18 | drop_rate=0.0, 19 | attn_drop_rate=0.0, 20 | drop_path_rate=0.1), 21 | decode_head=dict( 22 | type='SegformerHead', 23 | in_channels=[32, 64, 160, 256], 24 | in_index=[0, 1, 2, 3], 25 | channels=256, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole')) 35 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/segmenter_vit-b16_mask.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segmenter/vit_base_p16_384_20220308-96dfe169.pth' # noqa 2 | # model settings 3 | backbone_norm_cfg = dict(type='LN', eps=1e-6, requires_grad=True) 4 | model = dict( 
5 | type='EncoderDecoder', 6 | pretrained=checkpoint, 7 | backbone=dict( 8 | type='VisionTransformer', 9 | img_size=(512, 512), 10 | patch_size=16, 11 | in_channels=3, 12 | embed_dims=768, 13 | num_layers=12, 14 | num_heads=12, 15 | drop_path_rate=0.1, 16 | attn_drop_rate=0.0, 17 | drop_rate=0.0, 18 | final_norm=True, 19 | norm_cfg=backbone_norm_cfg, 20 | with_cls_token=True, 21 | interpolate_mode='bicubic', 22 | ), 23 | decode_head=dict( 24 | type='SegmenterMaskTransformerHead', 25 | in_channels=768, 26 | channels=768, 27 | num_classes=150, 28 | num_layers=2, 29 | num_heads=12, 30 | embed_dims=768, 31 | dropout_ratio=0.0, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 34 | ), 35 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(480, 480)), 36 | ) 37 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_10k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=10000) 8 | checkpoint_config = dict(by_epoch=False, interval=1000) 9 | evaluation = dict(interval=1000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | 
checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_320k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=320000) 8 | checkpoint_config = dict(by_epoch=False, interval=32000) 9 | evaluation = dict(interval=32000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, 
interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_5k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=5000) 8 | checkpoint_config = dict(by_epoch=False, interval=1000) 9 | evaluation = dict(interval=1000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29510} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /segmentation/dist_train.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29300} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch --deterministic ${@:3} 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .datasets import * # noqa: F401,F403 7 | from .models import * # noqa: F401,F403 8 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .ade import ADE20KDataset 7 | from .pipelines import * # noqa: F401,F403 8 | 9 | __all__ = ['ADE20KDataset'] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .transform import PadShortSide, SETR_Resize 7 | 8 | __all__ = [ 9 | 
'SETR_Resize', 'PadShortSide', 10 | ] 11 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .backbones import * # noqa: F401,F403 7 | from .decode_heads import * # noqa: F401,F403 8 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .intern_vit_6b import InternViT6B 2 | 3 | __all__ = ['InternViT6B'] 4 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .fcn_head import FCNHead 8 | 9 | __all__ = ['FCNHead'] 10 | -------------------------------------------------------------------------------- /segmentation/release.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | parser = argparse.ArgumentParser(description='Hyperparams') 6 | parser.add_argument('filename', nargs='?', type=str, default=None) 7 | 8 | args = parser.parse_args() 9 | 10 | model = torch.load(args.filename, map_location=torch.device('cpu')) 11 | model = model['module'] 12 | 13 | # new_model = {} 14 | # for k, v in model.items(): 15 
| # if "backbone.blocks" in k: 16 | # continue 17 | # if "auxiliary_head" in k: 18 | # continue 19 | # if "pos_embed" in k or "patch_embed" in k or "cls_token" in k: 20 | # continue 21 | # try: 22 | # if "bn" in k: 23 | # print("fp32:", k) 24 | # new_model[k] = v 25 | # else: 26 | # new_model[k] = v 27 | # except: 28 | # new_model[k] = v 29 | # print(new_model.keys()) 30 | 31 | # new_dict = {'state_dict': new_state_dict} 32 | torch.save(model, args.filename.replace('.pt', '_release.pt')) 33 | -------------------------------------------------------------------------------- /segmentation/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /segmentation/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | PY_ARGS=${@:4} 13 | 14 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | 
--cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | ${SRUN_ARGS} \ 23 | python -u train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} 24 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "torch_adam": true, 8 | "lr": 0.00004 9 | } 10 | }, 11 | "fp16": { 12 | "enabled": true, 13 | "auto_cast": true 14 | }, 15 | "steps_per_print": 50 16 | } 17 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_amp.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "amp": { 11 | "enabled": true, 12 | "opt_level": "O1" 13 | }, 14 | "log": { 15 | "steps_per_print": 50 16 | }, 17 | "wall_clock_breakdown": false 18 | } 19 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 1, 15 | "reduce_bucket_size": 5e8, 16 | "overlap_comm": true 17 | }, 18 | "log": { 19 | "steps_per_print": 50 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero1_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "fp16": { 11 | "enabled": true, 12 | "auto_cast": false 13 | }, 14 | "zero_optimization": { 15 | "stage": 1, 16 | "allgather_partitions": true, 17 | "allgather_bucket_size": 1e9, 18 | "overlap_comm": true, 19 | "reduce_scatter": false, 20 | "reduce_bucket_size": 1e9, 21 | "contiguous_gradients": true 22 | }, 23 | "log": { 24 | "steps_per_print": 50 25 | }, 26 | "wall_clock_breakdown": false 27 | } 28 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero2_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 1, 15 | "reduce_bucket_size": 5e8, 16 | "overlap_comm": true 17 | }, 18 | "log": { 19 | "steps_per_print": 50 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero2_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "AdamW", 6 | "params": { 7 | "lr": 0.00004 8 | } 9 | }, 10 | "fp16": { 11 | "enabled": true 12 | }, 13 | "zero_optimization": { 14 | "stage": 2, 15 | "allgather_partitions": true, 16 | "allgather_bucket_size": 1e9, 17 | "overlap_comm": true, 18 | "reduce_scatter": true, 19 | "reduce_bucket_size": 1e9, 20 | "contiguous_gradients": true 21 | }, 22 | "log": { 23 | "steps_per_print": 50 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /segmentation/zero_configs/adam_zero3_fp16.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "ZeroOneAdam", 6 | "params": { 7 | "lr": 1e-3, 8 | "weight_decay": 0.01, 9 | "bias_correction": false, 10 | "var_freeze_step": 1000, 11 | "var_update_scaler": 16, 12 | "local_step_scaler": 1000, 13 | "local_step_clipper": 16, 14 | "cuda_aware": false, 15 | "comm_backend_name": "nccl" 16 | } 17 | }, 18 | "fp16": { 19 | "enabled": true 20 | }, 21 | "zero_optimization": { 22 | "stage": 3, 23 | "contiguous_gradients": true, 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_prefetch_bucket_size": 1e7, 27 | "stage3_param_persistence_threshold": 1e5, 28 | "reduce_bucket_size": 1e7, 29 | "sub_group_size": 1e9 30 | }, 31 | "steps_per_print": 50 32 | } 33 | -------------------------------------------------------------------------------- /streamlit_demo/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [server] 2 | enableStaticServing = false 3 | enableXsrfProtection = false 4 | enableCORS = false 5 | 6 | [browser] # This ip and port will show in command prompt 7 | # serverAddress = "internvl.opengvlab.com" # Put your Local IP or Domain Name 8 | serverPort = 10003 9 | enableCORS = false 10 | -------------------------------------------------------------------------------- /streamlit_demo/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 8 | WORKER_HEART_BEAT_INTERVAL = 15 9 | 10 | LOGDIR = 'logs/' 11 | 12 | # Model Constants 13 | IGNORE_INDEX = -100 14 | IMAGE_TOKEN_INDEX = 
-200 15 | DEFAULT_IMAGE_TOKEN = '' 16 | DEFAULT_IMAGE_PATCH_TOKEN = '' 17 | DEFAULT_IM_START_TOKEN = '' 18 | DEFAULT_IM_END_TOKEN = '' 19 | IMAGE_PLACEHOLDER = '' 20 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 21 | IMAGENET_STD = (0.229, 0.224, 0.225) 22 | 23 | server_error_msg = '**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**' 24 | -------------------------------------------------------------------------------- /streamlit_demo/gallery/astro_on_unicorn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/astro_on_unicorn.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/cheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/cheetah.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_1.jpeg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_11.jpg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_12.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_12.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_4.png -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_9.jpg -------------------------------------------------------------------------------- /streamlit_demo/gallery/prod_en_17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/gallery/prod_en_17.png -------------------------------------------------------------------------------- /streamlit_demo/static/SimHei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/InternVL/d779db3b0581859753069c4113f69d367eff799b/streamlit_demo/static/SimHei.ttf --------------------------------------------------------------------------------