├── COINCIDE_cluster ├── LICENCE ├── pyproject.toml ├── scripts │ ├── extract_mm_projector.py │ ├── merge_lora_weights.py │ └── tiny_llava │ │ └── eval │ │ ├── llava_score │ │ ├── coincide │ │ │ ├── clustering.sh │ │ │ └── extract_embed.sh │ │ ├── d2_prune │ │ │ ├── d2_prune_selection.sh │ │ │ ├── score_aum.sh │ │ │ └── score_llava_embed.sh │ │ ├── el2n │ │ │ └── score_el2n.sh │ │ └── semdedup │ │ │ ├── score_SemDeDup.sh │ │ │ └── semdedup.sh │ │ └── vision_flan_score │ │ ├── coincide │ │ ├── clustering.sh │ │ └── extract_embed.sh │ │ ├── d2_prune │ │ ├── d2_prune_selection.sh │ │ ├── score_aum.sh │ │ └── score_llava_embed.sh │ │ ├── el2n │ │ └── score_el2n.sh │ │ └── semdedup │ │ ├── score_SemDeDup.sh │ │ └── semdedup.sh └── tinyllava │ ├── __init__.py │ ├── arguments.py │ ├── constants.py │ ├── conversation.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ ├── preprocess │ │ ├── __init__.py │ │ ├── default.py │ │ ├── llama2.py │ │ ├── mpt.py │ │ ├── phi.py │ │ ├── plain.py │ │ ├── stablelm.py │ │ ├── tiny_llama.py │ │ └── v1.py │ └── process.py │ ├── eval │ ├── __init__.py │ └── score │ │ ├── SemDeDup │ │ ├── clustering │ │ │ ├── clustering.py │ │ │ ├── sort_clusters.py │ │ │ └── utils.py │ │ ├── compute_centroids.py │ │ ├── execute_semdedup.py │ │ └── extract_dedup_data.py │ │ ├── coincide │ │ ├── autograd_hacks.py │ │ ├── cluster_transferability.py │ │ ├── cluster_wise_prune.py │ │ ├── clustering.py │ │ ├── compute_centroids.py │ │ └── extract_embed.py │ │ ├── d2_prune │ │ ├── d2_prune.py │ │ ├── score_aum.py │ │ └── score_embed.py │ │ ├── el2n │ │ └── score_el2n.py │ │ ├── merge_values.py │ │ └── sort_values.py │ ├── mm_utils.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── language_model │ │ ├── __init__.py │ │ ├── llava_llama.py │ │ ├── llava_phi.py │ │ ├── llava_qwen.py │ │ ├── llava_stablelm.py │ │ ├── phi │ │ │ ├── cache_utils.py │ │ │ ├── configuration_phi.py │ │ │ ├── convert_phi_weights_to_hf.py │ │ │ ├── modeling_attn_mask_utils.py │ │ │ ├── modeling_phi.py │ │ │ └── utils.py │ │ ├── qwen │ │ │ ├── configuration_qwen.py │ │ │ ├── cpp_kernels.py │ │ │ ├── llava_qwen.py │ │ │ ├── modeling_qwen.py │ │ │ ├── qwen_generation_utils.py │ │ │ └── tokenization_qwen.py │ │ └── stablelm │ │ │ ├── configuration_stablelm_epoch.py │ │ │ ├── modeling_stablelm_epoch.py │ │ │ └── tokenization_arcade100k.py │ ├── llava_arch.py │ ├── model_factory.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py │ └── utils.py ├── COINCIDE_train ├── LICENSE ├── cog.yaml ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_loader_prune.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ └── to_seed_image.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ 
├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py ├── playground │ └── data │ │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ │ ├── coco2014_val_qa_eval │ │ ├── qa90_gpt4_answer.jsonl │ │ └── qa90_questions.jsonl │ │ ├── eval │ │ └── gqa │ │ │ └── data │ │ │ └── eval │ │ │ ├── eval.py │ │ │ └── eval_prune.py │ │ └── prompts │ │ ├── complex_reasoning │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt │ │ ├── conversation │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ └── system_message.txt │ │ └── detail_description │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt ├── predict.py ├── pyproject.toml └── scripts │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── extract_mm_projector.py │ ├── merge_lora_weights.py │ ├── v1_5 │ ├── eval │ │ ├── gqa.sh │ │ ├── llavabench.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── pope.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ ├── textvqa.sh │ │ ├── vizwiz.sh │ │ └── vqav2.sh │ ├── finetune_lora.sh │ ├── finetune_lora_coincide.sh │ ├── finetune_lora_random.sh │ └── vision_flan │ │ ├── finetune_lora_vision_flan.sh │ │ ├── finetune_lora_vision_flan_coincide.sh │ │ └── finetune_lora_vision_flan_random.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json ├── README.md └── assets └── concept_figure.png /COINCIDE_cluster/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "tinyllava" 7 | version = "1.0.0" 8 | description = "A Framework of Small-scale Large Multimodal Models." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.2.0", "torchvision==0.17.0", "tiktoken", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/DLCV-BUAA/TinyLLaVABench" 31 | "Bug Tracker" = "https://github.com/DLCV-BUAA/TinyLLaVABench/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tinyllava.model.builder import load_pretrained_model 3 | from tinyllava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/coincide/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | SAMPLE_RATIO=0.2 5 | TEMP=0.1 6 | 7 | python tinyllava/eval/score/coincide/compute_centroids.py \ 8 | --sim_metric cosine \ 9 | --Kmeans_with_cos_dist \ 10 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa.npy \ 11 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder \ 12 | --ncentroids 10000 \ 13 | --niter 50 \ 14 | --seed 1234 \ 15 | 16 | 17 | python tinyllava/eval/score/coincide/cluster_transferability.py \ 18 | --centroid_embed_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/kmeans_centroids.npy \ 19 | --transferability_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/transfer.npy \ 20 | --k 4 \ 21 | --knn_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/knn \ 22 | 23 | 24 | python tinyllava/eval/score/coincide/cluster_wise_prune.py \ 25 | --embedding_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa.npy \ 26 | --cluster_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/nearest_cent.npy \ 27 | --transfer_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/transfer.npy \ 28 | --fraction $SAMPLE_RATIO \ 29 | --temp $TEMP \ 30 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/coincide/extract_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | # Start time
6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | CKPT="TinyLLaVA-2.0B" 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/coincide/extract_embed.py \ 17 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 18 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 19 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 20 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --is_multimodal \ 24 | --batch_size 8 \ 25 | --layer_list 3 7 11 15 19 \ 26 | --version phi & 27 | done 28 | 29 | wait 30 | 31 | python tinyllava/eval/score/merge_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa \ 33 | 34 | wait 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/d2_prune_selection.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.2 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/d2_prune/d2_prune.py \ 9 | --score-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores.npy \ 10 | --embed-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed.npy \ 11 | --output-indices-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/d2_prune_indices_${SAMPLE_RATIO}.npy \ 12 | --n-neighbors 5 \ 13 | --gamma 0.4 \ 14 | --fraction $SAMPLE_RATIO \ 15 | 16 | # End time 17 | end_time=$(date +%s) 18 | # Calculate execution time 19 | execution_time=$((end_time - start_time)) 20 | minutes=$((execution_time/60)) 21 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/score_aum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_aum.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --batch_size 4 \ 23 | --version phi & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | 
execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/score_llava_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 6 | IFS=',' read -ra GPULIST <<< "$gpu_list" 7 | 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 12 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 13 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed \ 14 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 15 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --is_multimodal \ 19 | --version phi \ 20 | --avg_embed \ 21 | --batch_size 4 & 22 | done 23 | 24 | wait 25 | 26 | python tinyllava/eval/score/merge_values.py \ 27 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed \ 28 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/el2n/score_el2n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/el2n/score_el2n.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --batch_size 4 \ 23 | --version phi & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores \ 30 | 31 | python tinyllava/eval/score/sort_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores.npy \ 33 | --save_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_indices.npy \ 34 | 35 | # End time 36 | end_time=$(date +%s) 37 | 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | 42 | echo "Total execution time: ${minutes} minutes" 43 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/semdedup/score_SemDeDup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 
| for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/llava_embed \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --version phi \ 23 | --batch_size 4 & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/llava_embed \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/semdedup/semdedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.2 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/SemDeDup/compute_centroids.py \ 9 | --sim_metric cosine \ 10 | --keep_hard \ 11 | --Kmeans_with_cos_dist \ 12 | --emb_memory_loc ${BASE_PATH}/LLaVA/playground/data/TinyLLaVA-Instruction/llava_embed.npy \ 13 | --sorted_clusters_file_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_llava_sorted_clusters \ 14 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_llava_save_folder \ 15 | --output_indices_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_indices_${SAMPLE_RATIO}.npy \ 16 | --ncentroids 10000 \ 17 | --niter 50 \ 18 | --seed 1234 \ 19 | --prune_p $SAMPLE_RATIO \ 20 | --eps_list 0.266 0.26665 0.267 \ 21 | 22 | # For different SAMPLE_RATIO, note that eps_list should be manually found 23 | 24 | # End time 25 | end_time=$(date +%s) 26 | # Calculate execution time 27 | execution_time=$((end_time - start_time)) 28 | minutes=$((execution_time/60)) 29 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/coincide/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | SAMPLE_RATIO=0.2 5 | TEMP=0.1 6 | 7 | python tinyllava/eval/score/coincide/compute_centroids.py \ 8 | --sim_metric cosine \ 9 | --Kmeans_with_cos_dist \ 10 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa.npy \ 11 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder \ 12 | --ncentroids 2500 \ 13 | --niter 50 \ 14 | --seed 1234 \ 15 | 16 | 17 | python tinyllava/eval/score/coincide/cluster_transferability.py \ 18 | --centroid_embed_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/kmeans_centroids.npy \ 19 | --transferability_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/transfer_lang.npy \ 20 | --k 4 \ 21 | --knn_path 
${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/knn \ 22 | 23 | 24 | python tinyllava/eval/score/coincide/cluster_wise_prune.py \ 25 | --embedding_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa.npy \ 26 | --cluster_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/nearest_cent.npy \ 27 | --transfer_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/transfer.npy \ 28 | --fraction $SAMPLE_RATIO \ 29 | --temp $TEMP \ -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/coincide/extract_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | CKPT="vison_flan_tinyllava_v100_2.0b" 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/coincide/extract_embed.py \ 17 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 18 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 19 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 20 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --is_multimodal \ 24 | --batch_size 8 \ 25 | --layer_list 3 7 11 15 19 \ 26 | --version phi & 27 | done 28 | 29 | wait 30 | 31 | python tinyllava/eval/score/merge_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa \ 33 | 34 | wait 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | echo "Total execution time: ${minutes} minutes" 42 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/d2_prune_selection.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.167 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/d2_prune/d2_prune.py \ 9 | --score-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores.npy \ 10 | --embed-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed.npy \ 11 | --output-indices-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/d2_prune_indices_${SAMPLE_RATIO}.npy \ 12 | --n-neighbors 5 \ 13 | --gamma 0.4 \ 14 | --fraction $SAMPLE_RATIO \ 15 | 16 | # End time 17 | end_time=$(date +%s) 18 | # Calculate execution time 19 | execution_time=$((end_time - start_time)) 20 | minutes=$((execution_time/60)) 21 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/score_aum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 |
CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_aum.py \ 16 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --is_multimodal \ 23 | --batch_size 4 \ 24 | --version phi & 25 | done 26 | 27 | wait 28 | 29 | python tinyllava/eval/score/merge_values.py \ 30 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores \ 31 | 32 | # End time 33 | end_time=$(date +%s) 34 | 35 | # Calculate execution time 36 | execution_time=$((end_time - start_time)) 37 | minutes=$((execution_time/60)) 38 | 39 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/score_llava_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 6 | IFS=',' read -ra GPULIST <<< "$gpu_list" 7 | 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 13 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 14 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed \ 15 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 16 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --is_multimodal \ 20 | --version phi \ 21 | --avg_embed \ 22 | --batch_size 4 & 23 | done 24 | 25 | wait 26 | 27 | python tinyllava/eval/score/merge_values.py \ 28 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed \ 29 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/el2n/score_el2n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/el2n/score_el2n.py \ 16 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --score_path 
${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --is_multimodal \ 23 | --batch_size 4 \ 24 | --version phi & 25 | done 26 | 27 | wait 28 | 29 | python tinyllava/eval/score/merge_values.py \ 30 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores \ 31 | 32 | python tinyllava/eval/score/sort_values.py \ 33 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores.npy \ 34 | --save_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_indices.npy \ 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | 39 | # Calculate execution time 40 | execution_time=$((end_time - start_time)) 41 | minutes=$((execution_time/60)) 42 | 43 | echo "Total execution time: ${minutes} minutes" 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/semdedup/score_SemDeDup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vision_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --version phi \ 23 | --batch_size 4 & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/semdedup/semdedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.167 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/SemDeDup/compute_centroids.py \ 9 | --sim_metric cosine \ 10 | --keep_hard \ 11 | --Kmeans_with_cos_dist \ 12 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed.npy \ 13 | --sorted_clusters_file_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_llava_sorted_clusters \ 14 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_llava_save_folder \ 15 | --output_indices_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_indices_${SAMPLE_RATIO}.npy \ 16 | --ncentroids 3000 \ 17 | --niter 50 \ 18 | --seed 1234 \ 19 | --prune_p $SAMPLE_RATIO \ 20 | --eps_list 0.305 0.31
0.315 \ 21 | 22 | # For different SAMPLE_RATIO, note that eps_list should be manually found 23 | 24 | # End time 25 | end_time=$(date +%s) 26 | # Calculate execution time 27 | execution_time=$((end_time - start_time)) 28 | minutes=$((execution_time/60)) 29 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/__init__.py: -------------------------------------------------------------------------------- 1 | from tinyllava.model import * 2 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, Optional, Sequence, List 3 | 4 | import transformers 5 | 6 | @dataclass 7 | class ModelArguments: 8 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 9 | version: Optional[str] = field(default="v0") 10 | freeze_backbone: bool = field(default=False) 11 | tune_mm_mlp_adapter: bool = field(default=False) 12 | vision_tower: Optional[str] = field(default=None) 13 | mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer 14 | pretrain_mm_mlp_adapter: Optional[str] = field(default=None) 15 | mm_projector_type: Optional[str] = field(default='linear') 16 | mm_use_im_start_end: bool = field(default=False) 17 | mm_use_im_patch_token: bool = field(default=True) 18 | mm_patch_merge_type: Optional[str] = field(default='flat') 19 | mm_vision_select_feature: Optional[str] = field(default="patch") 20 | resampler_hidden_size: Optional[int] = field(default=768) 21 | num_queries: Optional[int] = field(default=128) 22 | num_resampler_layers: Optional[int] = field(default=3) 23 | tune_vision_tower: bool = field(default=False) 24 | tune_entire_model: bool = field(default=False) 25 | tune_vit_from_layer: Optional[int] = field(default=100) 26 | tune_embed_tokens: Optional[int] = field(default=False) 27 | 28 | 29 | @dataclass 30 | class DataArguments: 31 | data_path: str = field(default=None, 32 | metadata={"help": "Path to the training data."}) 33 | lazy_preprocess: bool = False 34 | is_multimodal: bool = False 35 | image_folder: Optional[str] = field(default=None) 36 | image_aspect_ratio: str = 'square' 37 | 38 | 39 | @dataclass 40 | class TrainingArguments(transformers.TrainingArguments): 41 | cache_dir: Optional[str] = field(default=None) 42 | optim: str = field(default="adamw_torch") 43 | remove_unused_columns: bool = field(default=False) 44 | freeze_mm_mlp_adapter: bool = field(default=False) 45 | mpt_attn_impl: Optional[str] = field(default="triton") 46 | model_max_length: int = field( 47 | default=512, 48 | metadata={ 49 | "help": 50 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 51 | }, 52 | ) 53 | double_quant: bool = field( 54 | default=True, 55 | metadata={"help": "Compress the quantization statistics through double quantization."} 56 | ) 57 | quant_type: str = field( 58 | default="nf4", 59 | metadata={"help": "Quantization data type to use. 
Should be one of `fp4` or `nf4`."} 60 | ) 61 | bits: int = field( 62 | default=16, 63 | metadata={"help": "How many bits to use."} 64 | ) 65 | lora_enable: bool = False 66 | lora_r: int = 64 67 | lora_alpha: int = 16 68 | lora_dropout: float = 0.05 69 | lora_weight_path: str = "" 70 | lora_bias: str = "none" 71 | mm_projector_lr: Optional[float] = None 72 | group_by_modality_length: bool = field(default=False) 73 | vision_tower_lr: Optional[float] = None 74 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/data/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/data/preprocess/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/default.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('default') 15 | def preprocess_default( 16 | sources: Sequence[str], 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conversations = [] 21 | for source in sources: 22 | header = f"{conversation_lib.default_conversation.system}\n\n" 23 | conversation = _add_speaker_and_signal(header, source) 24 | conversations.append(conversation) 25 | 26 | # tokenize conversations 27 | def get_tokenize_len(prompts): 28 | return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] 29 | 30 | if has_image: 31 | input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] 32 | else: 33 | conversations_tokenized = _tokenize_fn(conversations, tokenizer) 34 | input_ids = conversations_tokenized["input_ids"] 35 | 36 | targets = copy.deepcopy(input_ids) 37 | for target, source in zip(targets, sources): 38 | if has_image: 39 | tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) 40 | else: 41 | tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] 42 | speakers =
[sentence["from"] for sentence in source] 43 | _mask_targets(target, tokenized_lens, speakers) 44 | 45 | return dict(input_ids=input_ids, labels=targets) 46 | 47 | 48 | def _tokenize_fn(strings: Sequence[str], 49 | tokenizer: transformers.PreTrainedTokenizer) -> Dict: 50 | """Tokenize a list of strings.""" 51 | tokenized_list = [ 52 | tokenizer( 53 | text, 54 | return_tensors="pt", 55 | padding="longest", 56 | max_length=tokenizer.model_max_length, 57 | truncation=True, 58 | ) for text in strings 59 | ] 60 | input_ids = labels = [ 61 | tokenized.input_ids[0] for tokenized in tokenized_list 62 | ] 63 | input_ids_lens = labels_lens = [ 64 | tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() 65 | for tokenized in tokenized_list 66 | ] 67 | return dict( 68 | input_ids=input_ids, 69 | labels=labels, 70 | input_ids_lens=input_ids_lens, 71 | labels_lens=labels_lens, 72 | ) 73 | 74 | 75 | def _add_speaker_and_signal(header, source, get_conversation=True): 76 | """Add speaker and start/end signal on each round.""" 77 | BEGIN_SIGNAL = "### " 78 | END_SIGNAL = "\n" 79 | conversation = header 80 | for sentence in source: 81 | from_str = sentence["from"] 82 | if from_str.lower() == "human": 83 | from_str = conversation_lib.default_conversation.roles[0] 84 | elif from_str.lower() == "gpt": 85 | from_str = conversation_lib.default_conversation.roles[1] 86 | else: 87 | from_str = 'unknown' 88 | sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + 89 | sentence["value"] + END_SIGNAL) 90 | if get_conversation: 91 | conversation += sentence["value"] 92 | conversation += BEGIN_SIGNAL 93 | return conversation 94 | 95 | 96 | def _mask_targets(target, tokenized_lens, speakers): 97 | # cur_idx = 0 98 | cur_idx = tokenized_lens[0] 99 | tokenized_lens = tokenized_lens[1:] 100 | target[:cur_idx] = IGNORE_INDEX 101 | for tokenized_len, speaker in zip(tokenized_lens, speakers): 102 | if speaker == "human": 103 | target[cur_idx + 2:cur_idx + tokenized_len] = IGNORE_INDEX 104 | cur_idx += tokenized_len 105 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/llama2.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('llama_2') 15 | def preprocess_llama_2( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | 39 | if has_image: 40 | input_ids = torch.stack( 41 | 
[tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 42 | else: 43 | input_ids = tokenizer( 44 | conversations, 45 | return_tensors="pt", 46 | padding="longest", 47 | max_length=tokenizer.model_max_length, 48 | truncation=True, 49 | ).input_ids 50 | 51 | targets = input_ids.clone() 52 | 53 | assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 54 | 55 | # Mask targets 56 | sep = "[/INST] " 57 | for conversation, target in zip(conversations, targets): 58 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 59 | 60 | rounds = conversation.split(conv.sep2) 61 | cur_len = 1 62 | target[:cur_len] = IGNORE_INDEX 63 | for i, rou in enumerate(rounds): 64 | if rou == "": 65 | break 66 | 67 | parts = rou.split(sep) 68 | if len(parts) != 2: 69 | break 70 | parts[0] += sep 71 | 72 | if has_image: 73 | round_len = len(tokenizer_image_token(rou, tokenizer)) 74 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 75 | else: 76 | round_len = len(tokenizer(rou).input_ids) 77 | instruction_len = len(tokenizer(parts[0]).input_ids) - 2 78 | 79 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/mpt.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('mpt') 15 | def preprocess_mpt( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], 39 | dim=0) 40 | targets = input_ids.clone() 41 | assert conv.sep_style == conversation_lib.SeparatorStyle.MPT 42 | 43 | # Mask targets 44 | sep = conv.sep + conv.roles[1] 45 | for conversation, target in zip(conversations, targets): 46 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 47 | 48 | rounds = conversation.split(conv.sep) 49 | re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt 50 | for conv_idx in 
range(3, len(rounds), 2): 51 | re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2])) # user + gpt 52 | cur_len = 0 53 | target[:cur_len] = IGNORE_INDEX 54 | for i, rou in enumerate(re_rounds): 55 | if rou == "": 56 | break 57 | 58 | parts = rou.split(sep) 59 | if len(parts) != 2: 60 | break 61 | parts[0] += sep 62 | round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer)) 63 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) 64 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 65 | 66 | cur_len += round_len 67 | target[cur_len:] = IGNORE_INDEX 68 | 69 | if cur_len < tokenizer.model_max_length: 70 | if cur_len != total_len: 71 | target[:] = IGNORE_INDEX 72 | print( 73 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 74 | f" (ignored)" 75 | ) 76 | 77 | return dict( 78 | input_ids=input_ids, 79 | labels=targets, 80 | ) 81 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/phi.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('phi') 15 | def preprocess_phi( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # print('00000000000', sources) 24 | # Apply prompt templates 25 | conversations = [] 26 | 27 | for i, source in enumerate(sources): 28 | if roles[source[0]["from"]] != conv.roles[0]: 29 | # Skip the first one if it is not from human 30 | source = source[1:] 31 | 32 | conv.messages = [] 33 | for j, sentence in enumerate(source): 34 | role = roles[sentence["from"]] 35 | assert role == conv.roles[j % 2], f"{i}" 36 | conv.append_message(role, sentence["value"]) 37 | conversations.append(conv.get_prompt()) 38 | # Tokenize conversations 39 | if has_image: 40 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 41 | else: 42 | input_ids = tokenizer( 43 | conversations, 44 | return_tensors="pt", 45 | padding="longest", 46 | max_length=tokenizer.model_max_length, 47 | truncation=True, 48 | ).input_ids 49 | 50 | targets = input_ids.clone() 51 | 52 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 53 | # print(tokenizer) 54 | # Mask targets 55 | sep = conv.sep + conv.roles[1] + ": " 56 | for conversation, target in zip(conversations, targets): 57 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) + conversation.count(conv.sep2) 58 | 59 | rounds = conversation.split(conv.sep2) 60 | cur_len = 0 61 | # target[:cur_len] = IGNORE_INDEX 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | if len(parts) != 2: 68 | break 69 | parts[0] += sep 70 | 71 | if has_image: 72 | round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 73 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 74 | else: 
75 | round_len = len(tokenizer(rou).input_ids) + 1 76 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 77 | 78 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 79 | 80 | cur_len += round_len 81 | target[cur_len:] = IGNORE_INDEX 82 | 83 | if cur_len < tokenizer.model_max_length: 84 | if cur_len != total_len: 85 | print( 86 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 87 | f" (ignored)" 88 | ) 89 | print("number of rounds: ", len(rounds) - 1) 90 | print("rounds: ", rounds[:-1]) 91 | print("conversation: ", conversations) 92 | # print(target) 93 | # print(input_ids) 94 | target[:] = IGNORE_INDEX 95 | 96 | return dict( 97 | input_ids=input_ids, 98 | labels=targets, 99 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/plain.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('plain') 15 | def preprocess_plain( 16 | sources: Sequence[str], 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | # add end signal and concatenate together 21 | conversations = [] 22 | for source in sources: 23 | assert len(source) == 2 24 | assert DEFAULT_IMAGE_TOKEN in source[0]['value'] 25 | source[0]['value'] = DEFAULT_IMAGE_TOKEN 26 | conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep 27 | conversations.append(conversation) 28 | # tokenize conversations 29 | input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] 30 | targets = copy.deepcopy(input_ids) 31 | for target, source in zip(targets, sources): 32 | tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) 33 | target[:tokenized_len] = IGNORE_INDEX 34 | 35 | return dict(input_ids=input_ids, labels=targets) 36 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/stablelm.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('stablelm') 15 | def preprocess_stablelm( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # print('00000000000', sources) 24 | # Apply prompt templates 25 | conversations = [] 26 | 27 | for i, source in enumerate(sources): 28 | if roles[source[0]["from"]] != conv.roles[0]: 29 | # Skip the first one if it is not from 
human 30 | source = source[1:] 31 | 32 | conv.messages = [] 33 | for j, sentence in enumerate(source): 34 | role = roles[sentence["from"]] 35 | assert role == conv.roles[j % 2], f"{i}" 36 | conv.append_message(role, sentence["value"]) 37 | conversations.append(conv.get_prompt()) 38 | # Tokenize conversations 39 | if has_image: 40 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 41 | else: 42 | input_ids = tokenizer( 43 | conversations, 44 | return_tensors="pt", 45 | padding="longest", 46 | max_length=tokenizer.model_max_length, 47 | truncation=True, 48 | ).input_ids 49 | 50 | targets = input_ids.clone() 51 | 52 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 53 | # print(tokenizer) 54 | # Mask targets 55 | sep = conv.sep + conv.roles[1] + ": " 56 | for conversation, target in zip(conversations, targets): 57 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 58 | 59 | rounds = conversation.split(conv.sep2) 60 | cur_len = 0 61 | target[:cur_len] = IGNORE_INDEX 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | # print('i rou, parts', i, rou, parts) 68 | if len(parts) != 2: 69 | break 70 | parts[0] += sep 71 | # print('after add sep, parts', parts) 72 | 73 | if has_image: 74 | round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # for eos_token 75 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 76 | else: 77 | round_len = len(tokenizer(rou).input_ids) + 1 # for eos_token 78 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 79 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/tiny_llama.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('tiny_llama') 15 | def preprocess_tiny_llama( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | 39 | if has_image: 40 | input_ids = torch.stack( 41 | [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 42 | else: 43 | input_ids = tokenizer( 44 | conversations, 45 | return_tensors="pt", 46 | padding="longest", 47 | max_length=tokenizer.model_max_length, 48 | truncation=True, 49 | ).input_ids 50 | 51 | targets = input_ids.clone() 52 | 53 | assert conv.sep_style == conversation_lib.SeparatorStyle.TINY_LLAMA 54 | 55 | for conversation, target in zip(conversations, targets): 56 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 57 | 58 | rounds = conversation.split(conv.sep2) 59 | cur_len = 1 60 | target[:cur_len] = IGNORE_INDEX 61 | sep = "<|assistant|>\n" 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | if len(parts) != 2: 68 | # print("current part len: ", len(parts)) 69 | break 70 | parts[0] = parts[0] + sep 71 | 72 | if has_image: 73 | round_len = len(tokenizer_image_token(rou, tokenizer)) 74 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 75 | else: 76 | round_len = len(tokenizer(rou).input_ids) 77 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 78 | 79 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/v1.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | import tokenizers 13 | 14 | from packaging import version 15 | 16 | IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') 17 | 18 | 19 | @register_preprocess('v1') 20 | def preprocess_v1( 21 | sources, 22 | tokenizer: transformers.PreTrainedTokenizer, 23 | has_image: bool = False 24 | ) -> Dict: 25 | conv = conversation_lib.default_conversation.copy() 26 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 27 | 28 | # Apply prompt templates 29 | conversations = [] 30 | for i, source in enumerate(sources): 31 | if roles[source[0]["from"]] != conv.roles[0]: 32 | # Skip the first one if it is not from human 33 | source = source[1:] 34 | 35 | conv.messages = [] 36 | for j, sentence in enumerate(source): 37 | role = roles[sentence["from"]] 38 | assert role == conv.roles[j % 2], f"{i}" 39 | conv.append_message(role, sentence["value"]) 40 | conversations.append(conv.get_prompt()) 41 | 42 | # Tokenize conversations 43 | 44 | if has_image: 45 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 46 | else: 47 | input_ids = tokenizer( 48 | conversations, 49 | return_tensors="pt", 50 | padding="longest", 51 | max_length=tokenizer.model_max_length, 52 | truncation=True, 53 | ).input_ids 54 | 55 | targets = input_ids.clone() 56 | 57 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 58 | 59 | # Mask targets 60 | sep = conv.sep + conv.roles[1] + ": " 61 | for conversation, target in zip(conversations, targets): 62 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 63 | 64 | rounds = conversation.split(conv.sep2) 65 | cur_len = 1 66 | target[:cur_len] = IGNORE_INDEX 67 | for i, rou in enumerate(rounds): 68 | if rou == "": 69 | break 70 | 71 | parts = rou.split(sep) 72 | if len(parts) != 2: 73 | break 74 | parts[0] += sep 75 | 76 | if has_image: 77 | round_len = len(tokenizer_image_token(rou, tokenizer)) 78 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 79 | else: 80 | round_len = len(tokenizer(rou).input_ids) 81 | instruction_len = len(tokenizer(parts[0]).input_ids) - 2 82 | 83 | if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14: 84 | round_len -= 1 85 | instruction_len -= 1 86 | 87 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 88 | 89 | cur_len += round_len 90 | target[cur_len:] = IGNORE_INDEX 91 | 92 | if cur_len < tokenizer.model_max_length: 93 | if cur_len != total_len: 94 | 95 | print( 96 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
97 | f" (ignored)" 98 | ) 99 | # print("number of rounds: ", len(rounds) - 1) 100 | # print("rounds: ", rounds[:-1]) 101 | # print("conversation: ", conversations) 102 | # print(target) 103 | # print(input_ids) 104 | target[:] = IGNORE_INDEX 105 | 106 | return dict( 107 | input_ids=input_ids, 108 | labels=targets, 109 | ) 110 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | from typing import Dict, Optional, Sequence, List 4 | 5 | import transformers 6 | 7 | from tinyllava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 8 | from tinyllava import conversation as conversation_lib 9 | from tinyllava.arguments import * 10 | 11 | PREPROCESS_REGISTRY = {} 12 | 13 | def register_preprocess(name): 14 | def register_preprocess_cls(cls): 15 | if name in PREPROCESS_REGISTRY: 16 | return PREPROCESS_REGISTRY[name] 17 | 18 | PREPROCESS_REGISTRY[name] = cls 19 | return cls 20 | 21 | return register_preprocess_cls 22 | 23 | 24 | def import_modules(modules_dir, namespace): 25 | for file in os.listdir(modules_dir): 26 | path = os.path.join(modules_dir, file) 27 | 28 | if ( 29 | not file.startswith("_") 30 | and not file.startswith(".") 31 | and (file.endswith(".py") or os.path.isdir(path)) 32 | ): 33 | module_name = file[: file.find(".py")] if file.endswith(".py") else file 34 | importlib.import_module(namespace + "." + module_name) 35 | 36 | models_dir = os.path.join(os.path.dirname(__file__), 'preprocess') 37 | import_modules(models_dir, "tinyllava.data.preprocess") 38 | 39 | 40 | def PreprocessSelect(version): 41 | result = PREPROCESS_REGISTRY.get(version, None) 42 | if result is None: 43 | for name in PREPROCESS_REGISTRY.keys(): 44 | if version in name: 45 | result = PREPROCESS_REGISTRY[name] 46 | break 47 | if result is None: 48 | result = PREPROCESS_REGISTRY['default'] 49 | return result 50 | 51 | 52 | 53 | def preprocess_multimodal( 54 | sources: Sequence[str], 55 | data_args: DataArguments 56 | ) -> Dict: 57 | is_multimodal = data_args.is_multimodal 58 | if not is_multimodal: 59 | return sources 60 | 61 | for source in sources: 62 | for sentence in source: 63 | if DEFAULT_IMAGE_TOKEN in sentence['value']: 64 | sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() 65 | sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] 66 | sentence['value'] = sentence['value'].strip() 67 | if "mmtag" in conversation_lib.default_conversation.version: 68 | sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, 69 | '' + DEFAULT_IMAGE_TOKEN + '') 70 | replace_token = DEFAULT_IMAGE_TOKEN 71 | if data_args.mm_use_im_start_end: 72 | replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN 73 | sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) 74 | 75 | return sources 76 | 77 | 78 | def preprocess( 79 | sources: Sequence[str], 80 | tokenizer: transformers.PreTrainedTokenizer, 81 | has_image: bool = False 82 | ) -> Dict: 83 | return PreprocessSelect(conversation_lib.default_conversation.version)(sources, tokenizer, has_image) 84 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/eval/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/clustering/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import logging 9 | import os 10 | import numpy as np 11 | import random 12 | 13 | 14 | def seed_everything(seed: int = 42): 15 | """ 16 | Function to set seed for random number generators for reproducibility. 17 | 18 | Args: 19 | seed: The seed value to use for random number generators. Default is 42. 20 | 21 | Returns: 22 | None 23 | """ 24 | # Set seed values for various random number generators 25 | random.seed(seed) 26 | os.environ["PYTHONHASHSEED"] = str(seed) 27 | np.random.seed(seed) 28 | torch.manual_seed(seed) 29 | torch.cuda.manual_seed(seed) 30 | torch.cuda.manual_seed_all(seed) 31 | 32 | # Ensure deterministic behavior for CUDA algorithms 33 | torch.backends.cudnn.deterministic = True 34 | torch.backends.cudnn.benchmark = False 35 | 36 | 37 | # def get_logger( 38 | # file_name="logger.log", level=logging.INFO, stdout=False 39 | # ) -> logging.Logger: 40 | # """ 41 | # When the level is set to "logging.INFO", the debugging logs will not be saved (lower level). 42 | # """ 43 | # # See https://www.loggly.com/ultimate-guide/python-logging-basics/ for more information about pyhton logging module 44 | # logger = logging.getLogger() # uses the module name 45 | # # set log level 46 | # logger.setLevel(level) 47 | # logger.handlers = [] 48 | # # define file handler and set formatter 49 | # file_handler = logging.FileHandler( 50 | # file_name 51 | # ) # or use logging.handlers.WatchedFileHandler(os.environ.get("LOGFILE", file_name)) 52 | # # define formatter 53 | # formatter = logging.Formatter( 54 | # "%(asctime)s : %(levelname)s : %(name)s : %(message)s" 55 | # ) # or use logging.BASIC_FORMAT 56 | # file_handler.setFormatter(formatter) 57 | 58 | # stdout_handler = ( 59 | # logging.StreamHandler() 60 | # ) # .setLevel(logging.DEBUG) #.setFormatter(CustomFormatter(fmt)) 61 | 62 | # # add handler to logger 63 | # # if not logger.hasHandlers(): 64 | # logger.addHandler(file_handler) 65 | # if stdout: 66 | # logger.addHandler(stdout_handler) 67 | 68 | # return logger 69 | 70 | 71 | # import logging 72 | 73 | 74 | def get_logger( 75 | file_name: str = "logger.log", level: int = logging.INFO, stdout: bool = False 76 | ) -> logging.Logger: 77 | """ 78 | Initialize and configure the logger object to save log entries to a file and optionally print to stdout. 79 | 80 | :param file_name: The name of the log file. 81 | :param level: The logging level to use (default: INFO). 82 | :param stdout: Whether to enable printing log entries to stdout (default: False). 83 | :return: A configured logging.Logger instance. 
84 | """ 85 | logger = logging.getLogger(__name__) 86 | 87 | # Set the logging level 88 | logger.setLevel(level) 89 | 90 | # Remove any existing handlers from the logger 91 | logger.handlers = [] 92 | 93 | # Create a file handler for the logger 94 | file_handler = logging.FileHandler(file_name) 95 | 96 | # Define the formatter for the log entries 97 | formatter = logging.Formatter( 98 | "%(asctime)s : %(levelname)s : %(name)s : %(message)s" 99 | ) 100 | 101 | # Set the formatter for the file handler 102 | file_handler.setFormatter(formatter) 103 | 104 | # Add the file handler to the logger 105 | logger.addHandler(file_handler) 106 | 107 | # Optionally add a stdout handler to the logger 108 | if stdout: 109 | stdout_handler = logging.StreamHandler() 110 | stdout_handler.setFormatter(formatter) 111 | logger.addHandler(stdout_handler) 112 | 113 | # Return the configured logger instance 114 | return logger 115 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/compute_centroids.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import numpy as np 4 | import os 5 | import logging 6 | from tinyllava.eval.score.SemDeDup.clustering.clustering import compute_centroids 7 | from tinyllava.eval.score.SemDeDup.clustering.sort_clusters import assign_and_sort_clusters 8 | from tinyllava.eval.score.SemDeDup.execute_semdedup import execute_semdedup 9 | from tinyllava.eval.score.SemDeDup.extract_dedup_data import extract_pruned_data 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.addHandler(logging.StreamHandler()) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--sim_metric", type=str, default="cosine") 18 | parser.add_argument("--keep_hard", action='store_true') 19 | parser.add_argument("--Kmeans_with_cos_dist", action='store_true') 20 | parser.add_argument("--emb_memory_loc", type=str, default="emb.npy") 21 | parser.add_argument("--sorted_clusters_file_loc", type=str, default="./sorted_clusters") 22 | parser.add_argument("--save_folder", type=str, default="./save_folder") 23 | parser.add_argument("--output_indices_path", type=str, default="./selected_indices.json") 24 | parser.add_argument("--ncentroids", type=int, default=500) # proportional to dataset size 25 | parser.add_argument("--niter", type=int, default=100) 26 | parser.add_argument("--seed", type=int, default=1234) 27 | parser.add_argument("--prune_p", type=float, default=.0) 28 | parser.add_argument("--eps_list", type=float, nargs='+', default=[0.48, 0.47, 0.46, 0.45, 0.44]) 29 | 30 | args = parser.parse_args() 31 | 32 | ## -- Fix the seed 33 | SEED = args.seed 34 | random.seed(SEED) 35 | 36 | emb_memory = np.load(args.emb_memory_loc) 37 | dataset_size, emb_size = emb_memory.shape 38 | # Normalize since SemDeDup uses Spherical Kmeans clustering with normalized embeddings, referring to paper, even in language modality with OPT model. 
39 | emb_memory = emb_memory / np.linalg.norm(emb_memory, axis=-1, keepdims=True) 40 | 41 | compute_centroids( 42 | data=emb_memory, 43 | ncentroids=args.ncentroids, 44 | niter=args.niter, 45 | seed=args.seed, 46 | Kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 47 | save_folder=args.save_folder, 48 | logger=logger, 49 | verbose=True, 50 | ) 51 | 52 | indices_list = np.array(list(range(dataset_size))) 53 | assign_and_sort_clusters( 54 | data=emb_memory, 55 | paths_list=indices_list, 56 | sim_metric=args.sim_metric, 57 | keep_hard=args.keep_hard, 58 | kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 59 | save_folder=args.save_folder, 60 | sorted_clusters_file_loc=args.sorted_clusters_file_loc, 61 | cluster_ids=range(0, args.ncentroids), 62 | logger=logger, 63 | ) 64 | 65 | execute_semdedup( 66 | embs=emb_memory, 67 | cluster_ids=range(0, args.ncentroids), 68 | save_loc=args.save_folder, 69 | sorted_clusters_path=args.sorted_clusters_file_loc, 70 | eps_list=args.eps_list, 71 | which_to_keep= "hard" if args.keep_hard else "easy", 72 | ) 73 | 74 | target_length = int(args.prune_p * dataset_size) 75 | extract_pruned_data( 76 | sorted_clusters_path=args.sorted_clusters_file_loc, 77 | semdedup_pruning_tables_path=os.path.join(args.save_folder, "dataframes"), 78 | eps_list=args.eps_list, 79 | num_clusters=args.ncentroids, 80 | output_indices_path=args.output_indices_path, 81 | target_length=target_length, 82 | ) 83 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/extract_dedup_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from tqdm import tqdm 9 | import pickle 10 | import numpy as np 11 | import json 12 | 13 | IMAGE_NAME_INDEX = 0 14 | 15 | def extract_pruned_data( 16 | sorted_clusters_path, 17 | semdedup_pruning_tables_path, 18 | eps_list, 19 | num_clusters, 20 | output_indices_path, 21 | target_length, 22 | retreive_kept_samples=True, 23 | ): 24 | 25 | target_close = [] 26 | for eps in eps_list: 27 | 28 | ## -- list of paths to the examples we want to keep/remove. 29 | example_paths = [] 30 | 31 | for cluster_id in tqdm(range(0, num_clusters)): 32 | 33 | cluster_i = np.load( 34 | os.path.join(sorted_clusters_path, f"cluster_{cluster_id}.npy") 35 | ) 36 | with open( 37 | f"{semdedup_pruning_tables_path}/cluster_{cluster_id}.pkl", "rb" 38 | ) as file: 39 | semdedup_pruning_tables = pickle.load(file) 40 | 41 | ## -- See which examples to keep/remove from this cluster. 42 | ## -- Use retreive_kept_samples=True when kept dataset size <= 50%. This will return a smaller output text file, 43 | ## -- semdedup_pruning_tables contain True values for the examples to be removed. 44 | images_to_keep_or_remove = semdedup_pruning_tables[f"eps={eps}"][ 45 | semdedup_pruning_tables[f"eps={eps}"] == (not retreive_kept_samples) 46 | ].index.to_numpy() 47 | if "indices" in semdedup_pruning_tables.columns: 48 | cluster_i = cluster_i[semdedup_pruning_tables["indices"]] 49 | ## -- retrieve only the examples we want and add to the list. 
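## -- The `eps=<value>` column of the pruning table is True for examples flagged as
## -- duplicates, so filtering on `== (not retreive_kept_samples)` selects the rows we
## -- want (the kept, de-duplicated examples when retreive_kept_samples=True), and their
## -- index positions pick the corresponding rows out of the sorted-cluster array.
## -- Column IMAGE_NAME_INDEX of those rows carries the example identifier (here the
## -- integer dataset indices passed as paths_list in compute_centroids.py), which is
## -- what `example_paths` accumulates below.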
50 | dedup_cluster = cluster_i[images_to_keep_or_remove] 51 | example_paths += dedup_cluster[:, IMAGE_NAME_INDEX].astype("int").tolist() 52 | 53 | if (abs(len(example_paths) - target_length) < abs(len(target_close) - target_length)) and (len(example_paths) - target_length >= 0): 54 | target_close = example_paths 55 | 56 | assert len(target_close) != 0 57 | 58 | num_diff = len(target_close) - target_length 59 | print(f"Selected data - Target length: {num_diff}") 60 | 61 | selected_indices = np.array(target_close) 62 | selected_indices = selected_indices[:target_length] 63 | np.save(output_indices_path, selected_indices) 64 | 65 | return 66 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/coincide/cluster_transferability.py: -------------------------------------------------------------------------------- 1 | # Here, we assume that the embeddings, clip-llava_score is already calculated 2 | # Codes from https://github.com/adymaharana/d2pruning?tab=readme-ov-file 3 | 4 | import os 5 | import math 6 | import argparse 7 | import time 8 | import numpy as np 9 | import faiss 10 | import torch 11 | import copy 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | 14 | from tqdm import tqdm 15 | 16 | if __name__ == "__main__": 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--centroid_embed_path", type=str, required=True) 20 | parser.add_argument("--transferability_path", type=str, required=True, help="Path to output directory") 21 | parser.add_argument("--k", type=int, default=4) 22 | parser.add_argument("--knn_path", type=str, required=True) 23 | 24 | args = parser.parse_args() 25 | 26 | centroid_embed = np.load(args.centroid_embed_path) 27 | centroid_embed = centroid_embed.reshape(-1, 5, 4096) 28 | centroid_embed = centroid_embed[:,:,2048:] 29 | centroid_embed = centroid_embed.reshape(-1, 5*2048) 30 | 31 | cosine_sim = cosine_similarity(centroid_embed, centroid_embed) 32 | 33 | knn_cluster_indices = np.argsort(cosine_sim, axis=-1)[:,::-1][:,:args.k+1] 34 | knn_cluster_similarity = cosine_sim[np.arange(len(cosine_sim))[:,None], knn_cluster_indices] 35 | 36 | np.save(args.knn_path + '_indices.npy', knn_cluster_indices) 37 | np.save(args.knn_path + '_similarity.npy', knn_cluster_similarity) 38 | 39 | mask = cosine_sim > 0.9 40 | cosine_sim[mask] = 0 41 | transfer = cosine_sim.sum(axis=-1) / (~mask).sum(axis=-1) 42 | # transfer = cosine_sim.mean(axis=-1) 43 | np.save(args.transferability_path, transfer) 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/coincide/compute_centroids.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import numpy as np 4 | import os 5 | import logging 6 | from tinyllava.eval.score.coincide.clustering import compute_centroids 7 | 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.StreamHandler()) 10 | logger.setLevel(logging.INFO) 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--sim_metric", type=str, default="cosine") 15 | parser.add_argument("--Kmeans_with_cos_dist", action='store_true') 16 | parser.add_argument("--emb_memory_loc", type=str, default="emb.npy") 17 | parser.add_argument("--save_folder", type=str, default="./save_folder") 18 | parser.add_argument("--ncentroids", type=int, default=500) # proportional to dataset size 19 | 
parser.add_argument("--niter", type=int, default=100) 20 | parser.add_argument("--seed", type=int, default=1234) 21 | 22 | args = parser.parse_args() 23 | 24 | ## -- Fix the seed 25 | SEED = args.seed 26 | random.seed(SEED) 27 | 28 | emb_memory = np.load(args.emb_memory_loc) 29 | dataset_size, emb_size = emb_memory.shape 30 | # Normalize since SemDeDup uses Spherical Kmeans clustering with normalized embeddings, referring to paper, even in language modality with OPT model. 31 | # emb_memory = emb_memory / np.linalg.norm(emb_memory, axis=-1, keepdims=True) 32 | 33 | compute_centroids( 34 | data=emb_memory, 35 | ncentroids=args.ncentroids, 36 | niter=args.niter, 37 | seed=args.seed, 38 | Kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 39 | save_folder=args.save_folder, 40 | logger=logger, 41 | verbose=True, 42 | ) 43 | 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/merge_values.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import os 5 | import json 6 | import glob 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--score_path", type=str, default=".") 11 | parser.add_argument("--average", action='store_true') 12 | args = parser.parse_args() 13 | 14 | score_files = sorted(glob.glob(args.score_path + '_[0-9]*.npy')) 15 | 16 | scores = [] 17 | for score_file in score_files: 18 | scores.append(np.load(score_file)) 19 | 20 | scores = np.concatenate(scores) 21 | if args.average: 22 | scores = np.mean(scores, axis=0) 23 | 24 | recover_indices = np.load(args.score_path + '_recover_indices.npy') 25 | scores = scores[recover_indices] 26 | 27 | np.save(args.score_path + '.npy', scores) 28 | 29 | for score_file in score_files: 30 | os.remove(score_file) 31 | os.remove(args.score_path + '_recover_indices.npy') 32 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/sort_values.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--score_path", type=str, default=".") 7 | parser.add_argument("--save_path", type=str, default="./temp.json") 8 | args = parser.parse_args() 9 | 10 | scores = np.load(args.score_path) 11 | score_indices = np.argsort(scores) 12 | 13 | np.save(args.save_path, score_indices) 14 | 15 | 16 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from tinyllava.model.language_model.llava_llama import TinyLlavaLlamaForCausalLM, TinyLlavaConfig 2 | from tinyllava.model.language_model.llava_stablelm import TinyLlavaStablelmForCausalLM, TinyLlavaStablelmConfig 3 | from tinyllava.model.language_model.llava_phi import TinyLlavaPhiForCausalLM, TinyLlavaPhiConfig 4 | from tinyllava.model.language_model.llava_qwen import LlavaQwen2Config, LlavaQwen2ForCausalLM 5 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/model/language_model/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/configuration_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from transformers import PretrainedConfig 7 | 8 | 9 | class QWenConfig(PretrainedConfig): 10 | model_type = "qwen" 11 | keys_to_ignore_at_inference = ["past_key_values"] 12 | 13 | def __init__( 14 | self, 15 | vocab_size=151936, 16 | hidden_size=4096, 17 | num_hidden_layers=32, 18 | num_attention_heads=32, 19 | emb_dropout_prob=0.0, 20 | attn_dropout_prob=0.0, 21 | layer_norm_epsilon=1e-6, 22 | initializer_range=0.02, 23 | max_position_embeddings=8192, 24 | scale_attn_weights=True, 25 | use_cache=True, 26 | bf16=False, 27 | fp16=False, 28 | fp32=False, 29 | kv_channels=128, 30 | rotary_pct=1.0, 31 | rotary_emb_base=10000, 32 | use_dynamic_ntk=True, 33 | use_logn_attn=True, 34 | use_flash_attn="auto", 35 | intermediate_size=22016, 36 | no_bias=True, 37 | tie_word_embeddings=False, 38 | use_cache_quantization=False, 39 | use_cache_kernel=False, 40 | softmax_in_fp32=False, 41 | **kwargs, 42 | ): 43 | self.vocab_size = vocab_size 44 | self.hidden_size = hidden_size 45 | self.intermediate_size = intermediate_size 46 | self.num_hidden_layers = num_hidden_layers 47 | self.num_attention_heads = num_attention_heads 48 | self.emb_dropout_prob = emb_dropout_prob 49 | self.attn_dropout_prob = attn_dropout_prob 50 | self.layer_norm_epsilon = layer_norm_epsilon 51 | self.initializer_range = initializer_range 52 | self.scale_attn_weights = scale_attn_weights 53 | self.use_cache = use_cache 54 | self.max_position_embeddings = max_position_embeddings 55 | self.bf16 = bf16 56 | self.fp16 = fp16 57 | self.fp32 = fp32 58 | self.kv_channels = kv_channels 59 | self.rotary_pct = rotary_pct 60 | self.rotary_emb_base = rotary_emb_base 61 | self.use_dynamic_ntk = use_dynamic_ntk 62 | self.use_logn_attn = use_logn_attn 63 | self.use_flash_attn = use_flash_attn 64 | self.no_bias = no_bias 65 | self.use_cache_quantization = use_cache_quantization 66 | self.use_cache_kernel = use_cache_kernel 67 | self.softmax_in_fp32 = softmax_in_fp32 68 | super().__init__( 69 | tie_word_embeddings=tie_word_embeddings, 70 | **kwargs 71 | ) 72 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/cpp_kernels.py: -------------------------------------------------------------------------------- 1 | from torch.utils import cpp_extension 2 | import pathlib 3 | import os 4 | import subprocess 5 | 6 | def _get_cuda_bare_metal_version(cuda_dir): 7 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 8 | universal_newlines=True) 9 | output = raw_output.split() 10 | release_idx = output.index("release") + 1 11 | release = output[release_idx].split(".") 12 | bare_metal_major = release[0] 13 | bare_metal_minor = release[1][0] 14 | 15 | return raw_output, bare_metal_major, bare_metal_minor 16 | 17 | def _create_build_dir(buildpath): 18 | try: 19 | os.mkdir(buildpath) 20 | except OSError: 21 | if not os.path.isdir(buildpath): 22 | print(f"Creation of the build directory {buildpath} 
failed") 23 | 24 | # Check if cuda 11 is installed for compute capability 8.0 25 | cc_flag = [] 26 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 27 | if int(bare_metal_major) >= 11: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_80,code=sm_80') 30 | if int(bare_metal_minor) >= 7: 31 | cc_flag.append('-gencode') 32 | cc_flag.append('arch=compute_90,code=sm_90') 33 | 34 | # Build path 35 | srcpath = pathlib.Path(__file__).parent.absolute() 36 | buildpath = srcpath / 'build' 37 | _create_build_dir(buildpath) 38 | 39 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 40 | return cpp_extension.load( 41 | name=name, 42 | sources=sources, 43 | build_directory=buildpath, 44 | extra_cflags=['-O3', ], 45 | extra_cuda_cflags=['-O3', 46 | '-gencode', 'arch=compute_70,code=sm_70', 47 | '--use_fast_math'] + extra_cuda_flags + cc_flag, 48 | verbose=1 49 | ) 50 | 51 | extra_flags = [] 52 | 53 | cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp", 54 | "./cache_autogptq_cuda_kernel_256.cu"] 55 | cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags) 56 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/llava_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/model/language_model/qwen/llava_qwen.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/model_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | MODEL_REGISTRY = {} 5 | TOKENIZER_REGISTRY = {} 6 | 7 | 8 | def ModelSelect(model_name_or_path): 9 | model = None 10 | for name in MODEL_REGISTRY.keys(): 11 | if name.lower() in model_name_or_path.lower(): 12 | model = MODEL_REGISTRY[name] 13 | if model is None: 14 | model = MODEL_REGISTRY['llama'] 15 | return model 16 | 17 | 18 | def TokenizerSelect(model_name_or_path): 19 | tokenizer_init = None 20 | for name in TOKENIZER_REGISTRY.keys(): 21 | if name.lower() in model_name_or_path.lower(): 22 | tokenizer_init = TOKENIZER_REGISTRY[name] 23 | if tokenizer_init is None: 24 | tokenizer_init = TOKENIZER_REGISTRY['llama'] 25 | return tokenizer_init 26 | 27 | 28 | def register_model(name): 29 | def register_model_cls(cls): 30 | if name in MODEL_REGISTRY: 31 | return MODEL_REGISTRY[name] 32 | 33 | MODEL_REGISTRY[name] = cls 34 | # FIXME: Find a more elegant way to do this 35 | if name == 'phi': 36 | MODEL_REGISTRY['TinyLLaVA-3.1B'] = cls 37 | elif name == 'stablelm': 38 | MODEL_REGISTRY['TinyLLaVA-2.0B'] = cls 39 | elif name == 'llama': 40 | MODEL_REGISTRY['TinyLLaVA-1.5B'] = cls 41 | return cls 42 | 43 | return register_model_cls 44 | 45 | 46 | def register_tokenizer(name): 47 | def register_tokenizer_cls(cls): 48 | if name in TOKENIZER_REGISTRY: 49 | return TOKENIZER_REGISTRY[name] 50 | 51 | TOKENIZER_REGISTRY[name] = cls 52 | # FIXME: Find a more elegant way to do this 53 | if name == 'phi': 54 | TOKENIZER_REGISTRY['TinyLLaVA-3.1B'] = cls 55 | elif name == 'stablelm': 56 | TOKENIZER_REGISTRY['TinyLLaVA-2.0B'] = cls 57 | elif name == 'llama': 58 | TOKENIZER_REGISTRY['TinyLLaVA-1.5B'] = cls 59 | return cls 60 | 61 | return register_tokenizer_cls 62 | 63 
| 64 | def import_models(models_dir, namespace): 65 | for file in os.listdir(models_dir): 66 | path = os.path.join(models_dir, file) 67 | if ( 68 | not file.startswith("_") 69 | and not file.startswith(".") 70 | and file.endswith(".py") 71 | ): 72 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 73 | importlib.import_module(namespace + "." + model_name) 74 | 75 | 76 | # automatically import any Python files in the models/ directory 77 | models_dir = os.path.join(os.path.dirname(__file__), 'language_model') 78 | import_models(models_dir, "tinyllava.model.language_model") -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tinyllava.model.multimodal_encoder.clip_encoder import CLIPVisionTower 3 | from tinyllava.model.multimodal_encoder.siglip_encoder import SigLipVisionTower 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if 'siglip' not in vision_tower.lower(): 9 | if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion'): 10 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 11 | else: 12 | if is_absolute_path_exists or vision_tower.startswith("google") or vision_tower.startswith('bczhou'): 13 | return SigLipVisionTower(vision_tower, vision_tower_cfg, **kwargs) 14 | raise ValueError(f'Unknown vision tower: {vision_tower}') 15 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self): 25 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def feature_select(self, image_forward_outs): 32 | image_features = image_forward_outs.hidden_states[self.select_layer] 33 | if self.select_feature == 'patch': 34 | image_features = image_features[:, 1:] 35 | elif self.select_feature == 'cls_patch': 36 | image_features = image_features 37 | else: 38 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 39 | return image_features 40 | 41 | # @torch.no_grad() 42 | def forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, 
dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | @property 56 | def dummy_feature(self): 57 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 58 | 59 | @property 60 | def dtype(self): 61 | return self.vision_tower.dtype 62 | 63 | @property 64 | def device(self): 65 | return self.vision_tower.device 66 | 67 | @property 68 | def config(self): 69 | if self.is_loaded: 70 | return self.vision_tower.config 71 | else: 72 | return self.cfg_only 73 | 74 | @property 75 | def hidden_size(self): 76 | return self.config.hidden_size 77 | 78 | @property 79 | def num_patches_per_side(self): 80 | return self.config.image_size // self.config.patch_size 81 | 82 | @property 83 | def num_patches(self): 84 | return (self.config.image_size // self.config.patch_size) ** 2 85 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'tinyllava' in config and 'tinyllava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "tinyllava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /COINCIDE_train/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from llava.model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words 
or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 llava_score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | 
parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import 
load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 37 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 38 | answers_file = os.path.expanduser(args.answers_file) 39 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 40 | ans_file = open(answers_file, "w") 41 | for line in tqdm(questions): 42 | idx = line["question_id"] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | cur_prompt = qs 46 | if model.config.mm_use_im_start_end: 47 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 48 | else: 49 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 50 | 51 | conv = conv_templates[args.conv_mode].copy() 52 | conv.append_message(conv.roles[0], qs) 53 | conv.append_message(conv.roles[1], None) 54 | prompt = conv.get_prompt() 55 | 56 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 57 | 58 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 59 | image_tensor = process_images([image], image_processor, model.config)[0] 60 | 61 | with torch.inference_mode(): 62 | output_ids = model.generate( 63 | input_ids, 64 | images=image_tensor.unsqueeze(0).half().cuda(), 65 | image_sizes=[image.size], 66 | do_sample=True if args.temperature > 0 else False, 67 | temperature=args.temperature, 68 | top_p=args.top_p, 69 | num_beams=args.num_beams, 70 | # no_repeat_ngram_size=3, 71 | max_new_tokens=1024, 72 | use_cache=True) 73 | 74 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 75 | 76 | ans_id = shortuuid.uuid() 77 | ans_file.write(json.dumps({"question_id": idx, 78 | "prompt": cur_prompt, 79 | "text": outputs, 80 | "answer_id": ans_id, 81 | "model_id": model_name, 82 | "metadata": {}}) + "\n") 83 | ans_file.flush() 84 | ans_file.close() 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 89 | parser.add_argument("--model-base", type=str, default=None) 90 | parser.add_argument("--image-folder", type=str, default="") 91 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 92 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 93 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 94 | parser.add_argument("--num-chunks", type=int, default=1) 95 | parser.add_argument("--chunk-idx", type=int, default=0) 96 | parser.add_argument("--temperature", type=float, default=0.2) 97 | parser.add_argument("--top_p", type=float, default=None) 98 | parser.add_argument("--num_beams", type=int, 
default=1) 99 | args = parser.parse_args() 100 | 101 | eval_model(args) 102 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x 
in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['llava_score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/to_seed_image.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--original-file", type=str, default="tables/question.jsonl") 8 | parser.add_argument("--modify-file", type=str, default="answer.jsonl") 9 | args = parser.parse_args() 10 | 11 | seedbench_list = [json.loads(q) for q in open(args.original_file, "r")] 12 | new_list = [] 13 | count = 0 14 | for idx in range(len(seedbench_list)): 15 | if seedbench_list[idx]['image'].startswith('SEED-Bench-video-image'): 16 | continue 17 | else: 18 | new_list.append(seedbench_list[idx]) 19 | 20 | mod_file = os.path.expanduser(args.modify_file) 21 | os.makedirs(os.path.dirname(mod_file), exist_ok=True) 22 | mod_file = open(args.modify_file, "w") 23 | for idx in range(len(new_list)): 24 | 25 | mod_file.write(json.dumps(new_list[idx]) + "\n") 26 | 27 | mod_file.close() 28 | 29 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from llava.model.language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 3 | from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 4 | 5 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from 
llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base 
~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = 
False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self, device_map=None): 25 | if self.is_loaded: 26 | print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name)) 27 | return 28 | 29 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 30 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) 31 | self.vision_tower.requires_grad_(False) 32 | 33 | self.is_loaded = True 34 | 35 | def feature_select(self, image_forward_outs): 36 | image_features = image_forward_outs.hidden_states[self.select_layer] 37 | if self.select_feature == 'patch': 38 | image_features = image_features[:, 1:] 39 | elif self.select_feature == 'cls_patch': 40 | image_features = image_features 41 | else: 42 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 43 | return image_features 44 | 45 | @torch.no_grad() 46 | def forward(self, images): 47 | if type(images) is list: 48 | image_features = [] 49 | for image in images: 50 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 51 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 52 | image_features.append(image_feature) 53 | else: 54 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 55 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 56 | 57 | return image_features 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.device 70 | 71 | @property 72 | def config(self): 73 | if self.is_loaded: 74 | return self.vision_tower.config 75 | else: 76 | return self.cfg_only 77 | 78 | @property 79 | def hidden_size(self): 80 | return self.config.hidden_size 81 | 82 | @property 83 | def num_patches_per_side(self): 84 | return self.config.image_size // self.config.patch_size 85 | 86 | @property 87 | def num_patches(self): 88 | return (self.config.image_size // self.config.patch_size) ** 2 89 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 
| 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/__init__.py -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more 
memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. 
They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. 
Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What color is the fire hydrant in the image? 3 | === 4 | Answer: 5 | The fire hydrant in the image is red. 6 | === 7 | Question: 8 | Is the fire hydrant fully covered by snow or partially visible? 9 | === 10 | Answer: 11 | The fire hydrant is partially visible, with a significant portion of it covered by the deep snow. 12 | === 13 | Question: 14 | What is the condition of the sidewalk near the fire hydrant? 15 | === 16 | Answer: 17 | The sidewalk near the fire hydrant has been recently plowed, so it is relatively clear of snow compared to the surrounding area. 18 | === 19 | Question: 20 | What does the image suggest about the winter season in the city? 21 | === 22 | Answer: 23 | The image suggests that the winter season in the city has been quite harsh and challenging. The significant snow accumulation seen in the image indicates that the city has experienced heavy snowfall, which can lead to various difficulties and disruptions for its residents. 24 | 25 | For instance, the deep snow covering the fire hydrant could potentially hinder its accessibility in case of emergencies, which poses safety concerns. The recently plowed sidewalk implies that the city's maintenance crews have been actively working to keep the walkways clear and safe for pedestrians, but the sheer amount of snowfall might make it difficult to keep up with the continuous snow removal efforts. 26 | 27 | Furthermore, such extreme winter conditions can impact transportation, leading to traffic issues, delays in public transit, and increased risks of accidents due to slippery roads. It can also cause problems for businesses and schools, as people might face challenges commuting to work or attending classes. Additionally, the heavy snow can put extra strain on infrastructure, such as roofs and power lines, increasing the likelihood of structural damage or power outages. 28 | 29 | In conclusion, the image of the red fire hydrant deep in the snow and the recently plowed sidewalk suggest that the city has faced a particularly severe winter season, with substantial snowfall that has likely caused various challenges and disruptions for its residents and infrastructure. 
-------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is the skier doing? 3 | === 4 | Answer: 5 | The skier is skiing in the snow-covered hills, making his way through the snow and enjoying the beautiful mountain scenery. They are also standing near a trail sign, which indicates their location on the mountain. 6 | === 7 | Question: 8 | Are there any other people or skiers visible in the image? 9 | === 10 | Answer: 11 | No, there is only a single person skiing in the image, and they are surrounded by an area with a lot of bushes. 12 | === 13 | Question: 14 | What kind of skiing is the person doing? Cross-country or downhill? 15 | === 16 | Answer: 17 | Based on what I can see in the photo, it is not possible to definitively determine whether the skier is doing cross-country or downhill skiing. However, given the presence of a trail sign and the fact that they are skiing in the foothills, it is more likely that the skier is engaged in cross-country skiing. Downhill skiing typically takes place on more steep slopes and involves ski lifts, whereas cross-country skiing often occurs on relatively flatter terrain with marked trails. 18 | === 19 | Question: 20 | Can you explain cross-country and downhill skiing a bit more? 21 | === 22 | Answer: 23 | Cross-country skiing and downhill skiing are two distinct types of skiing, each with its own techniques, equipment, and terrain preferences. 24 | 25 | 1. Cross-Country Skiing: 26 | Also known as Nordic skiing, cross-country skiing involves traversing relatively flat terrain or gentle slopes using skis and poles. It is a popular recreational and competitive sport that focuses on endurance, balance, and aerobic fitness. Cross-country skiers use their own locomotion to move across snow-covered terrain, rather than relying on ski lifts or other forms of assistance. 27 | There are two main techniques in cross-country skiing: 28 | - Classic: This technique involves skiing in parallel tracks, using a forward-and-backward gliding motion while keeping the skis parallel to each other. The skier uses their poles to propel themselves forward. 29 | - Skate skiing: This technique is similar to ice skating, where the skier pushes off one ski at an angle and glides on the other. It requires a smooth, groomed surface and is typically faster than the classic technique. 30 | Cross-country ski equipment consists of lightweight skis, boots, bindings, and poles. The boots are more flexible than those used in downhill skiing, allowing for better ankle movement and control. 31 | 32 | 2. Downhill Skiing: 33 | Also known as alpine skiing, downhill skiing involves descending slopes at high speeds using skis and poles for balance and control. This sport is more focused on speed, technique, and navigating challenging terrain, including steep slopes, moguls, and even jumps. 
34 | Downhill skiing can be further categorized into several disciplines, such as slalom, giant slalom, super-G, and downhill racing. Each discipline has its own set of rules, courses, and ski equipment. 35 | Downhill ski equipment includes heavier and stiffer skis, boots, bindings, and poles than those used in cross-country skiing. The boots are more rigid to provide better support and control during high-speed descents and sharp turns. 36 | 37 | In summary, cross-country skiing is an endurance-based sport that involves traveling across flat or gently sloping terrain, while downhill skiing is focused on speed and technique as skiers navigate steeper slopes and challenging terrain. Both sports require specialized equipment and techniques, but they offer different experiences and challenges to participants. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 
6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 
4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. 
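Note: the caption/answer pairs above (each NNN_caps.txt with its matching NNN_conv.txt) and the system_message.txt files are few-shot material for prompting a GPT model to generate instruction-tuning data, as in the original LLaVA pipeline. The assembly script itself is not included in this repository, so the sketch below is only an illustration of how such a request is typically put together; the build_messages helper, the example captions, and the assumption that every *_caps.txt has a sibling *_conv.txt are introduced here for illustration and are not part of the codebase.

import glob
import json
import os

# Directory layout as shown above in this repository.
PROMPT_DIR = "COINCIDE_train/playground/data/prompts/detail_description"

def build_messages(prompt_dir, query_caps):
    """Assemble a chat request: the system message first, then each
    NNN_caps.txt / NNN_conv.txt pair as an in-context example, and finally
    the captions + bounding boxes of the new image to be described."""
    with open(os.path.join(prompt_dir, "system_message.txt")) as f:
        messages = [{"role": "system", "content": f.read()}]
    for caps_path in sorted(glob.glob(os.path.join(prompt_dir, "*_caps.txt"))):
        conv_path = caps_path.replace("_caps.txt", "_conv.txt")
        with open(caps_path) as f_caps, open(conv_path) as f_conv:
            messages.append({"role": "user", "content": f_caps.read()})
            messages.append({"role": "assistant", "content": f_conv.read()})
    messages.append({"role": "user", "content": query_caps})
    return messages

if __name__ == "__main__":
    # query_caps would normally hold the captions plus object boxes of a new
    # image, in the same format as the NNN_caps.txt examples above.
    example_caps = "A dog sleeping on a red couch.\n\ndog: [0.2, 0.3, 0.8, 0.9]"
    print(json.dumps(build_messages(PROMPT_DIR, example_caps), indent=2))

The resulting messages list could then be sent to a chat model in the same way qa_baseline_gpt35.py calls openai.ChatCompletion.create.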
-------------------------------------------------------------------------------- /COINCIDE_train/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.2.0", "torchvision==0.17.0", "tiktoken", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | 
cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | # eval_single(args.result_file, eval_only_type='video') 63 | 64 | # with open(args.result_upload_file, 'w') as fp: 65 | # for question in data['questions']: 66 | # qid = question['question_id'] 67 | # if qid in results: 68 | # result = results[qid] 69 | # else: 70 | # result = results[int(qid)] 71 | # fp.write(json.dumps({ 72 | # 'question_id': qid, 73 | # 'prediction': result['text'] 74 | # }) + '\n') 75 | 
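The converter scripts in this directory (`convert_gqa_for_eval.py`, `convert_mmbench_for_submission.py`, `convert_mmvet_for_eval.py`, `convert_seed_for_submission.py`, and the VizWiz/VQAv2 converters below) all read the same intermediate format: the answers file produced by the `llava.eval.model_vqa*` entry points is JSON Lines, one object per line with at least `question_id` and `text` fields. A small sketch of reading and writing that format (the helper names are illustrative, not part of the repo):

```python
# Illustrative sketch: the answers .jsonl files read by the converter scripts
# contain one JSON object per line with "question_id" and "text" fields.
import json

def read_answers(path):
    """Return a {question_id: predicted_text} mapping from an answers .jsonl file."""
    answers = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            answers[row["question_id"]] = row["text"]
    return answers

def write_answers(path, rows):
    """Write (question_id, text) pairs in the same JSON Lines layout."""
    with open(path, "w") as f:
        for qid, text in rows:
            f.write(json.dumps({"question_id": qid, "text": text}) + "\n")
```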
-------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, 
required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 
5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | SPLIT="llava_gqa_testdev_balanced" 11 | BASE_PATH="/base_path" 12 | GQADIR="${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/data" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 16 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 18 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/$SPLIT.jsonl \ 19 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/data/images \ 20 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx 
$IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | 27 | wait 28 | 29 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 40 | 41 | cd $GQADIR 42 | python eval/eval.py --tier testdev_balanced 43 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews 17 | 18 | python llava/eval/eval_gpt_review_bench.py \ 19 | --question ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 20 | --context ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 21 | --rule ${BASE_PATH}/COINCIDE_train/llava/eval/table/rule.json \ 22 | --answer-list \ 23 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 24 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers/${CKPT}.jsonl \ 25 | --output \ 26 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT}.jsonl 27 | 28 | python llava/eval/summarize_gpt_review.py -f ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT}.jsonl 29 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | SPLIT="mmbench_dev_20230712" 6 | BASE_PATH="/base_path" 7 | 8 | python -m llava.eval.model_vqa_mmbench \ 9 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 10 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 11 | --question-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/$SPLIT.tsv \ 12 | --answers-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers/$SPLIT/${CKPT}.jsonl \ 13 | --single-pred-prompt \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir 
${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | SPLIT="mmbench_dev_cn_20231003" 6 | BASE_PATH="/base_path" 7 | 8 | python -m llava.eval.model_vqa_mmbench \ 9 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 10 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 11 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/$SPLIT.tsv \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers/$SPLIT/${CKPT}.jsonl \ 13 | --lang cn \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode vicuna_v1 17 | 18 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers_upload/$SPLIT 19 | 20 | python scripts/convert_mmbench_for_submission.py \ 21 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/$SPLIT.tsv \ 22 | --result-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers/$SPLIT \ 23 | --upload-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers_upload/$SPLIT \ 24 | --experiment $CKPT 25 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/llava_mme.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/MME_Benchmark_release_version \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | cd ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME 17 | 18 | python convert_answer_to_mme.py --experiment $CKPT 19 | 20 | cd eval_tool 21 | 22 | python calculation.py --results_dir answers/$CKPT 23 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results 17 | 18 | python scripts/convert_mmvet_for_eval.py \ 19 | --src ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/answers/${CKPT}.jsonl \ 20 | --dst ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results/${CKPT}.json 21 | 22 | python playground/data/eval/mm-vet/MM-Vet/mm-vet_evaluator.py \ 23 | --mmvet_path 
${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet \ 24 | --result_file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results/${CKPT}.json \ 25 | --result_path ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet 26 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/llava_pope_test.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/val2014 \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python llava/eval/eval_pope.py \ 17 | --annotation-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/coco \ 18 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/llava_pope_test.jsonl \ 19 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/answers/${CKPT}.jsonl 20 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | BASE_PATH="/base_path" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 15 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 16 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/llava-seed-bench_mod.jsonl \ 17 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench \ 18 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | # Evaluate 38 | python scripts/convert_seed_for_submission.py \ 39 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/SEED-Bench.json \ 40 | --result-file $output_file \ 41 | --result-upload-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers_upload/${CKPT}.jsonl 42 | 43 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_science \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/llava_test_CQM-A.json \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/images/test \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 13 | --single-pred-prompt \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python llava/eval/eval_science_qa.py \ 18 | --base-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa \ 19 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 20 | --output-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 21 | --output-result ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}_result.json 22 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/train_images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python -m llava.eval.eval_textvqa \ 17 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 18 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/answers/${CKPT}.jsonl 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/llava_test.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/test \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 13 | 
--temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python scripts/convert_vizwiz_for_submission.py \ 17 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/llava_test.jsonl \ 18 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 19 | --result-upload-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers_upload/${CKPT}.json 20 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | SPLIT="llava_vqav2_mscoco_test-dev2015" 11 | BASE_PATH="/base_path" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 17 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/$SPLIT.jsonl \ 18 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/test2015 \ 19 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 & 24 | done 25 | 26 | wait 27 | 28 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 39 | # 40 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | deepspeed llava/train/train_xformers.py \ 6 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 7 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 8 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 9 | --version v1 \ 10 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 11 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 12 | --vision_tower openai/clip-vit-large-patch14-336 \ 13 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --mm_vision_select_layer -2 \ 16 | --mm_use_im_start_end False \ 17 | --mm_use_im_patch_token False \ 18 | --image_aspect_ratio pad \ 19 | --group_by_modality_length True \ 20 | --bf16 False \ 21 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_v1.5_v100 \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 4 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 8 \ 26 | --eval_accumulation_steps 8 \ 27 | --evaluation_strategy "no" \ 28 | --save_strategy "steps" \ 29 | --save_steps 50000 \ 30 | --save_total_limit 1 \ 31 | --learning_rate 2e-4 \ 32 | --weight_decay 0. 
\ 33 | --warmup_ratio 0.03 \ 34 | --lr_scheduler_type "cosine" \ 35 | --logging_steps 1 \ 36 | --tf32 False \ 37 | --fp16 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to wandb \ 43 | --run_name llava_lora_v1.5_v100 \ 44 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora_coincide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_tinyllava_ours_37111519_mmd_transfer_mod_density_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_tinyllava_ours_37111519_mmd_transfer_mod_density_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/ours_37111519_indices_mmd_trans_mod_density_${PRUNE_RATIO}.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/LLaVA-Instruction/random_indices.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | deepspeed llava/train/train_xformers.py \ 6 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 7 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 8 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 9 | --version v1 \ 10 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 11 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 12 | --vision_tower openai/clip-vit-large-patch14-336 \ 13 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --mm_vision_select_layer -2 \ 16 | --mm_use_im_start_end False \ 17 | --mm_use_im_patch_token False \ 18 | --image_aspect_ratio pad \ 19 | --group_by_modality_length True \ 20 | --bf16 False \ 21 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_v1.5_v100 \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 4 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 8 \ 26 | --eval_accumulation_steps 8 \ 27 | --evaluation_strategy "no" \ 28 | --save_strategy "steps" \ 29 | --save_steps 50000 \ 30 | --save_total_limit 1 \ 31 | --learning_rate 2e-4 \ 32 | --weight_decay 0. 
\ 33 | --warmup_ratio 0.03 \ 34 | --lr_scheduler_type "cosine" \ 35 | --logging_steps 1 \ 36 | --tf32 False \ 37 | --fp16 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to wandb \ 43 | --run_name llava_lora_vision_flan_lora_v1.5_v100 \ 44 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan_coincide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_ours_3000_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_vision_flan_lora_ours_3000_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/ours_3000_${PRUNE_RATIO}.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ "$1" == "" ]; then 5 | PRUNE_RATIO=0.2 6 | else 7 | PRUNE_RATIO=$1 8 | fi 9 | 10 | BASE_PATH="/base_path" 11 | 12 | deepspeed llava/train/train_xformers.py \ 13 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 14 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 15 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 16 | --version v1 \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --vision_tower openai/clip-vit-large-patch14-336 \ 20 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --mm_vision_select_layer -2 \ 23 | --mm_use_im_start_end False \ 24 | --mm_use_im_patch_token False \ 25 | --image_aspect_ratio pad \ 26 | --group_by_modality_length True \ 27 | --bf16 False \ 28 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 4 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 8 \ 33 | --eval_accumulation_steps 8 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-4 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 False \ 44 | --fp16 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb \ 50 | --run_name llava_lora_vision_flan_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 51 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/random_indices.npy \ 52 | --prune_p $PRUNE_RATIO \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | 
"gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COINCIDE 2 | 3 | ### **Concept-skill Transferability-based Data Selection for Large Vision-Language Models, EMNLP 2024** 4 | * Authors: [Jaewoo Lee](https://g-jwlee.github.io/), [Boyang Li^](http://www.boyangli.org/), [Sung Ju Hwang^](http://www.sungjuhwang.com/) 5 | * [Paper](https://arxiv.org/abs/2406.10995) 6 | 7 | ## Introduction 8 | 9 | Training Large Vision-Langauge Models on extensive visual instruction tuning (VIT) data incurs significant computational cost, making the process infeasible for small academic labs and individual researchers. 10 | Additionally, it is not clear if all the VIT data are necessary for good generalization, as different vision-language (VL) tasks have different abilities to transfer to downstream tasks. 11 | 12 | We propose COINCIDE (COre-INstruction Concept-skIll Data Election), which identifies VL concept-skill compositions through data clustering using activations from an off-the-shelf, small VLM. 13 | 14 | 15 |
Teaser 16 |
 17 | 18 | From each cluster, COINCIDE selects training data for a target LVLM by considering transferability (i.e., how well knowledge from each cluster can facilitate the LVLM's learning in other clusters) and the internal density of the clusters. 19 | 20 |
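A conceptual sketch of this selection step is given below. It is **not** the released implementation (the actual pipeline is run through the clustering and training scripts described later in this README); the function name, the softmax-style scoring, and all hyperparameters are illustrative assumptions only. It assumes precomputed activations `feats` of shape `(N, D)` and one transferability score per cluster.

```python
# Conceptual sketch only -- NOT the repository's implementation.
import numpy as np
from sklearn.cluster import KMeans

def select_indices(feats, transferability, n_clusters, budget, temperature=0.1, seed=0):
    """Pick `budget` sample indices, favoring transferable, low-density clusters."""
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed).fit_predict(feats)
    normed = feats / np.linalg.norm(feats, axis=1, keepdims=True)

    # Density proxy: mean pairwise cosine similarity within each cluster.
    density = []
    for c in range(n_clusters):
        members = normed[labels == c]
        density.append(float((members @ members.T).mean()) if len(members) else 0.0)
    density = np.array(density)

    # Softmax-style budget: more samples for high transferability / low density.
    weights = np.exp((np.asarray(transferability) - density) / temperature)
    quota = np.maximum(1, (budget * weights / weights.sum()).astype(int))

    rng = np.random.default_rng(seed)
    picked = []
    for c in range(n_clusters):
        members = np.where(labels == c)[0]
        take = min(int(quota[c]), len(members))
        if take > 0:
            picked.extend(rng.choice(members, size=take, replace=False).tolist())
    return np.asarray(picked[:budget])
```

In the released code, the selected indices are saved to a `.npy` file that the finetuning scripts load through `--prune_indices` together with `--prune_p`.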

 21 | 22 | ## Install 23 | Since we use different models for 1. clustering and 2. training, a separate conda environment is required for each process. 24 | 25 | ### Clustering 26 | ```shell 27 | conda create -n coincide_cluster python=3.10 -y 28 | conda activate coincide_cluster 29 | pip install --upgrade pip # enable PEP 660 support 30 | pip install -e . 31 | pip install -e ".[train]" 32 | ``` 33 | 34 | ### Training 35 | ```shell 36 | conda create -n coincide_train python=3.10 -y 37 | conda activate coincide_train 38 | cd COINCIDE_train 39 | pip install --upgrade pip # enable PEP 660 support 40 | pip install -e . 41 | pip install -e ".[train]" 42 | pip install flash-attn --no-build-isolation 43 | ``` 44 | 45 | ## Dataset 46 | ### LLaVA-1.5 47 | Follow the visual instruction tuning dataset download guides on the [official GitHub page](https://github.com/haotian-liu/LLaVA). 48 | Place the downloaded files in `COINCIDE_train/playground/data`. 49 | Also, prepare the evaluation benchmark datasets by following the instructions on that page. 50 | 51 | ### Vision-Flan 52 | Similarly, download the Vision-Flan visual instruction tuning dataset from the [official project page](https://vision-flan.github.io/). 53 | Place the downloaded files in `COINCIDE_train/playground/data`. 54 | 55 | ## Models 56 | In this paper, we focus on two models: [TinyLLaVA-2B](https://github.com/TinyLLaVA/TinyLLaVA_Factory/tree/tinyllava_bench) for clustering and 57 | [LLaVA-1.5-7B](https://github.com/haotian-liu/LLaVA) for training. Please download the models into the `/checkpoints` directory.
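As a quick sanity check before launching the finetuning scripts, you can confirm that the checkpoint paths those scripts reference exist under your base path. The snippet below is illustrative only; it assumes the same `/base_path` placeholder used throughout the shell scripts, and the TinyLLaVA-2B clustering checkpoint goes in the same `checkpoints` directory under whatever folder name you downloaded it as.

```python
# Illustrative check: the finetuning scripts expect these checkpoints under
# ${BASE_PATH}/checkpoints. Adjust BASE_PATH to your own setup.
import os

BASE_PATH = "/base_path"  # same placeholder as in the shell scripts
expected = [
    "checkpoints/vicuna-7b-v1.5",
    "checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin",
]

for rel in expected:
    path = os.path.join(BASE_PATH, rel)
    print(("ok     " if os.path.exists(path) else "MISSING"), path)
```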
 58 | 59 | You can try to use other LVLMs such as [TinyLLaVA-OpenELM-450M-SigLIP-0.89B](https://github.com/TinyLLaVA/TinyLLaVA_Factory?tab=readme-ov-file) or 60 | [LLaVA-1.5-13B](https://github.com/haotian-liu/LLaVA), as we have done in our ablation studies. 61 | 62 | ## LLaVA-1.5 experiment 63 | ### Clustering 64 | Change directory with `cd COINCIDE_cluster`. 65 | ```shell 66 | bash scripts/tiny_llava/eval/llava_score/coincide/extract_embed.sh 67 | bash scripts/tiny_llava/eval/llava_score/coincide/clustering.sh 68 | ``` 69 | 70 | ### Training 71 | Change directory with `cd COINCIDE_train`. 72 | ```shell 73 | bash scripts/v1_5/finetune_lora_coincide.sh 74 | ``` 75 | 76 | ## Vision-Flan experiment 77 | ### Clustering 78 | Change directory with `cd COINCIDE_cluster`. 79 | ```shell 80 | bash scripts/tiny_llava/eval/vision_flan_score/coincide/extract_embed.sh 81 | bash scripts/tiny_llava/eval/vision_flan_score/coincide/clustering.sh 82 | ``` 83 | 84 | ### Training 85 | Change directory with `cd COINCIDE_train`. 86 | ```shell 87 | bash scripts/v1_5/vision_flan/finetune_lora_vision_flan_coincide.sh 88 | ``` 89 | 90 | ## Evaluation 91 | MODEL_NAME is the name of the directory that stores the model.
92 | {eval_benchmark} could be gqa, llavabench, mme, etc. 93 | ```shell 94 | bash scripts/v1_5/eval/{eval_benchmark}.sh MODEL_NAME 95 | ``` 96 | 97 | ## Bibtex 98 | ``` 99 | @inproceedings{lee2024coincide, 100 | title={Concept-skill Transferability-based Data Selection for Large Vision-Language Models}, 101 | author={Jaewoo Lee and Boyang Li and Sung Ju Hwang}, 102 | year={2024}, 103 | booktitle={Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 104 | } 105 | ``` 106 | -------------------------------------------------------------------------------- /assets/concept_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/assets/concept_figure.png --------------------------------------------------------------------------------