├── COINCIDE_cluster ├── LICENCE ├── pyproject.toml ├── scripts │ ├── extract_mm_projector.py │ ├── merge_lora_weights.py │ └── tiny_llava │ │ └── eval │ │ ├── llava_score │ │ ├── coincide │ │ │ ├── clustering.sh │ │ │ └── extract_embed.sh │ │ ├── d2_prune │ │ │ ├── d2_prune_selection.sh │ │ │ ├── score_aum.sh │ │ │ └── score_llava_embed.sh │ │ ├── el2n │ │ │ └── score_el2n.sh │ │ └── semdedup │ │ │ ├── score_SemDeDup.sh │ │ │ └── semdedup.sh │ │ └── vision_flan_score │ │ ├── coincide │ │ ├── clustering.sh │ │ └── extract_embed.sh │ │ ├── d2_prune │ │ ├── d2_prune_selection.sh │ │ ├── score_aum.sh │ │ └── score_llava_embed.sh │ │ ├── el2n │ │ └── score_el2n.sh │ │ └── semdedup │ │ ├── score_SemDeDup.sh │ │ └── semdedup.sh └── tinyllava │ ├── __init__.py │ ├── arguments.py │ ├── constants.py │ ├── conversation.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ ├── preprocess │ │ ├── __init__.py │ │ ├── default.py │ │ ├── llama2.py │ │ ├── mpt.py │ │ ├── phi.py │ │ ├── plain.py │ │ ├── stablelm.py │ │ ├── tiny_llama.py │ │ └── v1.py │ └── process.py │ ├── eval │ ├── __init__.py │ └── score │ │ ├── SemDeDup │ │ ├── clustering │ │ │ ├── clustering.py │ │ │ ├── sort_clusters.py │ │ │ └── utils.py │ │ ├── compute_centroids.py │ │ ├── execute_semdedup.py │ │ └── extract_dedup_data.py │ │ ├── coincide │ │ ├── autograd_hacks.py │ │ ├── cluster_transferability.py │ │ ├── cluster_wise_prune.py │ │ ├── clustering.py │ │ ├── compute_centroids.py │ │ └── extract_embed.py │ │ ├── d2_prune │ │ ├── d2_prune.py │ │ ├── score_aum.py │ │ └── score_embed.py │ │ ├── el2n │ │ └── score_el2n.py │ │ ├── merge_values.py │ │ └── sort_values.py │ ├── mm_utils.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── language_model │ │ ├── __init__.py │ │ ├── llava_llama.py │ │ ├── llava_phi.py │ │ ├── llava_qwen.py │ │ ├── llava_stablelm.py │ │ ├── phi │ │ │ ├── cache_utils.py │ │ │ ├── configuration_phi.py │ │ │ ├── convert_phi_weights_to_hf.py │ │ │ ├── modeling_attn_mask_utils.py │ │ │ ├── modeling_phi.py │ │ │ └── utils.py │ │ ├── qwen │ │ │ ├── configuration_qwen.py │ │ │ ├── cpp_kernels.py │ │ │ ├── llava_qwen.py │ │ │ ├── modeling_qwen.py │ │ │ ├── qwen_generation_utils.py │ │ │ └── tokenization_qwen.py │ │ └── stablelm │ │ │ ├── configuration_stablelm_epoch.py │ │ │ ├── modeling_stablelm_epoch.py │ │ │ └── tokenization_arcade100k.py │ ├── llava_arch.py │ ├── model_factory.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py │ └── utils.py ├── COINCIDE_train ├── LICENSE ├── cog.yaml ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_loader_prune.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ └── to_seed_image.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ 
├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py ├── playground │ └── data │ │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ │ ├── coco2014_val_qa_eval │ │ ├── qa90_gpt4_answer.jsonl │ │ └── qa90_questions.jsonl │ │ ├── eval │ │ └── gqa │ │ │ └── data │ │ │ └── eval │ │ │ ├── eval.py │ │ │ └── eval_prune.py │ │ └── prompts │ │ ├── complex_reasoning │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt │ │ ├── conversation │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ └── system_message.txt │ │ └── detail_description │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt ├── predict.py ├── pyproject.toml └── scripts │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── extract_mm_projector.py │ ├── merge_lora_weights.py │ ├── v1_5 │ ├── eval │ │ ├── gqa.sh │ │ ├── llavabench.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── pope.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ ├── textvqa.sh │ │ ├── vizwiz.sh │ │ └── vqav2.sh │ ├── finetune_lora.sh │ ├── finetune_lora_coincide.sh │ ├── finetune_lora_random.sh │ └── vision_flan │ │ ├── finetune_lora_vision_flan.sh │ │ ├── finetune_lora_vision_flan_coincide.sh │ │ └── finetune_lora_vision_flan_random.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json ├── README.md └── assets └── concept_figure.png /COINCIDE_cluster/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "tinyllava" 7 | version = "1.0.0" 8 | description = "A Framework of Small-scale Large Multimodal Models." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.2.0", "torchvision==0.17.0", "tiktoken", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/DLCV-BUAA/TinyLLaVABench" 31 | "Bug Tracker" = "https://github.com/DLCV-BUAA/TinyLLaVABench/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tinyllava.model.builder import load_pretrained_model 3 | from tinyllava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/coincide/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | SAMPLE_RATIO=0.2 5 | TEMP=0.1 6 | 7 | python tinyllava/eval/score/coincide/compute_centroids.py \ 8 | --sim_metric cosine \ 9 | --Kmeans_with_cos_dist \ 10 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa.npy \ 11 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder \ 12 | --ncentroids 10000 \ 13 | --niter 50 \ 14 | --seed 1234 \ 15 | 16 | 17 | python tinyllava/eval/score/coincide/cluster_transferability.py \ 18 | --centroid_embed_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/kmeans_centroids.npy \ 19 | --transferability_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/transfer.npy \ 20 | --k 4 \ 21 | --knn_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/knn \ 22 | 23 | 24 | python tinyllava/eval/score/coincide/cluster_wise_prune.py \ 25 | --embedding_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa.npy \ 26 | --cluster_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/nearest_cent.npy \ 27 | --transfer_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/10000_msa_save_folder/transfer.npy \ 28 | --fraction $SAMPLE_RATIO \ 29 | --temp $TEMP \ 30 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/coincide/extract_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | # Start time
6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | CKPT="TinyLLaVA-2.0B" 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/coincide/extract_embed.py \ 17 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 18 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 19 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 20 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --is_multimodal \ 24 | --batch_size 8 \ 25 | --layer_list 3 7 11 15 19 \ 26 | --version phi & 27 | done 28 | 29 | wait 30 | 31 | python tinyllava/eval/score/merge_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/tan_act_37111519_msa \ 33 | 34 | wait 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/d2_prune_selection.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.2 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/d2_prune/d2_prune.py \ 9 | --score-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores.npy \ 10 | --embed-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed.npy \ 11 | --output-indices-path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/d2_prune_indices_${SAMPLE_RATIO}.npy \ 12 | --n-neighbors 5 \ 13 | --gamma 0.4 \ 14 | --fraction $SAMPLE_RATIO \ 15 | 16 | # End time 17 | end_time=$(date +%s) 18 | # Calculate execution time 19 | execution_time=$((end_time - start_time)) 20 | minutes=$((execution_time/60)) 21 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/score_aum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_aum.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --batch_size 4 \ 23 | --version phi & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/aum_scores \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | 
execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/d2_prune/score_llava_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 6 | IFS=',' read -ra GPULIST <<< "$gpu_list" 7 | 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 12 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 13 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed \ 14 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 15 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --is_multimodal \ 19 | --version phi \ 20 | --avg_embed \ 21 | --batch_size 4 & 22 | done 23 | 24 | wait 25 | 26 | python tinyllava/eval/score/merge_values.py \ 27 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/avg_llava_embed \ 28 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/el2n/score_el2n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/el2n/score_el2n.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --batch_size 4 \ 23 | --version phi & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores \ 30 | 31 | python tinyllava/eval/score/sort_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_scores.npy \ 33 | --save_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/el2n_indices.npy \ 34 | 35 | # End time 36 | end_time=$(date +%s) 37 | 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | 42 | echo "Total execution time: ${minutes} minutes" 43 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/semdedup/score_SemDeDup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="TinyLLaVA-2.0B" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 
| for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/llava_embed \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --version phi \ 23 | --batch_size 4 & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/llava_embed \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/llava_score/semdedup/semdedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.2 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/SemDeDup/compute_centroids.py \ 9 | --sim_metric cosine \ 10 | --keep_hard \ 11 | --Kmeans_with_cos_dist \ 12 | --emb_memory_loc ${BASE_PATH}/LLaVA/playground/data/TinyLLaVA-Instruction/llava_embed.npy \ 13 | --sorted_clusters_file_loc ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_llava_sorted_clusters \ 14 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_llava_save_folder \ 15 | --output_indices_path ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/semdedup_indices_${SAMPLE_RATIO}.npy \ 16 | --ncentroids 10000 \ 17 | --niter 50 \ 18 | --seed 1234 \ 19 | --prune_p $SAMPLE_RATIO \ 20 | --eps_list 0.266 0.26665 0.267 \ 21 | 22 | # For different SAMPLE_RATIO, note that eps_list should be manually found 23 | 24 | # End time 25 | end_time=$(date +%s) 26 | # Calculate execution time 27 | execution_time=$((end_time - start_time)) 28 | minutes=$((execution_time/60)) 29 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/coincide/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | SAMPLE_RATIO=0.2 5 | TEMP=0.1 6 | 7 | python tinyllava/eval/score/coincide/compute_centroids.py \ 8 | --sim_metric cosine \ 9 | --Kmeans_with_cos_dist \ 10 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa.npy \ 11 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder \ 12 | --ncentroids 2500 \ 13 | --niter 50 \ 14 | --seed 1234 \ 15 | 16 | 17 | python tinyllava/eval/score/coincide/cluster_transferability.py \ 18 | --centroid_embed_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/kmeans_centroids.npy \ 19 | --transferability_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/transfer_lang.npy \ 20 | --k 4 \ 21 | --knn_path 
${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/knn \ 22 | 23 | 24 | python tinyllava/eval/score/coincide/cluster_wise_prune.py \ 25 | --embedding_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa.npy \ 26 | --cluster_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/nearest_cent.npy \ 27 | --transfer_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/2500_save_folder/transfer.npy \ 28 | --fraction $SAMPLE_RATIO \ 29 | --temp $TEMP \ -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/coincide/extract_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | CKPT="vison_flan_tinyllava_v100_2.0b" 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/coincide/extract_embed.py \ 17 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 18 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 19 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 20 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --is_multimodal \ 24 | --batch_size 8 \ 25 | --layer_list 3 7 11 15 19 \ 26 | --version phi & 27 | done 28 | 29 | wait 30 | 31 | python tinyllava/eval/score/merge_values.py \ 32 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/tan_act_37111519_msa \ 33 | 34 | wait 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | # Calculate execution time 39 | execution_time=$((end_time - start_time)) 40 | minutes=$((execution_time/60)) 41 | echo "Total execution time: ${minutes} minutes" 42 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/d2_prune_selection.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.167 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/d2_prune/d2_prune.py \ 9 | --score-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores.npy \ 10 | --embed-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed.npy \ 11 | --output-indices-path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/d2_prune_indices_${SAMPLE_RATIO}.npy \ 12 | --n-neighbors 5 \ 13 | --gamma 0.4 \ 14 | --fraction $SAMPLE_RATIO \ 15 | 16 | # End time 17 | end_time=$(date +%s) 18 | # Calculate execution time 19 | execution_time=$((end_time - start_time)) 20 | minutes=$((execution_time/60)) 21 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/score_aum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 |
CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_aum.py \ 16 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --is_multimodal \ 23 | --batch_size 4 \ 24 | --version phi & 25 | done 26 | 27 | wait 28 | 29 | python tinyllava/eval/score/merge_values.py \ 30 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/aum_scores \ 31 | 32 | # End time 33 | end_time=$(date +%s) 34 | 35 | # Calculate execution time 36 | execution_time=$((end_time - start_time)) 37 | minutes=$((execution_time/60)) 38 | 39 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/d2_prune/score_llava_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 6 | IFS=',' read -ra GPULIST <<< "$gpu_list" 7 | 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 13 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 14 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed \ 15 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 16 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --is_multimodal \ 20 | --version phi \ 21 | --avg_embed \ 22 | --batch_size 4 & 23 | done 24 | 25 | wait 26 | 27 | python tinyllava/eval/score/merge_values.py \ 28 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/avg_llava_embed \ 29 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/el2n/score_el2n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vison_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/el2n/score_el2n.py \ 16 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --score_path 
${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --is_multimodal \ 23 | --batch_size 4 \ 24 | --version phi & 25 | done 26 | 27 | wait 28 | 29 | python tinyllava/eval/score/merge_values.py \ 30 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores \ 31 | 32 | python tinyllava/eval/score/sort_values.py \ 33 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_scores.npy \ 34 | --save_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/el2n_indices.npy \ 35 | 36 | # End time 37 | end_time=$(date +%s) 38 | 39 | # Calculate execution time 40 | execution_time=$((end_time - start_time)) 41 | minutes=$((execution_time/60)) 42 | 43 | echo "Total execution time: ${minutes} minutes" 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/semdedup/score_SemDeDup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | CKPT="vision_flan_tinyllava_v100_2.0b" 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python tinyllava/eval/score/d2_prune/score_llava_embed.py \ 15 | --model_path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --is_multimodal \ 22 | --version phi \ 23 | --batch_size 4 & 24 | done 25 | 26 | wait 27 | 28 | python tinyllava/eval/score/merge_values.py \ 29 | --score_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed \ 30 | 31 | # End time 32 | end_time=$(date +%s) 33 | 34 | # Calculate execution time 35 | execution_time=$((end_time - start_time)) 36 | minutes=$((execution_time/60)) 37 | 38 | echo "Total execution time: ${minutes} minutes" 39 | -------------------------------------------------------------------------------- /COINCIDE_cluster/scripts/tiny_llava/eval/vision_flan_score/semdedup/semdedup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASE_PATH="/base_path" 3 | SAMPLE_RATIO=0.167 4 | 5 | # Start time 6 | start_time=$(date +%s) 7 | 8 | python tinyllava/eval/score/SemDeDup/compute_centroids.py \ 9 | --sim_metric cosine \ 10 | --keep_hard \ 11 | --Kmeans_with_cos_dist \ 12 | --emb_memory_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/llava_embed.npy \ 13 | --sorted_clusters_file_loc ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_llava_sorted_clusters \ 14 | --save_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_llava_save_folder \ 15 | --output_indices_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/semdedup_indices_${SAMPLE_RATIO}.npy \ 16 | --ncentroids 3000 \ 17 | --niter 50 \ 18 | --seed 1234 \ 19 | --prune_p $SAMPLE_RATIO \ 20 | --eps_list 0.305 0.31
0.315 \ 21 | 22 | # For different SAMPLE_RATIO, note that eps_list should be manually found 23 | 24 | # End time 25 | end_time=$(date +%s) 26 | # Calculate execution time 27 | execution_time=$((end_time - start_time)) 28 | minutes=$((execution_time/60)) 29 | echo "Total execution time: ${minutes} minutes" -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/__init__.py: -------------------------------------------------------------------------------- 1 | from tinyllava.model import * 2 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, Optional, Sequence, List 3 | 4 | import transformers 5 | 6 | @dataclass 7 | class ModelArguments: 8 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 9 | version: Optional[str] = field(default="v0") 10 | freeze_backbone: bool = field(default=False) 11 | tune_mm_mlp_adapter: bool = field(default=False) 12 | vision_tower: Optional[str] = field(default=None) 13 | mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer 14 | pretrain_mm_mlp_adapter: Optional[str] = field(default=None) 15 | mm_projector_type: Optional[str] = field(default='linear') 16 | mm_use_im_start_end: bool = field(default=False) 17 | mm_use_im_patch_token: bool = field(default=True) 18 | mm_patch_merge_type: Optional[str] = field(default='flat') 19 | mm_vision_select_feature: Optional[str] = field(default="patch") 20 | resampler_hidden_size: Optional[int] = field(default=768) 21 | num_queries: Optional[int] = field(default=128) 22 | num_resampler_layers: Optional[int] = field(default=3) 23 | tune_vision_tower: bool = field(default=False) 24 | tune_entire_model: bool = field(default=False) 25 | tune_vit_from_layer: Optional[int] = field(default=100) 26 | tune_embed_tokens: Optional[int] = field(default=False) 27 | 28 | 29 | @dataclass 30 | class DataArguments: 31 | data_path: str = field(default=None, 32 | metadata={"help": "Path to the training data."}) 33 | lazy_preprocess: bool = False 34 | is_multimodal: bool = False 35 | image_folder: Optional[str] = field(default=None) 36 | image_aspect_ratio: str = 'square' 37 | 38 | 39 | @dataclass 40 | class TrainingArguments(transformers.TrainingArguments): 41 | cache_dir: Optional[str] = field(default=None) 42 | optim: str = field(default="adamw_torch") 43 | remove_unused_columns: bool = field(default=False) 44 | freeze_mm_mlp_adapter: bool = field(default=False) 45 | mpt_attn_impl: Optional[str] = field(default="triton") 46 | model_max_length: int = field( 47 | default=512, 48 | metadata={ 49 | "help": 50 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 51 | }, 52 | ) 53 | double_quant: bool = field( 54 | default=True, 55 | metadata={"help": "Compress the quantization statistics through double quantization."} 56 | ) 57 | quant_type: str = field( 58 | default="nf4", 59 | metadata={"help": "Quantization data type to use. 
Should be one of `fp4` or `nf4`."} 60 | ) 61 | bits: int = field( 62 | default=16, 63 | metadata={"help": "How many bits to use."} 64 | ) 65 | lora_enable: bool = False 66 | lora_r: int = 64 67 | lora_alpha: int = 16 68 | lora_dropout: float = 0.05 69 | lora_weight_path: str = "" 70 | lora_bias: str = "none" 71 | mm_projector_lr: Optional[float] = None 72 | group_by_modality_length: bool = field(default=False) 73 | vision_tower_lr: Optional[float] = None 74 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/data/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/data/preprocess/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/default.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('default') 15 | def preprocess_default( 16 | sources: Sequence[str], 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conversations = [] 21 | for source in sources: 22 | header = f"{conversation_lib.default_conversation.system}\n\n" 23 | conversation = _add_speaker_and_signal(header, source) 24 | conversations.append(conversation) 25 | 26 | # tokenize conversations 27 | def get_tokenize_len(prompts): 28 | return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] 29 | 30 | if has_image: 31 | input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] 32 | else: 33 | conversations_tokenized = _tokenize_fn(conversations, tokenizer) 34 | input_ids = conversations_tokenized["input_ids"] 35 | 36 | targets = copy.deepcopy(input_ids) 37 | for target, source in zip(targets, sources): 38 | if has_image: 39 | tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) 40 | else: 41 | tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] 42 | speakers =
[sentence["from"] for sentence in source] 43 | _mask_targets(target, tokenized_lens, speakers) 44 | 45 | return dict(input_ids=input_ids, labels=targets) 46 | 47 | 48 | def _tokenize_fn(strings: Sequence[str], 49 | tokenizer: transformers.PreTrainedTokenizer) -> Dict: 50 | """Tokenize a list of strings.""" 51 | tokenized_list = [ 52 | tokenizer( 53 | text, 54 | return_tensors="pt", 55 | padding="longest", 56 | max_length=tokenizer.model_max_length, 57 | truncation=True, 58 | ) for text in strings 59 | ] 60 | input_ids = labels = [ 61 | tokenized.input_ids[0] for tokenized in tokenized_list 62 | ] 63 | input_ids_lens = labels_lens = [ 64 | tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() 65 | for tokenized in tokenized_list 66 | ] 67 | return dict( 68 | input_ids=input_ids, 69 | labels=labels, 70 | input_ids_lens=input_ids_lens, 71 | labels_lens=labels_lens, 72 | ) 73 | 74 | 75 | def _add_speaker_and_signal(header, source, get_conversation=True): 76 | """Add speaker and start/end signal on each round.""" 77 | BEGIN_SIGNAL = "### " 78 | END_SIGNAL = "\n" 79 | conversation = header 80 | for sentence in source: 81 | from_str = sentence["from"] 82 | if from_str.lower() == "human": 83 | from_str = conversation_lib.default_conversation.roles[0] 84 | elif from_str.lower() == "gpt": 85 | from_str = conversation_lib.default_conversation.roles[1] 86 | else: 87 | from_str = 'unknown' 88 | sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + 89 | sentence["value"] + END_SIGNAL) 90 | if get_conversation: 91 | conversation += sentence["value"] 92 | conversation += BEGIN_SIGNAL 93 | return conversation 94 | 95 | 96 | def _mask_targets(target, tokenized_lens, speakers): 97 | # cur_idx = 0 98 | cur_idx = tokenized_lens[0] 99 | tokenized_lens = tokenized_lens[1:] 100 | target[:cur_idx] = IGNORE_INDEX 101 | for tokenized_len, speaker in zip(tokenized_lens, speakers): 102 | if speaker == "human": 103 | target[cur_idx + 2:cur_idx + tokenized_len] = IGNORE_INDEX 104 | cur_idx += tokenized_len 105 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/llama2.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('llama_2') 15 | def preprocess_llama_2( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | 39 | if has_image: 40 | input_ids = torch.stack( 41 | 
[tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 42 | else: 43 | input_ids = tokenizer( 44 | conversations, 45 | return_tensors="pt", 46 | padding="longest", 47 | max_length=tokenizer.model_max_length, 48 | truncation=True, 49 | ).input_ids 50 | 51 | targets = input_ids.clone() 52 | 53 | assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 54 | 55 | # Mask targets 56 | sep = "[/INST] " 57 | for conversation, target in zip(conversations, targets): 58 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 59 | 60 | rounds = conversation.split(conv.sep2) 61 | cur_len = 1 62 | target[:cur_len] = IGNORE_INDEX 63 | for i, rou in enumerate(rounds): 64 | if rou == "": 65 | break 66 | 67 | parts = rou.split(sep) 68 | if len(parts) != 2: 69 | break 70 | parts[0] += sep 71 | 72 | if has_image: 73 | round_len = len(tokenizer_image_token(rou, tokenizer)) 74 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 75 | else: 76 | round_len = len(tokenizer(rou).input_ids) 77 | instruction_len = len(tokenizer(parts[0]).input_ids) - 2 78 | 79 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/mpt.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('mpt') 15 | def preprocess_mpt( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], 39 | dim=0) 40 | targets = input_ids.clone() 41 | assert conv.sep_style == conversation_lib.SeparatorStyle.MPT 42 | 43 | # Mask targets 44 | sep = conv.sep + conv.roles[1] 45 | for conversation, target in zip(conversations, targets): 46 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 47 | 48 | rounds = conversation.split(conv.sep) 49 | re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt 50 | for conv_idx in 
range(3, len(rounds), 2): 51 | re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2])) # user + gpt 52 | cur_len = 0 53 | target[:cur_len] = IGNORE_INDEX 54 | for i, rou in enumerate(re_rounds): 55 | if rou == "": 56 | break 57 | 58 | parts = rou.split(sep) 59 | if len(parts) != 2: 60 | break 61 | parts[0] += sep 62 | round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer)) 63 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) 64 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 65 | 66 | cur_len += round_len 67 | target[cur_len:] = IGNORE_INDEX 68 | 69 | if cur_len < tokenizer.model_max_length: 70 | if cur_len != total_len: 71 | target[:] = IGNORE_INDEX 72 | print( 73 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 74 | f" (ignored)" 75 | ) 76 | 77 | return dict( 78 | input_ids=input_ids, 79 | labels=targets, 80 | ) 81 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/phi.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('phi') 15 | def preprocess_phi( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # print('00000000000', sources) 24 | # Apply prompt templates 25 | conversations = [] 26 | 27 | for i, source in enumerate(sources): 28 | if roles[source[0]["from"]] != conv.roles[0]: 29 | # Skip the first one if it is not from human 30 | source = source[1:] 31 | 32 | conv.messages = [] 33 | for j, sentence in enumerate(source): 34 | role = roles[sentence["from"]] 35 | assert role == conv.roles[j % 2], f"{i}" 36 | conv.append_message(role, sentence["value"]) 37 | conversations.append(conv.get_prompt()) 38 | # Tokenize conversations 39 | if has_image: 40 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 41 | else: 42 | input_ids = tokenizer( 43 | conversations, 44 | return_tensors="pt", 45 | padding="longest", 46 | max_length=tokenizer.model_max_length, 47 | truncation=True, 48 | ).input_ids 49 | 50 | targets = input_ids.clone() 51 | 52 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 53 | # print(tokenizer) 54 | # Mask targets 55 | sep = conv.sep + conv.roles[1] + ": " 56 | for conversation, target in zip(conversations, targets): 57 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) + conversation.count(conv.sep2) 58 | 59 | rounds = conversation.split(conv.sep2) 60 | cur_len = 0 61 | # target[:cur_len] = IGNORE_INDEX 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | if len(parts) != 2: 68 | break 69 | parts[0] += sep 70 | 71 | if has_image: 72 | round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 73 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 74 | else: 
75 | round_len = len(tokenizer(rou).input_ids) + 1 76 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 77 | 78 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 79 | 80 | cur_len += round_len 81 | target[cur_len:] = IGNORE_INDEX 82 | 83 | if cur_len < tokenizer.model_max_length: 84 | if cur_len != total_len: 85 | print( 86 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 87 | f" (ignored)" 88 | ) 89 | print("number of rounds: ", len(rounds) - 1) 90 | print("rounds: ", rounds[:-1]) 91 | print("conversation: ", conversations) 92 | # print(target) 93 | # print(input_ids) 94 | target[:] = IGNORE_INDEX 95 | 96 | return dict( 97 | input_ids=input_ids, 98 | labels=targets, 99 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/plain.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('plain') 15 | def preprocess_plain( 16 | sources: Sequence[str], 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | # add end signal and concatenate together 21 | conversations = [] 22 | for source in sources: 23 | assert len(source) == 2 24 | assert DEFAULT_IMAGE_TOKEN in source[0]['value'] 25 | source[0]['value'] = DEFAULT_IMAGE_TOKEN 26 | conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep 27 | conversations.append(conversation) 28 | # tokenize conversations 29 | input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] 30 | targets = copy.deepcopy(input_ids) 31 | for target, source in zip(targets, sources): 32 | tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) 33 | target[:tokenized_len] = IGNORE_INDEX 34 | 35 | return dict(input_ids=input_ids, labels=targets) 36 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/stablelm.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('stablelm') 15 | def preprocess_stablelm( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # print('00000000000', sources) 24 | # Apply prompt templates 25 | conversations = [] 26 | 27 | for i, source in enumerate(sources): 28 | if roles[source[0]["from"]] != conv.roles[0]: 29 | # Skip the first one if it is not from 
human 30 | source = source[1:] 31 | 32 | conv.messages = [] 33 | for j, sentence in enumerate(source): 34 | role = roles[sentence["from"]] 35 | assert role == conv.roles[j % 2], f"{i}" 36 | conv.append_message(role, sentence["value"]) 37 | conversations.append(conv.get_prompt()) 38 | # Tokenize conversations 39 | if has_image: 40 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 41 | else: 42 | input_ids = tokenizer( 43 | conversations, 44 | return_tensors="pt", 45 | padding="longest", 46 | max_length=tokenizer.model_max_length, 47 | truncation=True, 48 | ).input_ids 49 | 50 | targets = input_ids.clone() 51 | 52 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 53 | # print(tokenizer) 54 | # Mask targets 55 | sep = conv.sep + conv.roles[1] + ": " 56 | for conversation, target in zip(conversations, targets): 57 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 58 | 59 | rounds = conversation.split(conv.sep2) 60 | cur_len = 0 61 | target[:cur_len] = IGNORE_INDEX 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | # print('i rou, parts', i, rou, parts) 68 | if len(parts) != 2: 69 | break 70 | parts[0] += sep 71 | # print('after add sep, parts', parts) 72 | 73 | if has_image: 74 | round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # for eos_token 75 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 76 | else: 77 | round_len = len(tokenizer(rou).input_ids) + 1 # for eos_token 78 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 79 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/tiny_llama.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | 13 | 14 | @register_preprocess('tiny_llama') 15 | def preprocess_tiny_llama( 16 | sources, 17 | tokenizer: transformers.PreTrainedTokenizer, 18 | has_image: bool = False 19 | ) -> Dict: 20 | conv = conversation_lib.default_conversation.copy() 21 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 22 | 23 | # Apply prompt templates 24 | conversations = [] 25 | for i, source in enumerate(sources): 26 | if roles[source[0]["from"]] != conv.roles[0]: 27 | # Skip the first one if it is not from human 28 | source = source[1:] 29 | 30 | conv.messages = [] 31 | for j, sentence in enumerate(source): 32 | role = roles[sentence["from"]] 33 | assert role == conv.roles[j % 2], f"{i}" 34 | conv.append_message(role, sentence["value"]) 35 | conversations.append(conv.get_prompt()) 36 | 37 | # Tokenize conversations 38 | 39 | if has_image: 40 | input_ids = torch.stack( 41 | [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 42 | else: 43 | input_ids = tokenizer( 44 | conversations, 45 | return_tensors="pt", 46 | padding="longest", 47 | max_length=tokenizer.model_max_length, 48 | truncation=True, 49 | ).input_ids 50 | 51 | targets = input_ids.clone() 52 | 53 | assert conv.sep_style == conversation_lib.SeparatorStyle.TINY_LLAMA 54 | 55 | for conversation, target in zip(conversations, targets): 56 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 57 | 58 | rounds = conversation.split(conv.sep2) 59 | cur_len = 1 60 | target[:cur_len] = IGNORE_INDEX 61 | sep = "<|assistant|>\n" 62 | for i, rou in enumerate(rounds): 63 | if rou == "": 64 | break 65 | 66 | parts = rou.split(sep) 67 | if len(parts) != 2: 68 | # print("current part len: ", len(parts)) 69 | break 70 | parts[0] = parts[0] + sep 71 | 72 | if has_image: 73 | round_len = len(tokenizer_image_token(rou, tokenizer)) 74 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 75 | else: 76 | round_len = len(tokenizer(rou).input_ids) 77 | instruction_len = len(tokenizer(parts[0]).input_ids) - 1 78 | 79 | target[cur_len: cur_len + instruction_len] = IGNORE_INDEX 80 | 81 | cur_len += round_len 82 | target[cur_len:] = IGNORE_INDEX 83 | 84 | if cur_len < tokenizer.model_max_length: 85 | if cur_len != total_len: 86 | target[:] = IGNORE_INDEX 87 | print( 88 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
89 | f" (ignored)" 90 | ) 91 | 92 | return dict( 93 | input_ids=input_ids, 94 | labels=targets, 95 | ) -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/preprocess/v1.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence, List 2 | import copy 3 | 4 | import transformers 5 | import torch 6 | 7 | from tinyllava.data.process import register_preprocess 8 | from tinyllava.mm_utils import tokenizer_image_token 9 | from tinyllava import conversation as conversation_lib 10 | from tinyllava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ 11 | DEFAULT_IM_END_TOKEN 12 | import tokenizers 13 | 14 | from packaging import version 15 | 16 | IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') 17 | 18 | 19 | @register_preprocess('v1') 20 | def preprocess_v1( 21 | sources, 22 | tokenizer: transformers.PreTrainedTokenizer, 23 | has_image: bool = False 24 | ) -> Dict: 25 | conv = conversation_lib.default_conversation.copy() 26 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 27 | 28 | # Apply prompt templates 29 | conversations = [] 30 | for i, source in enumerate(sources): 31 | if roles[source[0]["from"]] != conv.roles[0]: 32 | # Skip the first one if it is not from human 33 | source = source[1:] 34 | 35 | conv.messages = [] 36 | for j, sentence in enumerate(source): 37 | role = roles[sentence["from"]] 38 | assert role == conv.roles[j % 2], f"{i}" 39 | conv.append_message(role, sentence["value"]) 40 | conversations.append(conv.get_prompt()) 41 | 42 | # Tokenize conversations 43 | 44 | if has_image: 45 | input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) 46 | else: 47 | input_ids = tokenizer( 48 | conversations, 49 | return_tensors="pt", 50 | padding="longest", 51 | max_length=tokenizer.model_max_length, 52 | truncation=True, 53 | ).input_ids 54 | 55 | targets = input_ids.clone() 56 | 57 | assert conv.sep_style == conversation_lib.SeparatorStyle.TWO 58 | 59 | # Mask targets 60 | sep = conv.sep + conv.roles[1] + ": " 61 | for conversation, target in zip(conversations, targets): 62 | total_len = int(target.ne(tokenizer.pad_token_id).sum()) 63 | 64 | rounds = conversation.split(conv.sep2) 65 | cur_len = 1 66 | target[:cur_len] = IGNORE_INDEX 67 | for i, rou in enumerate(rounds): 68 | if rou == "": 69 | break 70 | 71 | parts = rou.split(sep) 72 | if len(parts) != 2: 73 | break 74 | parts[0] += sep 75 | 76 | if has_image: 77 | round_len = len(tokenizer_image_token(rou, tokenizer)) 78 | instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 79 | else: 80 | round_len = len(tokenizer(rou).input_ids) 81 | instruction_len = len(tokenizer(parts[0]).input_ids) - 2 82 | 83 | if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14: 84 | round_len -= 1 85 | instruction_len -= 1 86 | 87 | target[cur_len : cur_len + instruction_len] = IGNORE_INDEX 88 | 89 | cur_len += round_len 90 | target[cur_len:] = IGNORE_INDEX 91 | 92 | if cur_len < tokenizer.model_max_length: 93 | if cur_len != total_len: 94 | 95 | print( 96 | f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
97 | f" (ignored)" 98 | ) 99 | # print("number of rounds: ", len(rounds) - 1) 100 | # print("rounds: ", rounds[:-1]) 101 | # print("conversation: ", conversations) 102 | # print(target) 103 | # print(input_ids) 104 | target[:] = IGNORE_INDEX 105 | 106 | return dict( 107 | input_ids=input_ids, 108 | labels=targets, 109 | ) 110 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/data/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | from typing import Dict, Optional, Sequence, List 4 | 5 | import transformers 6 | 7 | from tinyllava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 8 | from tinyllava import conversation as conversation_lib 9 | from tinyllava.arguments import * 10 | 11 | PREPROCESS_REGISTRY = {} 12 | 13 | def register_preprocess(name): 14 | def register_preprocess_cls(cls): 15 | if name in PREPROCESS_REGISTRY: 16 | return PREPROCESS_REGISTRY[name] 17 | 18 | PREPROCESS_REGISTRY[name] = cls 19 | return cls 20 | 21 | return register_preprocess_cls 22 | 23 | 24 | def import_modules(modules_dir, namespace): 25 | for file in os.listdir(modules_dir): 26 | path = os.path.join(modules_dir, file) 27 | 28 | if ( 29 | not file.startswith("_") 30 | and not file.startswith(".") 31 | and (file.endswith(".py") or os.path.isdir(path)) 32 | ): 33 | module_name = file[: file.find(".py")] if file.endswith(".py") else file 34 | importlib.import_module(namespace + "." + module_name) 35 | 36 | models_dir = os.path.join(os.path.dirname(__file__), 'preprocess') 37 | import_modules(models_dir, "tinyllava.data.preprocess") 38 | 39 | 40 | def PreprocessSelect(version): 41 | result = PREPROCESS_REGISTRY.get(version, None) 42 | if result is None: 43 | for name in PREPROCESS_REGISTRY.keys(): 44 | if version in name: 45 | result = PREPROCESS_REGISTRY[name] 46 | break 47 | if result is None: 48 | result = PREPROCESS_REGISTRY['default'] 49 | return result 50 | 51 | 52 | 53 | def preprocess_multimodal( 54 | sources: Sequence[str], 55 | data_args: DataArguments 56 | ) -> Dict: 57 | is_multimodal = data_args.is_multimodal 58 | if not is_multimodal: 59 | return sources 60 | 61 | for source in sources: 62 | for sentence in source: 63 | if DEFAULT_IMAGE_TOKEN in sentence['value']: 64 | sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() 65 | sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] 66 | sentence['value'] = sentence['value'].strip() 67 | if "mmtag" in conversation_lib.default_conversation.version: 68 | sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, 69 | '' + DEFAULT_IMAGE_TOKEN + '') 70 | replace_token = DEFAULT_IMAGE_TOKEN 71 | if data_args.mm_use_im_start_end: 72 | replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN 73 | sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) 74 | 75 | return sources 76 | 77 | 78 | def preprocess( 79 | sources: Sequence[str], 80 | tokenizer: transformers.PreTrainedTokenizer, 81 | has_image: bool = False 82 | ) -> Dict: 83 | return PreprocessSelect(conversation_lib.default_conversation.version)(sources, tokenizer, has_image) 84 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/eval/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/clustering/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import logging 9 | import os 10 | import numpy as np 11 | import random 12 | 13 | 14 | def seed_everything(seed: int = 42): 15 | """ 16 | Function to set seed for random number generators for reproducibility. 17 | 18 | Args: 19 | seed: The seed value to use for random number generators. Default is 42. 20 | 21 | Returns: 22 | None 23 | """ 24 | # Set seed values for various random number generators 25 | random.seed(seed) 26 | os.environ["PYTHONHASHSEED"] = str(seed) 27 | np.random.seed(seed) 28 | torch.manual_seed(seed) 29 | torch.cuda.manual_seed(seed) 30 | torch.cuda.manual_seed_all(seed) 31 | 32 | # Ensure deterministic behavior for CUDA algorithms 33 | torch.backends.cudnn.deterministic = True 34 | torch.backends.cudnn.benchmark = False 35 | 36 | 37 | # def get_logger( 38 | # file_name="logger.log", level=logging.INFO, stdout=False 39 | # ) -> logging.Logger: 40 | # """ 41 | # When the level is set to "logging.INFO", the debugging logs will not be saved (lower level). 42 | # """ 43 | # # See https://www.loggly.com/ultimate-guide/python-logging-basics/ for more information about pyhton logging module 44 | # logger = logging.getLogger() # uses the module name 45 | # # set log level 46 | # logger.setLevel(level) 47 | # logger.handlers = [] 48 | # # define file handler and set formatter 49 | # file_handler = logging.FileHandler( 50 | # file_name 51 | # ) # or use logging.handlers.WatchedFileHandler(os.environ.get("LOGFILE", file_name)) 52 | # # define formatter 53 | # formatter = logging.Formatter( 54 | # "%(asctime)s : %(levelname)s : %(name)s : %(message)s" 55 | # ) # or use logging.BASIC_FORMAT 56 | # file_handler.setFormatter(formatter) 57 | 58 | # stdout_handler = ( 59 | # logging.StreamHandler() 60 | # ) # .setLevel(logging.DEBUG) #.setFormatter(CustomFormatter(fmt)) 61 | 62 | # # add handler to logger 63 | # # if not logger.hasHandlers(): 64 | # logger.addHandler(file_handler) 65 | # if stdout: 66 | # logger.addHandler(stdout_handler) 67 | 68 | # return logger 69 | 70 | 71 | # import logging 72 | 73 | 74 | def get_logger( 75 | file_name: str = "logger.log", level: int = logging.INFO, stdout: bool = False 76 | ) -> logging.Logger: 77 | """ 78 | Initialize and configure the logger object to save log entries to a file and optionally print to stdout. 79 | 80 | :param file_name: The name of the log file. 81 | :param level: The logging level to use (default: INFO). 82 | :param stdout: Whether to enable printing log entries to stdout (default: False). 83 | :return: A configured logging.Logger instance. 
84 | """ 85 | logger = logging.getLogger(__name__) 86 | 87 | # Set the logging level 88 | logger.setLevel(level) 89 | 90 | # Remove any existing handlers from the logger 91 | logger.handlers = [] 92 | 93 | # Create a file handler for the logger 94 | file_handler = logging.FileHandler(file_name) 95 | 96 | # Define the formatter for the log entries 97 | formatter = logging.Formatter( 98 | "%(asctime)s : %(levelname)s : %(name)s : %(message)s" 99 | ) 100 | 101 | # Set the formatter for the file handler 102 | file_handler.setFormatter(formatter) 103 | 104 | # Add the file handler to the logger 105 | logger.addHandler(file_handler) 106 | 107 | # Optionally add a stdout handler to the logger 108 | if stdout: 109 | stdout_handler = logging.StreamHandler() 110 | stdout_handler.setFormatter(formatter) 111 | logger.addHandler(stdout_handler) 112 | 113 | # Return the configured logger instance 114 | return logger 115 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/compute_centroids.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import numpy as np 4 | import os 5 | import logging 6 | from tinyllava.eval.score.SemDeDup.clustering.clustering import compute_centroids 7 | from tinyllava.eval.score.SemDeDup.clustering.sort_clusters import assign_and_sort_clusters 8 | from tinyllava.eval.score.SemDeDup.execute_semdedup import execute_semdedup 9 | from tinyllava.eval.score.SemDeDup.extract_dedup_data import extract_pruned_data 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.addHandler(logging.StreamHandler()) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--sim_metric", type=str, default="cosine") 18 | parser.add_argument("--keep_hard", action='store_true') 19 | parser.add_argument("--Kmeans_with_cos_dist", action='store_true') 20 | parser.add_argument("--emb_memory_loc", type=str, default="emb.npy") 21 | parser.add_argument("--sorted_clusters_file_loc", type=str, default="./sorted_clusters") 22 | parser.add_argument("--save_folder", type=str, default="./save_folder") 23 | parser.add_argument("--output_indices_path", type=str, default="./selected_indices.json") 24 | parser.add_argument("--ncentroids", type=int, default=500) # proportional to dataset size 25 | parser.add_argument("--niter", type=int, default=100) 26 | parser.add_argument("--seed", type=int, default=1234) 27 | parser.add_argument("--prune_p", type=float, default=.0) 28 | parser.add_argument("--eps_list", type=float, nargs='+', default=[0.48, 0.47, 0.46, 0.45, 0.44]) 29 | 30 | args = parser.parse_args() 31 | 32 | ## -- Fix the seed 33 | SEED = args.seed 34 | random.seed(SEED) 35 | 36 | emb_memory = np.load(args.emb_memory_loc) 37 | dataset_size, emb_size = emb_memory.shape 38 | # Normalize since SemDeDup uses Spherical Kmeans clustering with normalized embeddings, referring to paper, even in language modality with OPT model. 
39 | emb_memory = emb_memory / np.linalg.norm(emb_memory, axis=-1, keepdims=True) 40 | 41 | compute_centroids( 42 | data=emb_memory, 43 | ncentroids=args.ncentroids, 44 | niter=args.niter, 45 | seed=args.seed, 46 | Kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 47 | save_folder=args.save_folder, 48 | logger=logger, 49 | verbose=True, 50 | ) 51 | 52 | indices_list = np.array(list(range(dataset_size))) 53 | assign_and_sort_clusters( 54 | data=emb_memory, 55 | paths_list=indices_list, 56 | sim_metric=args.sim_metric, 57 | keep_hard=args.keep_hard, 58 | kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 59 | save_folder=args.save_folder, 60 | sorted_clusters_file_loc=args.sorted_clusters_file_loc, 61 | cluster_ids=range(0, args.ncentroids), 62 | logger=logger, 63 | ) 64 | 65 | execute_semdedup( 66 | embs=emb_memory, 67 | cluster_ids=range(0, args.ncentroids), 68 | save_loc=args.save_folder, 69 | sorted_clusters_path=args.sorted_clusters_file_loc, 70 | eps_list=args.eps_list, 71 | which_to_keep= "hard" if args.keep_hard else "easy", 72 | ) 73 | 74 | target_length = int(args.prune_p * dataset_size) 75 | extract_pruned_data( 76 | sorted_clusters_path=args.sorted_clusters_file_loc, 77 | semdedup_pruning_tables_path=os.path.join(args.save_folder, "dataframes"), 78 | eps_list=args.eps_list, 79 | num_clusters=args.ncentroids, 80 | output_indices_path=args.output_indices_path, 81 | target_length=target_length, 82 | ) 83 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/SemDeDup/extract_dedup_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from tqdm import tqdm 9 | import pickle 10 | import numpy as np 11 | import json 12 | 13 | IMAGE_NAME_INDEX = 0 14 | 15 | def extract_pruned_data( 16 | sorted_clusters_path, 17 | semdedup_pruning_tables_path, 18 | eps_list, 19 | num_clusters, 20 | output_indices_path, 21 | target_length, 22 | retreive_kept_samples=True, 23 | ): 24 | 25 | target_close = [] 26 | for eps in eps_list: 27 | 28 | ## -- list of paths to the examples we want to keep/remove. 29 | example_paths = [] 30 | 31 | for cluster_id in tqdm(range(0, num_clusters)): 32 | 33 | cluster_i = np.load( 34 | os.path.join(sorted_clusters_path, f"cluster_{cluster_id}.npy") 35 | ) 36 | with open( 37 | f"{semdedup_pruning_tables_path}/cluster_{cluster_id}.pkl", "rb" 38 | ) as file: 39 | semdedup_pruning_tables = pickle.load(file) 40 | 41 | ## -- See which examples to keep/remove from this cluster. 42 | ## -- Use retreive_kept_samples=True when kept dataset size <= 50%. This will return a smaller output text file, 43 | ## -- semdedup_pruning_tables contain True values for the examples to be removed. 44 | images_to_keep_or_remove = semdedup_pruning_tables[f"eps={eps}"][ 45 | semdedup_pruning_tables[f"eps={eps}"] == (not retreive_kept_samples) 46 | ].index.to_numpy() 47 | if "indices" in semdedup_pruning_tables.columns: 48 | cluster_i = cluster_i[semdedup_pruning_tables["indices"]] 49 | ## -- retrieve only the examples we want and add to the list. 
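## -- The `eps=<value>` column of the pruning table is True for examples flagged as
## -- duplicates, so filtering on `== (not retreive_kept_samples)` selects the rows we
## -- want (the kept, de-duplicated examples when retreive_kept_samples=True), and their
## -- index positions pick the corresponding rows out of the sorted-cluster array.
## -- Column IMAGE_NAME_INDEX of those rows carries the example identifier (here the
## -- integer dataset indices passed as paths_list in compute_centroids.py), which is
## -- what `example_paths` accumulates below.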
50 | dedup_cluster = cluster_i[images_to_keep_or_remove] 51 | example_paths += dedup_cluster[:, IMAGE_NAME_INDEX].astype("int").tolist() 52 | 53 | if (abs(len(example_paths) - target_length) < abs(len(target_close) - target_length)) and (len(example_paths) - target_length >= 0): 54 | target_close = example_paths 55 | 56 | assert len(target_close) != 0 57 | 58 | num_diff = len(target_close) - target_length 59 | print(f"Selected data - Target length: {num_diff}") 60 | 61 | selected_indices = np.array(target_close) 62 | selected_indices = selected_indices[:target_length] 63 | np.save(output_indices_path, selected_indices) 64 | 65 | return 66 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/coincide/cluster_transferability.py: -------------------------------------------------------------------------------- 1 | # Here, we assume that the embeddings, clip-llava_score is already calculated 2 | # Codes from https://github.com/adymaharana/d2pruning?tab=readme-ov-file 3 | 4 | import os 5 | import math 6 | import argparse 7 | import time 8 | import numpy as np 9 | import faiss 10 | import torch 11 | import copy 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | 14 | from tqdm import tqdm 15 | 16 | if __name__ == "__main__": 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--centroid_embed_path", type=str, required=True) 20 | parser.add_argument("--transferability_path", type=str, required=True, help="Path to output directory") 21 | parser.add_argument("--k", type=int, default=4) 22 | parser.add_argument("--knn_path", type=str, required=True) 23 | 24 | args = parser.parse_args() 25 | 26 | centroid_embed = np.load(args.centroid_embed_path) 27 | centroid_embed = centroid_embed.reshape(-1, 5, 4096) 28 | centroid_embed = centroid_embed[:,:,2048:] 29 | centroid_embed = centroid_embed.reshape(-1, 5*2048) 30 | 31 | cosine_sim = cosine_similarity(centroid_embed, centroid_embed) 32 | 33 | knn_cluster_indices = np.argsort(cosine_sim, axis=-1)[:,::-1][:,:args.k+1] 34 | knn_cluster_similarity = cosine_sim[np.arange(len(cosine_sim))[:,None], knn_cluster_indices] 35 | 36 | np.save(args.knn_path + '_indices.npy', knn_cluster_indices) 37 | np.save(args.knn_path + '_similarity.npy', knn_cluster_similarity) 38 | 39 | mask = cosine_sim > 0.9 40 | cosine_sim[mask] = 0 41 | transfer = cosine_sim.sum(axis=-1) / (~mask).sum(axis=-1) 42 | # transfer = cosine_sim.mean(axis=-1) 43 | np.save(args.transferability_path, transfer) 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/coincide/compute_centroids.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import numpy as np 4 | import os 5 | import logging 6 | from tinyllava.eval.score.coincide.clustering import compute_centroids 7 | 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.StreamHandler()) 10 | logger.setLevel(logging.INFO) 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--sim_metric", type=str, default="cosine") 15 | parser.add_argument("--Kmeans_with_cos_dist", action='store_true') 16 | parser.add_argument("--emb_memory_loc", type=str, default="emb.npy") 17 | parser.add_argument("--save_folder", type=str, default="./save_folder") 18 | parser.add_argument("--ncentroids", type=int, default=500) # proportional to dataset size 19 | 
parser.add_argument("--niter", type=int, default=100) 20 | parser.add_argument("--seed", type=int, default=1234) 21 | 22 | args = parser.parse_args() 23 | 24 | ## -- Fix the seed 25 | SEED = args.seed 26 | random.seed(SEED) 27 | 28 | emb_memory = np.load(args.emb_memory_loc) 29 | dataset_size, emb_size = emb_memory.shape 30 | # Normalize since SemDeDup uses Spherical Kmeans clustering with normalized embeddings, referring to paper, even in language modality with OPT model. 31 | # emb_memory = emb_memory / np.linalg.norm(emb_memory, axis=-1, keepdims=True) 32 | 33 | compute_centroids( 34 | data=emb_memory, 35 | ncentroids=args.ncentroids, 36 | niter=args.niter, 37 | seed=args.seed, 38 | Kmeans_with_cos_dist=args.Kmeans_with_cos_dist, 39 | save_folder=args.save_folder, 40 | logger=logger, 41 | verbose=True, 42 | ) 43 | 44 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/merge_values.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import os 5 | import json 6 | import glob 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--score_path", type=str, default=".") 11 | parser.add_argument("--average", action='store_true') 12 | args = parser.parse_args() 13 | 14 | score_files = sorted(glob.glob(args.score_path + '_[0-9]*.npy')) 15 | 16 | scores = [] 17 | for score_file in score_files: 18 | scores.append(np.load(score_file)) 19 | 20 | scores = np.concatenate(scores) 21 | if args.average: 22 | scores = np.mean(scores, axis=0) 23 | 24 | recover_indices = np.load(args.score_path + '_recover_indices.npy') 25 | scores = scores[recover_indices] 26 | 27 | np.save(args.score_path + '.npy', scores) 28 | 29 | for score_file in score_files: 30 | os.remove(score_file) 31 | os.remove(args.score_path + '_recover_indices.npy') 32 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/eval/score/sort_values.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--score_path", type=str, default=".") 7 | parser.add_argument("--save_path", type=str, default="./temp.json") 8 | args = parser.parse_args() 9 | 10 | scores = np.load(args.score_path) 11 | score_indices = np.argsort(scores) 12 | 13 | np.save(args.save_path, score_indices) 14 | 15 | 16 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from tinyllava.model.language_model.llava_llama import TinyLlavaLlamaForCausalLM, TinyLlavaConfig 2 | from tinyllava.model.language_model.llava_stablelm import TinyLlavaStablelmForCausalLM, TinyLlavaStablelmConfig 3 | from tinyllava.model.language_model.llava_phi import TinyLlavaPhiForCausalLM, TinyLlavaPhiConfig 4 | from tinyllava.model.language_model.llava_qwen import LlavaQwen2Config, LlavaQwen2ForCausalLM 5 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/model/language_model/__init__.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/configuration_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from transformers import PretrainedConfig 7 | 8 | 9 | class QWenConfig(PretrainedConfig): 10 | model_type = "qwen" 11 | keys_to_ignore_at_inference = ["past_key_values"] 12 | 13 | def __init__( 14 | self, 15 | vocab_size=151936, 16 | hidden_size=4096, 17 | num_hidden_layers=32, 18 | num_attention_heads=32, 19 | emb_dropout_prob=0.0, 20 | attn_dropout_prob=0.0, 21 | layer_norm_epsilon=1e-6, 22 | initializer_range=0.02, 23 | max_position_embeddings=8192, 24 | scale_attn_weights=True, 25 | use_cache=True, 26 | bf16=False, 27 | fp16=False, 28 | fp32=False, 29 | kv_channels=128, 30 | rotary_pct=1.0, 31 | rotary_emb_base=10000, 32 | use_dynamic_ntk=True, 33 | use_logn_attn=True, 34 | use_flash_attn="auto", 35 | intermediate_size=22016, 36 | no_bias=True, 37 | tie_word_embeddings=False, 38 | use_cache_quantization=False, 39 | use_cache_kernel=False, 40 | softmax_in_fp32=False, 41 | **kwargs, 42 | ): 43 | self.vocab_size = vocab_size 44 | self.hidden_size = hidden_size 45 | self.intermediate_size = intermediate_size 46 | self.num_hidden_layers = num_hidden_layers 47 | self.num_attention_heads = num_attention_heads 48 | self.emb_dropout_prob = emb_dropout_prob 49 | self.attn_dropout_prob = attn_dropout_prob 50 | self.layer_norm_epsilon = layer_norm_epsilon 51 | self.initializer_range = initializer_range 52 | self.scale_attn_weights = scale_attn_weights 53 | self.use_cache = use_cache 54 | self.max_position_embeddings = max_position_embeddings 55 | self.bf16 = bf16 56 | self.fp16 = fp16 57 | self.fp32 = fp32 58 | self.kv_channels = kv_channels 59 | self.rotary_pct = rotary_pct 60 | self.rotary_emb_base = rotary_emb_base 61 | self.use_dynamic_ntk = use_dynamic_ntk 62 | self.use_logn_attn = use_logn_attn 63 | self.use_flash_attn = use_flash_attn 64 | self.no_bias = no_bias 65 | self.use_cache_quantization = use_cache_quantization 66 | self.use_cache_kernel = use_cache_kernel 67 | self.softmax_in_fp32 = softmax_in_fp32 68 | super().__init__( 69 | tie_word_embeddings=tie_word_embeddings, 70 | **kwargs 71 | ) 72 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/cpp_kernels.py: -------------------------------------------------------------------------------- 1 | from torch.utils import cpp_extension 2 | import pathlib 3 | import os 4 | import subprocess 5 | 6 | def _get_cuda_bare_metal_version(cuda_dir): 7 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 8 | universal_newlines=True) 9 | output = raw_output.split() 10 | release_idx = output.index("release") + 1 11 | release = output[release_idx].split(".") 12 | bare_metal_major = release[0] 13 | bare_metal_minor = release[1][0] 14 | 15 | return raw_output, bare_metal_major, bare_metal_minor 16 | 17 | def _create_build_dir(buildpath): 18 | try: 19 | os.mkdir(buildpath) 20 | except OSError: 21 | if not os.path.isdir(buildpath): 22 | print(f"Creation of the build directory {buildpath} 
failed") 23 | 24 | # Check if cuda 11 is installed for compute capability 8.0 25 | cc_flag = [] 26 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 27 | if int(bare_metal_major) >= 11: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_80,code=sm_80') 30 | if int(bare_metal_minor) >= 7: 31 | cc_flag.append('-gencode') 32 | cc_flag.append('arch=compute_90,code=sm_90') 33 | 34 | # Build path 35 | srcpath = pathlib.Path(__file__).parent.absolute() 36 | buildpath = srcpath / 'build' 37 | _create_build_dir(buildpath) 38 | 39 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 40 | return cpp_extension.load( 41 | name=name, 42 | sources=sources, 43 | build_directory=buildpath, 44 | extra_cflags=['-O3', ], 45 | extra_cuda_cflags=['-O3', 46 | '-gencode', 'arch=compute_70,code=sm_70', 47 | '--use_fast_math'] + extra_cuda_flags + cc_flag, 48 | verbose=1 49 | ) 50 | 51 | extra_flags = [] 52 | 53 | cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp", 54 | "./cache_autogptq_cuda_kernel_256.cu"] 55 | cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags) 56 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/language_model/qwen/llava_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_cluster/tinyllava/model/language_model/qwen/llava_qwen.py -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/model_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | MODEL_REGISTRY = {} 5 | TOKENIZER_REGISTRY = {} 6 | 7 | 8 | def ModelSelect(model_name_or_path): 9 | model = None 10 | for name in MODEL_REGISTRY.keys(): 11 | if name.lower() in model_name_or_path.lower(): 12 | model = MODEL_REGISTRY[name] 13 | if model is None: 14 | model = MODEL_REGISTRY['llama'] 15 | return model 16 | 17 | 18 | def TokenizerSelect(model_name_or_path): 19 | tokenizer_init = None 20 | for name in TOKENIZER_REGISTRY.keys(): 21 | if name.lower() in model_name_or_path.lower(): 22 | tokenizer_init = TOKENIZER_REGISTRY[name] 23 | if tokenizer_init is None: 24 | tokenizer_init = TOKENIZER_REGISTRY['llama'] 25 | return tokenizer_init 26 | 27 | 28 | def register_model(name): 29 | def register_model_cls(cls): 30 | if name in MODEL_REGISTRY: 31 | return MODEL_REGISTRY[name] 32 | 33 | MODEL_REGISTRY[name] = cls 34 | # FIXME: Find a more elegant way to do this 35 | if name == 'phi': 36 | MODEL_REGISTRY['TinyLLaVA-3.1B'] = cls 37 | elif name == 'stablelm': 38 | MODEL_REGISTRY['TinyLLaVA-2.0B'] = cls 39 | elif name == 'llama': 40 | MODEL_REGISTRY['TinyLLaVA-1.5B'] = cls 41 | return cls 42 | 43 | return register_model_cls 44 | 45 | 46 | def register_tokenizer(name): 47 | def register_tokenizer_cls(cls): 48 | if name in TOKENIZER_REGISTRY: 49 | return TOKENIZER_REGISTRY[name] 50 | 51 | TOKENIZER_REGISTRY[name] = cls 52 | # FIXME: Find a more elegant way to do this 53 | if name == 'phi': 54 | TOKENIZER_REGISTRY['TinyLLaVA-3.1B'] = cls 55 | elif name == 'stablelm': 56 | TOKENIZER_REGISTRY['TinyLLaVA-2.0B'] = cls 57 | elif name == 'llama': 58 | TOKENIZER_REGISTRY['TinyLLaVA-1.5B'] = cls 59 | return cls 60 | 61 | return register_tokenizer_cls 62 | 63 
| 64 | def import_models(models_dir, namespace): 65 | for file in os.listdir(models_dir): 66 | path = os.path.join(models_dir, file) 67 | if ( 68 | not file.startswith("_") 69 | and not file.startswith(".") 70 | and file.endswith(".py") 71 | ): 72 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 73 | importlib.import_module(namespace + "." + model_name) 74 | 75 | 76 | # automatically import any Python files in the models/ directory 77 | models_dir = os.path.join(os.path.dirname(__file__), 'language_model') 78 | import_models(models_dir, "tinyllava.model.language_model") -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tinyllava.model.multimodal_encoder.clip_encoder import CLIPVisionTower 3 | from tinyllava.model.multimodal_encoder.siglip_encoder import SigLipVisionTower 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if 'siglip' not in vision_tower.lower(): 9 | if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion'): 10 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 11 | else: 12 | if is_absolute_path_exists or vision_tower.startswith("google") or vision_tower.startswith('bczhou'): 13 | return SigLipVisionTower(vision_tower, vision_tower_cfg, **kwargs) 14 | raise ValueError(f'Unknown vision tower: {vision_tower}') 15 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self): 25 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def feature_select(self, image_forward_outs): 32 | image_features = image_forward_outs.hidden_states[self.select_layer] 33 | if self.select_feature == 'patch': 34 | image_features = image_features[:, 1:] 35 | elif self.select_feature == 'cls_patch': 36 | image_features = image_features 37 | else: 38 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 39 | return image_features 40 | 41 | # @torch.no_grad() 42 | def forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, 
dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | @property 56 | def dummy_feature(self): 57 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 58 | 59 | @property 60 | def dtype(self): 61 | return self.vision_tower.dtype 62 | 63 | @property 64 | def device(self): 65 | return self.vision_tower.device 66 | 67 | @property 68 | def config(self): 69 | if self.is_loaded: 70 | return self.vision_tower.config 71 | else: 72 | return self.cfg_only 73 | 74 | @property 75 | def hidden_size(self): 76 | return self.config.hidden_size 77 | 78 | @property 79 | def num_patches_per_side(self): 80 | return self.config.image_size // self.config.patch_size 81 | 82 | @property 83 | def num_patches(self): 84 | return (self.config.image_size // self.config.patch_size) ** 2 85 | -------------------------------------------------------------------------------- /COINCIDE_cluster/tinyllava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'tinyllava' in config and 'tinyllava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "tinyllava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /COINCIDE_train/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from llava.model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words 
or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 llava_score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | 
parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import 
load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 37 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 38 | answers_file = os.path.expanduser(args.answers_file) 39 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 40 | ans_file = open(answers_file, "w") 41 | for line in tqdm(questions): 42 | idx = line["question_id"] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | cur_prompt = qs 46 | if model.config.mm_use_im_start_end: 47 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 48 | else: 49 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 50 | 51 | conv = conv_templates[args.conv_mode].copy() 52 | conv.append_message(conv.roles[0], qs) 53 | conv.append_message(conv.roles[1], None) 54 | prompt = conv.get_prompt() 55 | 56 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 57 | 58 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 59 | image_tensor = process_images([image], image_processor, model.config)[0] 60 | 61 | with torch.inference_mode(): 62 | output_ids = model.generate( 63 | input_ids, 64 | images=image_tensor.unsqueeze(0).half().cuda(), 65 | image_sizes=[image.size], 66 | do_sample=True if args.temperature > 0 else False, 67 | temperature=args.temperature, 68 | top_p=args.top_p, 69 | num_beams=args.num_beams, 70 | # no_repeat_ngram_size=3, 71 | max_new_tokens=1024, 72 | use_cache=True) 73 | 74 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 75 | 76 | ans_id = shortuuid.uuid() 77 | ans_file.write(json.dumps({"question_id": idx, 78 | "prompt": cur_prompt, 79 | "text": outputs, 80 | "answer_id": ans_id, 81 | "model_id": model_name, 82 | "metadata": {}}) + "\n") 83 | ans_file.flush() 84 | ans_file.close() 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 89 | parser.add_argument("--model-base", type=str, default=None) 90 | parser.add_argument("--image-folder", type=str, default="") 91 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 92 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 93 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 94 | parser.add_argument("--num-chunks", type=int, default=1) 95 | parser.add_argument("--chunk-idx", type=int, default=0) 96 | parser.add_argument("--temperature", type=float, default=0.2) 97 | parser.add_argument("--top_p", type=float, default=None) 98 | parser.add_argument("--num_beams", type=int, 
default=1) 99 | args = parser.parse_args() 100 | 101 | eval_model(args) 102 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x 
in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['llava_score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/eval/to_seed_image.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--original-file", type=str, default="tables/question.jsonl") 8 | parser.add_argument("--modify-file", type=str, default="answer.jsonl") 9 | args = parser.parse_args() 10 | 11 | seedbench_list = [json.loads(q) for q in open(args.original_file, "r")] 12 | new_list = [] 13 | count = 0 14 | for idx in range(len(seedbench_list)): 15 | if seedbench_list[idx]['image'].startswith('SEED-Bench-video-image'): 16 | continue 17 | else: 18 | new_list.append(seedbench_list[idx]) 19 | 20 | mod_file = os.path.expanduser(args.modify_file) 21 | os.makedirs(os.path.dirname(mod_file), exist_ok=True) 22 | mod_file = open(args.modify_file, "w") 23 | for idx in range(len(new_list)): 24 | 25 | mod_file.write(json.dumps(new_list[idx]) + "\n") 26 | 27 | mod_file.close() 28 | 29 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from llava.model.language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 3 | from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 4 | 5 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from 
llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base 
~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = 
False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self, device_map=None): 25 | if self.is_loaded: 26 | print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name)) 27 | return 28 | 29 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 30 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) 31 | self.vision_tower.requires_grad_(False) 32 | 33 | self.is_loaded = True 34 | 35 | def feature_select(self, image_forward_outs): 36 | image_features = image_forward_outs.hidden_states[self.select_layer] 37 | if self.select_feature == 'patch': 38 | image_features = image_features[:, 1:] 39 | elif self.select_feature == 'cls_patch': 40 | image_features = image_features 41 | else: 42 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 43 | return image_features 44 | 45 | @torch.no_grad() 46 | def forward(self, images): 47 | if type(images) is list: 48 | image_features = [] 49 | for image in images: 50 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 51 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 52 | image_features.append(image_feature) 53 | else: 54 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 55 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 56 | 57 | return image_features 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.device 70 | 71 | @property 72 | def config(self): 73 | if self.is_loaded: 74 | return self.vision_tower.config 75 | else: 76 | return self.cfg_only 77 | 78 | @property 79 | def hidden_size(self): 80 | return self.config.hidden_size 81 | 82 | @property 83 | def num_patches_per_side(self): 84 | return self.config.image_size // self.config.patch_size 85 | 86 | @property 87 | def num_patches(self): 88 | return (self.config.image_size // self.config.patch_size) ** 2 89 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 
| 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/__init__.py -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/COINCIDE_train/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /COINCIDE_train/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more 
memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. 
They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. 
Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What color is the fire hydrant in the image? 3 | === 4 | Answer: 5 | The fire hydrant in the image is red. 6 | === 7 | Question: 8 | Is the fire hydrant fully covered by snow or partially visible? 9 | === 10 | Answer: 11 | The fire hydrant is partially visible, with a significant portion of it covered by the deep snow. 12 | === 13 | Question: 14 | What is the condition of the sidewalk near the fire hydrant? 15 | === 16 | Answer: 17 | The sidewalk near the fire hydrant has been recently plowed, so it is relatively clear of snow compared to the surrounding area. 18 | === 19 | Question: 20 | What does the image suggest about the winter season in the city? 21 | === 22 | Answer: 23 | The image suggests that the winter season in the city has been quite harsh and challenging. The significant snow accumulation seen in the image indicates that the city has experienced heavy snowfall, which can lead to various difficulties and disruptions for its residents. 24 | 25 | For instance, the deep snow covering the fire hydrant could potentially hinder its accessibility in case of emergencies, which poses safety concerns. The recently plowed sidewalk implies that the city's maintenance crews have been actively working to keep the walkways clear and safe for pedestrians, but the sheer amount of snowfall might make it difficult to keep up with the continuous snow removal efforts. 26 | 27 | Furthermore, such extreme winter conditions can impact transportation, leading to traffic issues, delays in public transit, and increased risks of accidents due to slippery roads. It can also cause problems for businesses and schools, as people might face challenges commuting to work or attending classes. Additionally, the heavy snow can put extra strain on infrastructure, such as roofs and power lines, increasing the likelihood of structural damage or power outages. 28 | 29 | In conclusion, the image of the red fire hydrant deep in the snow and the recently plowed sidewalk suggest that the city has faced a particularly severe winter season, with substantial snowfall that has likely caused various challenges and disruptions for its residents and infrastructure. 
-------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is the skier doing? 3 | === 4 | Answer: 5 | The skier is skiing in the snow-covered hills, making his way through the snow and enjoying the beautiful mountain scenery. They are also standing near a trail sign, which indicates their location on the mountain. 6 | === 7 | Question: 8 | Are there any other people or skiers visible in the image? 9 | === 10 | Answer: 11 | No, there is only a single person skiing in the image, and they are surrounded by an area with a lot of bushes. 12 | === 13 | Question: 14 | What kind of skiing is the person doing? Cross-country or downhill? 15 | === 16 | Answer: 17 | Based on what I can see in the photo, it is not possible to definitively determine whether the skier is doing cross-country or downhill skiing. However, given the presence of a trail sign and the fact that they are skiing in the foothills, it is more likely that the skier is engaged in cross-country skiing. Downhill skiing typically takes place on more steep slopes and involves ski lifts, whereas cross-country skiing often occurs on relatively flatter terrain with marked trails. 18 | === 19 | Question: 20 | Can you explain cross-country and downhill skiing a bit more? 21 | === 22 | Answer: 23 | Cross-country skiing and downhill skiing are two distinct types of skiing, each with its own techniques, equipment, and terrain preferences. 24 | 25 | 1. Cross-Country Skiing: 26 | Also known as Nordic skiing, cross-country skiing involves traversing relatively flat terrain or gentle slopes using skis and poles. It is a popular recreational and competitive sport that focuses on endurance, balance, and aerobic fitness. Cross-country skiers use their own locomotion to move across snow-covered terrain, rather than relying on ski lifts or other forms of assistance. 27 | There are two main techniques in cross-country skiing: 28 | - Classic: This technique involves skiing in parallel tracks, using a forward-and-backward gliding motion while keeping the skis parallel to each other. The skier uses their poles to propel themselves forward. 29 | - Skate skiing: This technique is similar to ice skating, where the skier pushes off one ski at an angle and glides on the other. It requires a smooth, groomed surface and is typically faster than the classic technique. 30 | Cross-country ski equipment consists of lightweight skis, boots, bindings, and poles. The boots are more flexible than those used in downhill skiing, allowing for better ankle movement and control. 31 | 32 | 2. Downhill Skiing: 33 | Also known as alpine skiing, downhill skiing involves descending slopes at high speeds using skis and poles for balance and control. This sport is more focused on speed, technique, and navigating challenging terrain, including steep slopes, moguls, and even jumps. 
34 | Downhill skiing can be further categorized into several disciplines, such as slalom, giant slalom, super-G, and downhill racing. Each discipline has its own set of rules, courses, and ski equipment. 35 | Downhill ski equipment includes heavier and stiffer skis, boots, bindings, and poles than those used in cross-country skiing. The boots are more rigid to provide better support and control during high-speed descents and sharp turns. 36 | 37 | In summary, cross-country skiing is an endurance-based sport that involves traveling across flat or gently sloping terrain, while downhill skiing is focused on speed and technique as skiers navigate steeper slopes and challenging terrain. Both sports require specialized equipment and techniques, but they offer different experiences and challenges to participants. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 
6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 
4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. -------------------------------------------------------------------------------- /COINCIDE_train/playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. 
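Note: the caption/answer pairs above (each NNN_caps.txt with its matching NNN_conv.txt) and the system_message.txt files are few-shot material for prompting a GPT model to generate instruction-tuning data, as in the original LLaVA pipeline. The assembly script itself is not included in this repository, so the sketch below is only an illustration of how such a request is typically put together; the build_messages helper, the example captions, and the assumption that every *_caps.txt has a sibling *_conv.txt are introduced here for illustration and are not part of the codebase.

import glob
import json
import os

# Directory layout as shown above in this repository.
PROMPT_DIR = "COINCIDE_train/playground/data/prompts/detail_description"

def build_messages(prompt_dir, query_caps):
    """Assemble a chat request: the system message first, then each
    NNN_caps.txt / NNN_conv.txt pair as an in-context example, and finally
    the captions + bounding boxes of the new image to be described."""
    with open(os.path.join(prompt_dir, "system_message.txt")) as f:
        messages = [{"role": "system", "content": f.read()}]
    for caps_path in sorted(glob.glob(os.path.join(prompt_dir, "*_caps.txt"))):
        conv_path = caps_path.replace("_caps.txt", "_conv.txt")
        with open(caps_path) as f_caps, open(conv_path) as f_conv:
            messages.append({"role": "user", "content": f_caps.read()})
            messages.append({"role": "assistant", "content": f_conv.read()})
    messages.append({"role": "user", "content": query_caps})
    return messages

if __name__ == "__main__":
    # query_caps would normally hold the captions plus object boxes of a new
    # image, in the same format as the NNN_caps.txt examples above.
    example_caps = "A dog sleeping on a red couch.\n\ndog: [0.2, 0.3, 0.8, 0.9]"
    print(json.dumps(build_messages(PROMPT_DIR, example_caps), indent=2))

The resulting messages list could then be sent to a chat model in the same way qa_baseline_gpt35.py calls openai.ChatCompletion.create.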
-------------------------------------------------------------------------------- /COINCIDE_train/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.2.0", "torchvision==0.17.0", "tiktoken", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | 
cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | # eval_single(args.result_file, eval_only_type='video') 63 | 64 | # with open(args.result_upload_file, 'w') as fp: 65 | # for question in data['questions']: 66 | # qid = question['question_id'] 67 | # if qid in results: 68 | # result = results[qid] 69 | # else: 70 | # result = results[int(qid)] 71 | # fp.write(json.dumps({ 72 | # 'question_id': qid, 73 | # 'prediction': result['text'] 74 | # }) + '\n') 75 | 
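The converter scripts in this directory (`convert_gqa_for_eval.py`, `convert_mmbench_for_submission.py`, `convert_mmvet_for_eval.py`, `convert_seed_for_submission.py`, and the VizWiz/VQAv2 converters below) all read the same intermediate format: the answers file produced by the `llava.eval.model_vqa*` entry points is JSON Lines, one object per line with at least `question_id` and `text` fields. A small sketch of reading and writing that format (the helper names are illustrative, not part of the repo):

```python
# Illustrative sketch: the answers .jsonl files read by the converter scripts
# contain one JSON object per line with "question_id" and "text" fields.
import json

def read_answers(path):
    """Return a {question_id: predicted_text} mapping from an answers .jsonl file."""
    answers = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            answers[row["question_id"]] = row["text"]
    return answers

def write_answers(path, rows):
    """Write (question_id, text) pairs in the same JSON Lines layout."""
    with open(path, "w") as f:
        for qid, text in rows:
            f.write(json.dumps({"question_id": qid, "text": text}) + "\n")
```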
-------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, 
required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 
5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | SPLIT="llava_gqa_testdev_balanced" 11 | BASE_PATH="/base_path" 12 | GQADIR="${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/data" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 16 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 17 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 18 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/$SPLIT.jsonl \ 19 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/data/images \ 20 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx 
$IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | 27 | wait 28 | 29 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 40 | 41 | cd $GQADIR 42 | python eval/eval.py --tier testdev_balanced 43 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews 17 | 18 | python llava/eval/eval_gpt_review_bench.py \ 19 | --question ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 20 | --context ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 21 | --rule ${BASE_PATH}/COINCIDE_train/llava/eval/table/rule.json \ 22 | --answer-list \ 23 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 24 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/answers/${CKPT}.jsonl \ 25 | --output \ 26 | ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT}.jsonl 27 | 28 | python llava/eval/summarize_gpt_review.py -f ${BASE_PATH}/COINCIDE_train/playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT}.jsonl 29 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | SPLIT="mmbench_dev_20230712" 6 | BASE_PATH="/base_path" 7 | 8 | python -m llava.eval.model_vqa_mmbench \ 9 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 10 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 11 | --question-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/$SPLIT.tsv \ 12 | --answers-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers/$SPLIT/${CKPT}.jsonl \ 13 | --single-pred-prompt \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | mkdir -p ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir 
${BASE_PATH}/COINCIDE/playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | SPLIT="mmbench_dev_cn_20231003" 6 | BASE_PATH="/base_path" 7 | 8 | python -m llava.eval.model_vqa_mmbench \ 9 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 10 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 11 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/$SPLIT.tsv \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers/$SPLIT/${CKPT}.jsonl \ 13 | --lang cn \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode vicuna_v1 17 | 18 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers_upload/$SPLIT 19 | 20 | python scripts/convert_mmbench_for_submission.py \ 21 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/$SPLIT.tsv \ 22 | --result-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers/$SPLIT \ 23 | --upload-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/mmbench/answers_upload/$SPLIT \ 24 | --experiment $CKPT 25 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/llava_mme.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/MME_Benchmark_release_version \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | cd ${BASE_PATH}/COINCIDE_train/playground/data/eval/MME 17 | 18 | python convert_answer_to_mme.py --experiment $CKPT 19 | 20 | cd eval_tool 21 | 22 | python calculation.py --results_dir answers/$CKPT 23 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results 17 | 18 | python scripts/convert_mmvet_for_eval.py \ 19 | --src ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/answers/${CKPT}.jsonl \ 20 | --dst ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results/${CKPT}.json 21 | 22 | python playground/data/eval/mm-vet/MM-Vet/mm-vet_evaluator.py \ 23 | --mmvet_path 
${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet \ 24 | --result_file ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet/results/${CKPT}.json \ 25 | --result_path ${BASE_PATH}/COINCIDE_train/playground/data/eval/mm-vet 26 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/llava_pope_test.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/val2014 \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python llava/eval/eval_pope.py \ 17 | --annotation-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/coco \ 18 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/llava_pope_test.jsonl \ 19 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/pope/answers/${CKPT}.jsonl 20 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | BASE_PATH="/base_path" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 15 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 16 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/llava-seed-bench_mod.jsonl \ 17 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench \ 18 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | # Evaluate 38 | python scripts/convert_seed_for_submission.py \ 39 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/SEED-Bench.json \ 40 | --result-file $output_file \ 41 | --result-upload-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/seed_bench/answers_upload/${CKPT}.jsonl 42 | 43 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_science \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/llava_test_CQM-A.json \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/images/test \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 13 | --single-pred-prompt \ 14 | --temperature 0 \ 15 | --conv-mode vicuna_v1 16 | 17 | python llava/eval/eval_science_qa.py \ 18 | --base-dir ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa \ 19 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 20 | --output-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 21 | --output-result ${BASE_PATH}/COINCIDE_train/playground/data/eval/scienceqa/answers/${CKPT}_result.json 22 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/train_images \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/answers/${CKPT}.jsonl \ 13 | --temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python -m llava.eval.eval_textvqa \ 17 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 18 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/textvqa/answers/${CKPT}.jsonl 19 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | BASE="vicuna-7b-v1.5" 5 | BASE_PATH="/base_path" 6 | 7 | python -m llava.eval.model_vqa_loader \ 8 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 9 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 10 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/llava_test.jsonl \ 11 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/test \ 12 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 13 | 
--temperature 0 \ 14 | --conv-mode vicuna_v1 15 | 16 | python scripts/convert_vizwiz_for_submission.py \ 17 | --annotation-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/llava_test.jsonl \ 18 | --result-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 19 | --result-upload-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vizwiz/answers_upload/${CKPT}.json 20 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | BASE="vicuna-7b-v1.5" 10 | SPLIT="llava_vqav2_mscoco_test-dev2015" 11 | BASE_PATH="/base_path" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ${BASE_PATH}/checkpoints/$CKPT \ 16 | --model-base ${BASE_PATH}/checkpoints/$BASE \ 17 | --question-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/$SPLIT.jsonl \ 18 | --image-folder ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/test2015 \ 19 | --answers-file ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 & 24 | done 25 | 26 | wait 27 | 28 | output_file=${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ${BASE_PATH}/COINCIDE_train/playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 39 | # 40 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | deepspeed llava/train/train_xformers.py \ 6 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 7 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 8 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 9 | --version v1 \ 10 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 11 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 12 | --vision_tower openai/clip-vit-large-patch14-336 \ 13 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --mm_vision_select_layer -2 \ 16 | --mm_use_im_start_end False \ 17 | --mm_use_im_patch_token False \ 18 | --image_aspect_ratio pad \ 19 | --group_by_modality_length True \ 20 | --bf16 False \ 21 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_v1.5_v100 \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 4 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 8 \ 26 | --eval_accumulation_steps 8 \ 27 | --evaluation_strategy "no" \ 28 | --save_strategy "steps" \ 29 | --save_steps 50000 \ 30 | --save_total_limit 1 \ 31 | --learning_rate 2e-4 \ 32 | --weight_decay 0. 
\ 33 | --warmup_ratio 0.03 \ 34 | --lr_scheduler_type "cosine" \ 35 | --logging_steps 1 \ 36 | --tf32 False \ 37 | --fp16 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to wandb \ 43 | --run_name llava_lora_v1.5_v100 \ 44 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora_coincide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_tinyllava_ours_37111519_mmd_transfer_mod_density_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_tinyllava_ours_37111519_mmd_transfer_mod_density_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/TinyLLaVA-Instruction/ours_37111519_indices_mmd_trans_mod_density_${PRUNE_RATIO}.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/finetune_lora_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/llava_v1_5_mix665k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/LLaVA-Instruction/random_indices.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_PATH="/base_path" 4 | 5 | deepspeed llava/train/train_xformers.py \ 6 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 7 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 8 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 9 | --version v1 \ 10 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 11 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 12 | --vision_tower openai/clip-vit-large-patch14-336 \ 13 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --mm_vision_select_layer -2 \ 16 | --mm_use_im_start_end False \ 17 | --mm_use_im_patch_token False \ 18 | --image_aspect_ratio pad \ 19 | --group_by_modality_length True \ 20 | --bf16 False \ 21 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_v1.5_v100 \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 4 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 8 \ 26 | --eval_accumulation_steps 8 \ 27 | --evaluation_strategy "no" \ 28 | --save_strategy "steps" \ 29 | --save_steps 50000 \ 30 | --save_total_limit 1 \ 31 | --learning_rate 2e-4 \ 32 | --weight_decay 0. 
\ 33 | --warmup_ratio 0.03 \ 34 | --lr_scheduler_type "cosine" \ 35 | --logging_steps 1 \ 36 | --tf32 False \ 37 | --fp16 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to wandb \ 43 | --run_name llava_lora_vision_flan_lora_v1.5_v100 \ 44 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan_coincide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "" ]; then 4 | PRUNE_RATIO=0.2 5 | else 6 | PRUNE_RATIO=$1 7 | fi 8 | 9 | BASE_PATH="/base_path" 10 | 11 | deepspeed llava/train/train_xformers.py \ 12 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 13 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 14 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 15 | --version v1 \ 16 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 17 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 18 | --vision_tower openai/clip-vit-large-patch14-336 \ 19 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --bf16 False \ 27 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_ours_3000_prune_${PRUNE_RATIO}_v1.5 \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 4 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 8 \ 32 | --eval_accumulation_steps 8 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-4 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 False \ 43 | --fp16 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb \ 49 | --run_name llava_lora_vision_flan_lora_ours_3000_prune_${PRUNE_RATIO}_v1.5 \ 50 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/ours_3000_${PRUNE_RATIO}.npy \ 51 | --prune_p $PRUNE_RATIO \ 52 | --prune_subset "bottom" \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/v1_5/vision_flan/finetune_lora_vision_flan_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ "$1" == "" ]; then 5 | PRUNE_RATIO=0.2 6 | else 7 | PRUNE_RATIO=$1 8 | fi 9 | 10 | BASE_PATH="/base_path" 11 | 12 | deepspeed llava/train/train_xformers.py \ 13 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 14 | --deepspeed ${BASE_PATH}/COINCIDE_train/scripts/zero3.json \ 15 | --model_name_or_path ${BASE_PATH}/checkpoints/vicuna-7b-v1.5 \ 16 | --version v1 \ 17 | --data_path ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/annotation_191-task_1k.json \ 18 | --image_folder ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/images_191task_1k \ 19 | --vision_tower openai/clip-vit-large-patch14-336 \ 20 | --pretrain_mm_mlp_adapter ${BASE_PATH}/checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --mm_vision_select_layer -2 \ 23 | --mm_use_im_start_end False \ 24 | --mm_use_im_patch_token False \ 25 | --image_aspect_ratio pad \ 26 | --group_by_modality_length True \ 27 | --bf16 False \ 28 | --output_dir ${BASE_PATH}/checkpoints/llava_lora_vision_flan_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 4 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 8 \ 33 | --eval_accumulation_steps 8 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-4 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 False \ 44 | --fp16 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb \ 50 | --run_name llava_lora_vision_flan_lora_random_prune_${PRUNE_RATIO}_v1.5 \ 51 | --prune_indices ${BASE_PATH}/COINCIDE_train/playground/data/vision-flan_191-task_1k/random_indices.npy \ 52 | --prune_p $PRUNE_RATIO \ 53 | -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /COINCIDE_train/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | 
"gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COINCIDE 2 | 3 | ### **Concept-skill Transferability-based Data Selection for Large Vision-Language Models, EMNLP 2024** 4 | * Authors: [Jaewoo Lee](https://g-jwlee.github.io/), [Boyang Li^](http://www.boyangli.org/), [Sung Ju Hwang^](http://www.sungjuhwang.com/) 5 | * [Paper](https://arxiv.org/abs/2406.10995) 6 | 7 | ## Introduction 8 | 9 | Training Large Vision-Langauge Models on extensive visual instruction tuning (VIT) data incurs significant computational cost, making the process infeasible for small academic labs and individual researchers. 10 | Additionally, it is not clear if all the VIT data are necessary for good generalization, as different vision-language (VL) tasks have different abilities to transfer to downstream tasks. 11 | 12 | We propose COINCIDE (COre-INstruction Concept-skIll Data Election), which identifies VL concept-skill compositions through data clustering using activations from an off-the-shelf, small VLM. 13 | 14 | 15 |
Teaser 16 |
 17 | 18 | From each cluster, COINCIDE selects training data for a target LVLM by considering transferability (i.e., how well knowledge from each cluster can facilitate the LVLM's learning in other clusters) and the internal density of the clusters. 19 | 20 |
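A conceptual sketch of this selection step is given below. It is **not** the released implementation (the actual pipeline is run through the clustering and training scripts described later in this README); the function name, the softmax-style scoring, and all hyperparameters are illustrative assumptions only. It assumes precomputed activations `feats` of shape `(N, D)` and one transferability score per cluster.

```python
# Conceptual sketch only -- NOT the repository's implementation.
import numpy as np
from sklearn.cluster import KMeans

def select_indices(feats, transferability, n_clusters, budget, temperature=0.1, seed=0):
    """Pick `budget` sample indices, favoring transferable, low-density clusters."""
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed).fit_predict(feats)
    normed = feats / np.linalg.norm(feats, axis=1, keepdims=True)

    # Density proxy: mean pairwise cosine similarity within each cluster.
    density = []
    for c in range(n_clusters):
        members = normed[labels == c]
        density.append(float((members @ members.T).mean()) if len(members) else 0.0)
    density = np.array(density)

    # Softmax-style budget: more samples for high transferability / low density.
    weights = np.exp((np.asarray(transferability) - density) / temperature)
    quota = np.maximum(1, (budget * weights / weights.sum()).astype(int))

    rng = np.random.default_rng(seed)
    picked = []
    for c in range(n_clusters):
        members = np.where(labels == c)[0]
        take = min(int(quota[c]), len(members))
        if take > 0:
            picked.extend(rng.choice(members, size=take, replace=False).tolist())
    return np.asarray(picked[:budget])
```

In the released code, the selected indices are saved to a `.npy` file that the finetuning scripts load through `--prune_indices` together with `--prune_p`.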

 21 | 22 | ## Install 23 | Since we use different models for 1. clustering and 2. training, a separate conda environment is required for each process. 24 | 25 | ### Clustering 26 | ```shell 27 | conda create -n coincide_cluster python=3.10 -y 28 | conda activate coincide_cluster 29 | pip install --upgrade pip # enable PEP 660 support 30 | pip install -e . 31 | pip install -e ".[train]" 32 | ``` 33 | 34 | ### Training 35 | ```shell 36 | conda create -n coincide_train python=3.10 -y 37 | conda activate coincide_train 38 | cd COINCIDE_train 39 | pip install --upgrade pip # enable PEP 660 support 40 | pip install -e . 41 | pip install -e ".[train]" 42 | pip install flash-attn --no-build-isolation 43 | ``` 44 | 45 | ## Dataset 46 | ### LLaVA-1.5 47 | Follow the visual instruction tuning dataset download guides on the [official GitHub page](https://github.com/haotian-liu/LLaVA). 48 | Place the downloaded files in `COINCIDE_train/playground/data`. 49 | Also, prepare the evaluation benchmark datasets by following the instructions on that page. 50 | 51 | ### Vision-Flan 52 | Similarly, download the Vision-Flan visual instruction tuning dataset from the [official project page](https://vision-flan.github.io/). 53 | Place the downloaded files in `COINCIDE_train/playground/data`. 54 | 55 | ## Models 56 | In this paper, we focus on two models: [TinyLLaVA-2B](https://github.com/TinyLLaVA/TinyLLaVA_Factory/tree/tinyllava_bench) for clustering and 57 | [LLaVA-1.5-7B](https://github.com/haotian-liu/LLaVA) for training. Please download the models into the `/checkpoints` directory.
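As a quick sanity check before launching the finetuning scripts, you can confirm that the checkpoint paths those scripts reference exist under your base path. The snippet below is illustrative only; it assumes the same `/base_path` placeholder used throughout the shell scripts, and the TinyLLaVA-2B clustering checkpoint goes in the same `checkpoints` directory under whatever folder name you downloaded it as.

```python
# Illustrative check: the finetuning scripts expect these checkpoints under
# ${BASE_PATH}/checkpoints. Adjust BASE_PATH to your own setup.
import os

BASE_PATH = "/base_path"  # same placeholder as in the shell scripts
expected = [
    "checkpoints/vicuna-7b-v1.5",
    "checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin",
]

for rel in expected:
    path = os.path.join(BASE_PATH, rel)
    print(("ok     " if os.path.exists(path) else "MISSING"), path)
```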
 58 | 59 | You can try to use other LVLMs such as [TinyLLaVA-OpenELM-450M-SigLIP-0.89B](https://github.com/TinyLLaVA/TinyLLaVA_Factory?tab=readme-ov-file) or 60 | [LLaVA-1.5-13B](https://github.com/haotian-liu/LLaVA), as we have done in our ablation studies. 61 | 62 | ## LLaVA-1.5 experiment 63 | ### Clustering 64 | Change directory with `cd COINCIDE_cluster`. 65 | ```shell 66 | bash scripts/tiny_llava/eval/llava_score/coincide/extract_embed.sh 67 | bash scripts/tiny_llava/eval/llava_score/coincide/clustering.sh 68 | ``` 69 | 70 | ### Training 71 | Change directory with `cd COINCIDE_train`. 72 | ```shell 73 | bash scripts/v1_5/finetune_lora_coincide.sh 74 | ``` 75 | 76 | ## Vision-Flan experiment 77 | ### Clustering 78 | Change directory with `cd COINCIDE_cluster`. 79 | ```shell 80 | bash scripts/tiny_llava/eval/vision_flan_score/coincide/extract_embed.sh 81 | bash scripts/tiny_llava/eval/vision_flan_score/coincide/clustering.sh 82 | ``` 83 | 84 | ### Training 85 | Change directory with `cd COINCIDE_train`. 86 | ```shell 87 | bash scripts/v1_5/vision_flan/finetune_lora_vision_flan_coincide.sh 88 | ``` 89 | 90 | ## Evaluation 91 | MODEL_NAME is the name of the directory that stores the model.
92 | {eval_benchmark} could be gqa, llavabench, mme, etc. 93 | ```shell 94 | bash scripts/v1_5/eval/{eval_benchmark}.sh MODEL_NAME 95 | ``` 96 | 97 | ## Bibtex 98 | ``` 99 | @inproceedings{lee2024coincide, 100 | title={Concept-skill Transferability-based Data Selection for Large Vision-Language Models}, 101 | author={Jaewoo Lee and Boyang Li and Sung Ju Hwang}, 102 | year={2024}, 103 | booktitle={Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 104 | } 105 | ``` 106 | -------------------------------------------------------------------------------- /assets/concept_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-JWLee/COINCIDE_code/629f7760a94c70740b1138eed080cb42eb34c69c/assets/concept_figure.png --------------------------------------------------------------------------------