├── CLIP_benchmark ├── bash │ ├── build.sh │ ├── run_benchmark_clean.sh │ ├── run_benchmark_lp.sh │ └── run_benchmark_rt.sh ├── benchmark │ ├── dataset_type.csv │ ├── datasets.txt │ ├── datasets_lp.txt │ ├── datasets_rt.txt │ └── models.txt └── clip_benchmark │ ├── __init__.py │ ├── cli.py │ ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ ├── babel_imagenet.cpython-311.pyc │ │ ├── builder.cpython-311.pyc │ │ ├── caltech101.cpython-311.pyc │ │ ├── flickr.cpython-311.pyc │ │ ├── imagenetv2.cpython-311.pyc │ │ ├── objectnet.cpython-311.pyc │ │ ├── sugar_crepe.cpython-311.pyc │ │ └── voc2007.cpython-311.pyc │ ├── ar_classnames.json │ ├── ar_zeroshot_classification_templates.json │ ├── babel_imagenet.json │ ├── babel_imagenet.py │ ├── builder.py │ ├── caltech101.py │ ├── cn_classnames.json │ ├── cn_zeroshot_classification_templates.json │ ├── cupl_prompts.json │ ├── en_classnames.json │ ├── en_zeroshot_classification_templates.json │ ├── flickr.py │ ├── imagenetv2.py │ ├── it_classnames.json │ ├── it_zeroshot_classification_templates.json │ ├── jp_classnames.json │ ├── jp_zeroshot_classification_templates.json │ ├── kitti.py │ ├── multilingual_mscoco.py │ ├── nllb_dist13b_prompts.json │ ├── objectnet.py │ ├── sugar_crepe.py │ ├── tfds.py │ └── voc2007.py │ ├── metrics │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ ├── captioning.cpython-311.pyc │ │ ├── image_caption_selection.cpython-311.pyc │ │ ├── linear_probe.cpython-311.pyc │ │ ├── zeroshot_classification.cpython-311.pyc │ │ └── zeroshot_retrieval.cpython-311.pyc │ ├── captioning.py │ ├── image_caption_selection.py │ ├── linear_probe.py │ ├── zeroshot_classification.py │ └── zeroshot_retrieval.py │ ├── model_collection.py │ ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ ├── japanese_clip.cpython-311.pyc │ │ └── open_clip.cpython-311.pyc │ ├── japanese_clip.py │ └── open_clip.py │ └── webdataset_builder.py ├── CLIP_eval ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── clip_robustbench.cpython-311.pyc │ └── eval_utils.cpython-311.pyc ├── clip_robustbench.py ├── eval_utils.py └── zeroshot-templates.json ├── LLaVA ├── llava │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ ├── constants.cpython-311.pyc │ │ ├── conversation.cpython-311.pyc │ │ └── mm_utils.cpython-311.pyc │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ ├── table │ │ │ ├── answer │ │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ │ ├── answer_bard.jsonl │ │ │ │ ├── answer_gpt35.jsonl │ │ │ │ ├── answer_llama-13b.jsonl │ │ │ │ └── answer_vicuna-13b.jsonl │ │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ │ ├── model.jsonl │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ ├── results │ │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ │ ├── review │ │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ 
│ └── review_llama-13b_vicuna-13b.jsonl │ │ │ ├── reviewer.jsonl │ │ │ └── rule.json │ │ └── webpage │ │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── chatgpt.svg │ │ │ ├── llama.jpg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── vicuna.jpeg │ │ │ ├── index.html │ │ │ ├── script.js │ │ │ └── styles.css │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-311.pyc │ │ │ ├── builder.cpython-311.pyc │ │ │ └── llava_arch.cpython-311.pyc │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── __pycache__ │ │ │ │ ├── llava_llama.cpython-311.pyc │ │ │ │ └── llava_mpt.cpython-311.pyc │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── __pycache__ │ │ │ │ ├── adapt_tokenizer.cpython-311.pyc │ │ │ │ ├── attention.cpython-311.pyc │ │ │ │ ├── blocks.cpython-311.pyc │ │ │ │ ├── configuration_mpt.cpython-311.pyc │ │ │ │ ├── custom_embedding.cpython-311.pyc │ │ │ │ ├── hf_prefixlm_converter.cpython-311.pyc │ │ │ │ ├── modeling_mpt.cpython-311.pyc │ │ │ │ └── norm.cpython-311.pyc │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── __pycache__ │ │ │ │ ├── builder.cpython-311.pyc │ │ │ │ └── clip_encoder.cpython-311.pyc │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ ├── __pycache__ │ │ │ │ └── builder.cpython-311.pyc │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train │ │ ├── __pycache__ │ │ │ ├── llava_trainer.cpython-311.pyc │ │ │ └── train.cpython-311.pyc │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py └── scripts │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── extract_mm_projector.py │ ├── finetune.sh │ ├── finetune_full_schedule.sh │ ├── finetune_lora.sh │ ├── finetune_qlora.sh │ ├── finetune_sqa.sh │ ├── merge_lora_weights.py │ ├── pretrain.sh │ ├── pretrain_xformers.sh │ ├── sqa_eval_batch.sh │ ├── sqa_eval_gather.sh │ ├── upload_pypi.sh │ ├── v1_5 │ ├── eval │ │ ├── gqa.sh │ │ ├── llavabench.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── pope.sh │ │ ├── qbench.sh │ │ ├── qbench_zh.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ ├── textvqa.sh │ │ ├── vizwiz.sh │ │ └── vqav2.sh │ └── finetune_task_lora.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json ├── README.md ├── asset └── method.png ├── open_flamingo ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ │ └── __init__.cpython-311.pyc ├── eval │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-311.pyc │ ├── classification_utils.py │ ├── 
coco_metric.py │ ├── eval_datasets.py │ ├── eval_model.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-311.pyc │ │ │ └── utils.cpython-311.pyc │ │ ├── blip.py │ │ ├── llava.py │ │ ├── of_eval_model_adv.py │ │ ├── open_flamingo.py │ │ └── utils.py │ ├── ok_vqa_utils.py │ └── vqa_metric.py └── src │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── factory.cpython-311.pyc │ ├── flamingo.cpython-311.pyc │ ├── flamingo_lm.cpython-311.pyc │ ├── helpers.cpython-311.pyc │ └── utils.cpython-311.pyc │ ├── factory.py │ ├── flamingo.py │ ├── flamingo_lm.py │ ├── helpers.py │ └── utils.py ├── requirements.txt └── train ├── __init__.py ├── align_training_clip.py ├── datasets.py └── utils.py /CLIP_benchmark/bash/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # gathers results from a results directory and builds a csv 3 | # enter results dir in format /path/to/results/* 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | set -e 6 | echo "Enter path to results directory: " 7 | read RES_DIR 8 | echo "building results csv... ${RES_DIR}" 9 | RND=${RANDOM}${RANDOM} 10 | python -m clip_benchmark.cli build ${RES_DIR} --output "res${RND}.csv" 11 | echo "reformatting csv..." 12 | python reformat_csv.py res${RND}.csv -------------------------------------------------------------------------------- /CLIP_benchmark/bash/run_benchmark_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # stop on error 3 | # add parent to python path 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | SECONDS=0 7 | SAMPLES=-1 8 | BS=64 9 | 10 | SAVE_DIR=../CLIP_benchmark/result # TODO 11 | mkdir -p "$SAVE_DIR" 12 | python -m clip_benchmark.cli eval --dataset_root "../CLIP_benchmark/datasets/wds_{dataset_cleaned}" --dataset benchmark/datasets.txt \ 13 | --pretrained_model benchmark/models.txt \ 14 | --output "${SAVE_DIR}/clean_{model}_{pretrained}_beta{beta}_{dataset}_{n_samples}_bs{bs}_{attack}_{eps}_{iterations}.json" \ 15 | --attack none --eps 1 \ 16 | --batch_size $BS --n_samples $SAMPLES \ 17 | 18 | 19 | hours=$((SECONDS / 3600)) 20 | minutes=$(( (SECONDS % 3600) / 60 )) 21 | echo "[Runtime] $hours h $minutes min" 22 | -------------------------------------------------------------------------------- /CLIP_benchmark/bash/run_benchmark_lp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # stop on error 3 | # add parent to python path 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | SECONDS=0 7 | SAMPLES=-1 8 | BS=64 9 | 10 | SAVE_DIR=../CLIP_benchmark/result # TODO 11 | mkdir -p "$SAVE_DIR" 12 | python -m clip_benchmark.cli eval --dataset_root "../CLIP_benchmark/datasets/wds_{dataset_cleaned}" --dataset benchmark/datasets_lp.txt --task linear_probe \ 13 | --pretrained_model benchmark/models.txt \ 14 | --output "${SAVE_DIR}/clean_{model}_{pretrained}_beta{beta}_{dataset}_{n_samples}_bs{bs}_{attack}_{eps}_{iterations}.json" \ 15 | --attack none --eps 1 \ 16 | --batch_size $BS --n_samples $SAMPLES \ 17 | 18 | hours=$((SECONDS / 3600)) 19 | minutes=$(( (SECONDS % 3600) / 60 )) 20 | echo "[Runtime] $hours h $minutes min" 21 | -------------------------------------------------------------------------------- /CLIP_benchmark/bash/run_benchmark_rt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # stop on error 3 | # add parent to python 
path 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | SECONDS=0 7 | SAMPLES=-1 8 | BS=64 9 | 10 | SAVE_DIR=../CLIP_benchmark/result # TODO 11 | mkdir -p "$SAVE_DIR" 12 | python -m clip_benchmark.cli eval --dataset_root "../CLIP_benchmark/datasets/wds_{dataset_cleaned}" --dataset benchmark/datasets_rt.txt --task zeroshot_retrieval \ 13 | --recall_k 1 5 10 \ 14 | --pretrained_model benchmark/models.txt \ 15 | --output "${SAVE_DIR}/clean_{model}_{pretrained}_beta{beta}_{dataset}_{n_samples}_bs{bs}_{attack}_{eps}_{iterations}.json" \ 16 | --attack none --eps 1 \ 17 | --batch_size $BS --n_samples $SAMPLES \ 18 | 19 | hours=$((SECONDS / 3600)) 20 | minutes=$(( (SECONDS % 3600) / 60 )) 21 | echo "[Runtime] $hours h $minutes min" 22 | -------------------------------------------------------------------------------- /CLIP_benchmark/benchmark/dataset_type.csv: -------------------------------------------------------------------------------- 1 | dataset,type 2 | imagenet1k,natural 3 | imagenetv2,natural 4 | imagenet-r,natural 5 | imagenet_sketch,specialized 6 | objectnet,natural 7 | imagenet-a,natural 8 | imagenet-o,natural 9 | vtab/cifar10,natural 10 | vtab/cifar100,natural 11 | mnist,specialized 12 | vtab/flowers,natural 13 | cars,natural 14 | vtab/svhn,natural 15 | fer2013,natural 16 | renderedsst2,specialized 17 | vtab/pets,natural 18 | vtab/caltech101,natural 19 | voc2007_multilabel,natural 20 | voc2007,natural 21 | sun397,natural 22 | fgvc_aircraft,natural 23 | country211,natural 24 | vtab/dtd,natural 25 | gtsrb,natural 26 | stl10,natural 27 | vtab/diabetic_retinopathy,specialized 28 | vtab/eurosat,specialized 29 | vtab/resisc45,specialized 30 | vtab/pcam,specialized 31 | vtab/clevr_count_all,structured 32 | vtab/clevr_closest_object_distance,structured 33 | vtab/dsprites_label_orientation,structured 34 | vtab/dsprites_label_x_position,structured 35 | vtab/dsprites_label_y_position,structured 36 | vtab/smallnorb_label_elevation,structured 37 | vtab/smallnorb_label_azimuth,structured 38 | vtab/dmlab,structured 39 | vtab/kitti_closest_vehicle_distance,structured 40 | mscoco_captions,retrieval 41 | flickr8k,retrieval 42 | flickr30k,retrieval 43 | -------------------------------------------------------------------------------- /CLIP_benchmark/benchmark/datasets.txt: -------------------------------------------------------------------------------- 1 | wds/vtab/cifar10 2 | wds/vtab/cifar100 3 | wds/vtab/caltech101 4 | wds/fer2013 5 | wds/vtab/pets 6 | wds/vtab/dtd 7 | wds/vtab/resisc45 8 | wds/vtab/eurosat 9 | wds/vtab/pcam 10 | wds/imagenet_sketch 11 | wds/imagenet-o 12 | -------------------------------------------------------------------------------- /CLIP_benchmark/benchmark/datasets_lp.txt: -------------------------------------------------------------------------------- 1 | wds/svhn 2 | wds/gtsrb 3 | wds/vtab/clevr_closest_object_distance 4 | wds/vtab/clevr_count_all 5 | -------------------------------------------------------------------------------- /CLIP_benchmark/benchmark/datasets_rt.txt: -------------------------------------------------------------------------------- 1 | wds/mscoco_captions 2 | wds/flicker30k 3 | -------------------------------------------------------------------------------- /CLIP_benchmark/benchmark/models.txt: -------------------------------------------------------------------------------- 1 | ViT-L-14-336,openai 2 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Top-level package for CLIP Benchmark.""" 2 | 3 | __author__ = """Mehdi Cherti""" 4 | __email__ = 'mehdicherti@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__init__.py -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/babel_imagenet.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/babel_imagenet.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/builder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/builder.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/caltech101.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/caltech101.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/flickr.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/flickr.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/imagenetv2.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/imagenetv2.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/objectnet.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/objectnet.cpython-311.pyc -------------------------------------------------------------------------------- 
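For reference, a minimal sketch (not a file from this repository) of how the model and dataset lists under CLIP_benchmark/benchmark/ and the --output naming template used by the bash scripts above fit together. The helper names and the placeholder values (beta, iterations, the slash replacement) are assumptions; the actual substitution is performed inside clip_benchmark.cli.

```python
from pathlib import Path

def read_pairs(path):
    """Parse models.txt-style lines such as 'ViT-L-14-336,openai'."""
    return [line.strip().split(",") for line in Path(path).read_text().splitlines() if line.strip()]

def read_list(path):
    """Parse datasets.txt-style lines such as 'wds/vtab/cifar10'."""
    return [line.strip() for line in Path(path).read_text().splitlines() if line.strip()]

OUTPUT_TEMPLATE = ("clean_{model}_{pretrained}_beta{beta}_{dataset}_{n_samples}"
                   "_bs{bs}_{attack}_{eps}_{iterations}.json")

models = read_pairs("benchmark/models.txt")      # e.g. [["ViT-L-14-336", "openai"]]
datasets = read_list("benchmark/datasets.txt")   # e.g. ["wds/vtab/cifar10", ...]

for model_name, pretrained in models:
    for dataset in datasets:
        out_name = OUTPUT_TEMPLATE.format(
            model=model_name, pretrained=pretrained, beta="none",
            dataset=dataset.replace("/", "-"),  # assumption: keep result file names slash-free
            n_samples=-1, bs=64, attack="none", eps=1, iterations=0,
        )
        print(out_name)  # one result file per (model, dataset) pair, as in the bash scripts
```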
/CLIP_benchmark/clip_benchmark/datasets/__pycache__/sugar_crepe.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/sugar_crepe.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/__pycache__/voc2007.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/datasets/__pycache__/voc2007.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/ar_zeroshot_classification_templates.json: -------------------------------------------------------------------------------- 1 | { 2 | "imagenet1k": [ 3 | "{c}", 4 | "\u0635\u0648\u0631\u0629 \u0633\u064a\u0626\u0629 \u0644\u0640 {c}", 5 | "\u0635\u0648\u0631\u0629 \u0633\u064a\u0626\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 {c}", 6 | "\u0646\u062d\u062a \u0644\u0634\u0643\u0644 {c}", 7 | "\u0646\u062d\u062a \u0644\u0640 {c}", 8 | "\u0635\u0648\u0631\u0629 \u0630\u0627\u062a \u062c\u0648\u0648\u062f\u0629 \u0645\u0646\u062e\u0641\u0636\u0629 \u0644\u0640 {c}", 9 | "\u0635\u0648\u0631\u0629 \u0630\u0627\u062a \u062c\u0648\u0648\u062f\u0629 \u0645\u0646\u062e\u0641\u0636\u0629 \u062a\u062d\u062a\u0648\u064a {c}", 10 | "\u0631\u0633\u0648\u0645\u0627\u062a \u062c\u062f\u0627\u0631\u064a\u0629 \u062a\u062d\u062a\u0648\u064a {c}", 11 | "\u0631\u0633\u0648\u0645\u0627\u062a \u062c\u062f\u0627\u0631\u064a\u0629 \u0644\u0640 {c}", 12 | "\u0635\u0648\u0631\u0629 \u0645\u0642\u062a\u0637\u0639\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 {c}", 13 | "\u0635\u0648\u0631\u0629 \u0645\u0642\u062a\u0637\u0639\u0629 \u0644\u0640 {c}", 14 | "\u062a\u0637\u0631\u064a\u0632 {c} ", 15 | " \u0635\u0648\u0631\u0629 \u064a\u0635\u0639\u0628 \u0641\u064a\u0647\u0627 \u0631\u0624\u064a\u0629 {c} ", 16 | "\u0635\u0648\u0631\u0629 \u0633\u0627\u0637\u0639\u0629 \u0644\u0640 {c}", 17 | "\u0635\u0648\u0631\u0629 \u0648\u0627\u0636\u062d\u0629 \u0644\u0640 {c}", 18 | "\u0635\u0648\u0631\u0629 \u0645\u062a\u0633\u062e\u0629 \u0644\u0640 {c}", 19 | "\u0635\u0648\u0631\u0629 \u0645\u0638\u0644\u0645\u0629 \u0644\u0640 {c}", 20 | "\u0635\u0648\u0631\u0629 \u0623\u0628\u064a\u0636 \u0648\u0623\u0633\u0648\u062f {c}", 21 | "{c} \u0641\u064a \u0644\u0642\u0637\u0629 \u0642\u0631\u064a\u0628\u0629", 22 | "\u0635\u0648\u0631\u0629 \u0631\u0627\u0626\u0639\u0629 \u0644\u0640 {c}", 23 | "\u0644\u0642\u0637\u0629 \u0642\u0631\u064a\u0628\u0629 \u0644\u0640 {c}", 24 | "\u0631\u0633\u0645 \u062d\u0627\u0633\u0648\u0628\u064a \u064a\u062d\u062a\u0648\u064a {c}", 25 | "\u0635\u0648\u0631\u0629 \u0645\u0631\u0633\u0648\u0645\u0629 \u062a\u062d\u062a\u0648\u064a {c}", 26 | "\u0631\u0633\u0645\u0629 \u0644\u0640 {c}", 27 | "\u0631\u0633\u0645\u0629 {c}", 28 | "\u0631\u0633\u0645 \u064a\u062d\u062a\u0648\u064a {c} ", 29 | "\u0635\u0648\u0631\u0629 \u0628\u0646\u0645\u0637 \u0627\u0644\u0628\u0643\u0633\u0644 \u0644\u0640 {c}", 30 | " \u0635\u0648\u0631\u0629 \u0633\u0627\u0637\u0639\u0629 {c}", 31 | "\u0648\u0634\u0645 {c}", 32 | "{c} \u0641\u064a \u0627\u0644\u0635\u0648\u0631\u0629", 33 | "\u0635\u0648\u0631\u0629 \u0645\u062a\u0633\u062e\u0629 
\u062a\u062d\u062a\u0648\u064a {c}", 34 | "\u0635\u0648\u0631\u0629 \u062a\u0627\u0644\u0641\u0629 {c}", 35 | "\u0635\u0648\u0631\u0629 \u0636\u0628\u0627\u0628\u064a\u0629 \u0644\u0640 {c}", 36 | "\u0635\u0648\u0631\u0629 {c}", 37 | "\u0635\u0648\u0631\u0629 \u062c\u064a\u062f\u0629 \u0644\u0640 {c}", 38 | "\u0635\u0648\u0631\u0629 \u0644\u0640 {c}", 39 | "\u062a\u0635\u064a\u064a\u0631 \u0644\u0640 {c}", 40 | "{c} \u0639\u0644\u0649 \u0634\u0643\u0644 \u0631\u0633\u0645 \u062d\u0627\u0633\u0648\u0628\u064a \u062b\u0646\u0627\u0626\u064a \u0623\u0648 \u062b\u0644\u0627\u062b\u064a \u0627\u0644\u0623\u0628\u0639\u0627\u062f", 41 | "\u064a\u0648\u062c\u062f {c} \u0648\u0627\u062d\u062f \u0641\u064a \u0627\u0644\u0635\u0648\u0631\u0629", 42 | "\u0631\u0633\u0645 \u062d\u0627\u0633\u0648\u0628\u064a \u0644\u0640 {c}", 43 | "\u0627\u0648\u0631\u064a\u063a\u0627\u0645\u064a \u0644\u0640 {c}", 44 | "{c} \u0645\u0635\u0646\u0648\u0639 \u0639\u0646 \u0637\u0631\u064a\u0642 \u0641\u0646 \u0637\u064a \u0627\u0644\u0648\u0631\u0642", 45 | "{c} \u0641\u064a \u0644\u0639\u0628\u0629 \u0641\u064a\u062f\u064a\u0648", 46 | "{c} \u0645\u0648\u062c\u0648\u062f \u0641\u064a \u0644\u0639\u0628\u0629 \u0627\u0644\u0641\u064a\u062f\u064a\u0648", 47 | "\u0631\u0633\u0645 \u062a\u0642\u0631\u064a\u0628\u064a \u0644\u0640 {c}", 48 | "{c} \u0645\u0631\u0633\u0648\u0645 \u0628\u0627\u0644\u062e\u0631\u0627\u0628\u064a\u0634", 49 | "\u0635\u0648\u0631\u0629 \u0628\u0641\u0646 \u0627\u0644\u062e\u0631\u0627\u0628\u064a\u0634 \u0644\u0640 {c}", 50 | "\u0644\u0639\u0628\u0629 {c}", 51 | "\u0635\u0648\u0631\u0629 \u064a\u0648\u062c\u062f \u0641\u064a\u0647\u0627 {c}", 52 | "\u0631\u0633\u0648\u0645 \u0645\u062a\u062d\u0631\u0643\u0629 \u0644\u0640 {c} ", 53 | "\u0635\u0648\u0631\u0629 \u0644\u0639\u062f\u062f \u0645\u0646 {c}", 54 | "\u0635\u0648\u0631\u0629 \u064a\u0638\u0647\u0631 \u0641\u064a\u0647\u0627 {c}", 55 | "\u0635\u0648\u0631\u0629 {c} \u0635\u063a\u064a\u0631 ", 56 | "\u0635\u0648\u0631\u0629 {c} \u0643\u0628\u064a\u0631", 57 | "{c} \u064a\u0638\u0647\u0631 \u0641\u064a \u0627\u0644\u0635\u0648\u0631\u0629" 58 | ] 59 | } -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/babel_imagenet.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | 3 | """ 4 | BabelImageNet from https://arxiv.org/pdf/2306.08658.pdf 5 | Adapted from https://github.com/gregor-ge/Babel-ImageNet, thanks to the authors 6 | """ 7 | class BabelImageNet(torchvision.datasets.ImageNet): 8 | def __init__(self, root: str, idxs, split: str = "val", download=None, **kwargs) -> None: 9 | super().__init__(root, split, **kwargs) 10 | examples_per_class = len(self.targets) // 1000 11 | select_idxs = [idx*examples_per_class + i for idx in idxs for i in range(examples_per_class)] 12 | self.targets = [i for i in range(len(idxs)) for _ in range(examples_per_class)] 13 | self.imgs = [self.imgs[i] for i in select_idxs] 14 | self.samples = [self.samples[i] for i in select_idxs] 15 | self.idxs = idxs 16 | 17 | def __getitem__(self, i): 18 | img, target = super().__getitem__(i) 19 | target = self.idxs.index(target) 20 | return img, target -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/cn_zeroshot_classification_templates.json: -------------------------------------------------------------------------------- 1 | { 2 | "imagenet1k": [ 3 | 
"{c}\u7684\u7167\u7247\u3002", 4 | "\u8d28\u91cf\u5dee\u7684{c}\u7684\u7167\u7247\u3002", 5 | "\u8bb8\u591a{c}\u7684\u7167\u7247\u3002", 6 | "{c}\u7684\u96d5\u5851\u3002", 7 | "\u96be\u4ee5\u770b\u5230{c}\u7684\u7167\u7247\u3002", 8 | "{c}\u7684\u4f4e\u5206\u8fa8\u7387\u7167\u7247\u3002", 9 | "{c}\u7684\u6e32\u67d3\u3002", 10 | "\u6d82\u9e26{c}\u3002", 11 | "{c}\u7684\u7cdf\u7cd5\u7167\u7247\u3002", 12 | "{c}\u7684\u88c1\u526a\u7167\u7247\u3002", 13 | "{c}\u7684\u7eb9\u8eab\u3002", 14 | "{c}\u7684\u523a\u7ee3\u7167\u7247\u3002", 15 | "\u5f88\u96be\u770b\u5230{c}\u7684\u7167\u7247\u3002", 16 | "{c}\u7684\u660e\u4eae\u7167\u7247\u3002", 17 | "\u4e00\u5f20\u5e72\u51c0\u7684{c}\u7684\u7167\u7247\u3002", 18 | "\u4e00\u5f20\u5305\u542b{c}\u7684\u7167\u7247\u3002", 19 | "{c}\u7684\u6df1\u8272\u7167\u7247\u3002", 20 | "{c}\u7684\u624b\u7ed8\u753b\u3002", 21 | "\u6211\u7684{c}\u7684\u7167\u7247\u3002", 22 | "\u4e0d\u81ea\u7136\u7684{c}\u7684\u7167\u7247\u3002", 23 | "\u4e00\u5f20\u9177\u7684{c}\u7684\u7167\u7247\u3002", 24 | "{c}\u7684\u7279\u5199\u7167\u7247\u3002", 25 | "{c}\u7684\u9ed1\u767d\u7167\u7247\u3002", 26 | "\u4e00\u5e45{c}\u7684\u753b\u3002", 27 | "\u4e00\u5e45{c}\u7684\u7ed8\u753b\u3002", 28 | "\u4e00\u5f20{c}\u7684\u50cf\u7d20\u7167\u7247\u3002", 29 | "{c}\u7684\u96d5\u50cf\u3002", 30 | "\u4e00\u5f20{c}\u7684\u660e\u4eae\u7167\u7247\u3002", 31 | "{c}\u7684\u88c1\u526a\u7167\u7247\u3002", 32 | "\u4eba\u9020\u7684{c}\u7684\u7167\u7247\u3002", 33 | "\u4e00\u5f20\u5173\u4e8e{c}\u7684\u7167\u7247\u3002", 34 | "\u635f\u574f\u7684{c}\u7684jpeg\u7167\u7247\u3002", 35 | "{c}\u7684\u6a21\u7cca\u7167\u7247\u3002", 36 | "{c}\u7684\u76f8\u7247\u3002", 37 | "\u4e00\u5f20{c}\u7684\u597d\u7167\u7247\u3002", 38 | "{c}\u7684\u6e32\u67d3\u7167\u3002", 39 | "\u89c6\u9891\u6e38\u620f\u4e2d\u7684{c}\u3002", 40 | "\u4e00\u5f20{c}\u7684\u7167\u7247\u3002", 41 | "{c}\u7684\u6d82\u9e26\u3002", 42 | "{c}\u7684\u8fd1\u8ddd\u79bb\u7167\u7247\u3002", 43 | "{c}\u7684\u6298\u7eb8\u3002", 44 | "{c}\u5728\u89c6\u9891\u6e38\u620f\u4e2d\u3002", 45 | "{c}\u7684\u8349\u56fe\u3002", 46 | "{c}\u7684\u6d82\u9e26\u7167\u3002", 47 | "{c}\u7684\u6298\u7eb8\u5f62\u72b6\u3002", 48 | "\u4f4e\u5206\u8fa8\u7387\u7684{c}\u7684\u7167\u7247\u3002", 49 | "\u73a9\u5177{c}\u3002", 50 | "{c}\u7684\u526f\u672c\u3002", 51 | "{c}\u7684\u5e72\u51c0\u7684\u7167\u7247\u3002", 52 | "\u4e00\u5f20\u5927{c}\u7684\u7167\u7247\u3002", 53 | "{c}\u7684\u91cd\u73b0\u3002", 54 | "\u4e00\u5f20\u6f02\u4eae\u7684{c}\u7684\u7167\u7247\u3002", 55 | "\u4e00\u5f20\u5947\u602a\u7684{c}\u7684\u7167\u7247\u3002", 56 | "\u6a21\u7cca\u7684{c}\u7684\u7167\u7247\u3002", 57 | "\u5361\u901a{c}\u3002", 58 | "{c}\u7684\u827a\u672f\u4f5c\u54c1\u3002", 59 | "{c}\u7684\u7d20\u63cf\u3002", 60 | "\u523a\u7ee3{c}\u3002", 61 | "{c}\u7684\u50cf\u7d20\u7167\u3002", 62 | "{c}\u7684\u62cd\u7167\u3002", 63 | "{c}\u7684\u635f\u574f\u7684\u7167\u7247\u3002", 64 | "\u9ad8\u8d28\u91cf\u7684{c}\u7684\u7167\u7247\u3002", 65 | "\u6bdb\u7ed2\u73a9\u5177{c}\u3002", 66 | "\u6f02\u4eae\u7684{c}\u7684\u7167\u7247\u3002", 67 | "\u5c0f{c}\u7684\u7167\u7247\u3002", 68 | "\u7167\u7247\u662f\u5947\u602a\u7684{c}\u3002", 69 | "\u6f2b\u753b{c}\u3002", 70 | "{c}\u7684\u827a\u672f\u7167\u3002", 71 | "{c}\u7684\u56fe\u5f62\u3002", 72 | "\u5927{c}\u7684\u7167\u7247\u3002", 73 | "\u9ed1\u767d\u7684{c}\u7684\u7167\u7247\u3002", 74 | "{c}\u6bdb\u7ed2\u73a9\u5177\u3002", 75 | "\u4e00\u5f20{c}\u7684\u6df1\u8272\u7167\u7247\u3002", 76 | "{c}\u7684\u6444\u5f71\u56fe\u3002", 77 | 
"{c}\u7684\u6d82\u9e26\u7167\u3002", 78 | "\u73a9\u5177\u5f62\u72b6\u7684{c}\u3002", 79 | "\u62cd\u4e86{c}\u7684\u7167\u7247\u3002", 80 | "\u9177\u9177\u7684{c}\u7684\u7167\u7247\u3002", 81 | "\u7167\u7247\u91cc\u7684\u5c0f{c}\u3002", 82 | "{c}\u7684\u523a\u9752\u3002" 83 | ] 84 | } -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from https://github.com/pytorch/vision/blob/main/torchvision/datasets/flickr.py 3 | Thanks to the authors of torchvision 4 | """ 5 | from collections import defaultdict 6 | import glob 7 | import os 8 | from collections import defaultdict 9 | from html.parser import HTMLParser 10 | from typing import Any, Callable, Dict, List, Optional, Tuple 11 | 12 | from PIL import Image 13 | from torchvision.datasets import VisionDataset 14 | 15 | class Flickr(VisionDataset): 16 | 17 | def __init__( 18 | self, 19 | root: str, 20 | ann_file: str, 21 | transform: Optional[Callable] = None, 22 | target_transform: Optional[Callable] = None, 23 | ) -> None: 24 | super().__init__(root, transform=transform, target_transform=target_transform) 25 | self.ann_file = os.path.expanduser(ann_file) 26 | data = defaultdict(list) 27 | with open(ann_file) as fd: 28 | fd.readline() 29 | for line in fd: 30 | line = line.strip() 31 | if line: 32 | # some lines have comma in the caption, se we make sure we do the split correctly 33 | img, caption = line.strip().split(".jpg,") 34 | img = img + ".jpg" 35 | data[img].append(caption) 36 | self.data = list(data.items()) 37 | 38 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 39 | """ 40 | Args: 41 | index (int): Index 42 | 43 | Returns: 44 | tuple: Tuple (image, target). target is a list of captions for the image. 
45 | """ 46 | img, captions = self.data[index] 47 | 48 | # Image 49 | img = Image.open(os.path.join(self.root, img)).convert("RGB") 50 | if self.transform is not None: 51 | img = self.transform(img) 52 | 53 | # Captions 54 | target = captions 55 | if self.target_transform is not None: 56 | target = self.target_transform(target) 57 | 58 | return img, target 59 | 60 | 61 | def __len__(self) -> int: 62 | return len(self.data) -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/it_zeroshot_classification_templates.json: -------------------------------------------------------------------------------- 1 | { 2 | "imagenet1k": [ 3 | "una brutta foto di {c}", 4 | "una scultura di {c}", 5 | "una foto di {c} difficilmente visibile", 6 | "una foto a bassa risoluzione di {c}", 7 | "un rendering di {c}", 8 | "graffiti di {c}", 9 | "una pessima foto di {c}", 10 | "una foto ritagliata di {c}", 11 | "un tatuaggio di {c}", 12 | "{c} ricamato", 13 | "{c} ricamata", 14 | "una foto luminosa di {c}", 15 | "una foto di {c} pulito", 16 | "una foto di {c} pulita", 17 | "una foto di {c} sporco", 18 | "una foto di {c} sporca", 19 | "una foto di {c}\u00a0carino", 20 | "una foto di {c} carina", 21 | "una foto di {c} strano", 22 | "una foto di {c} strana", 23 | "una foto di {c} piccolo", 24 | "una foto di {c} piccola", 25 | "una foto di {c} largo", 26 | "una foto di {c} larga", 27 | "una foto di {c} grande", 28 | "una foto scura di {c}", 29 | "un disegno di {c}", 30 | "{c} di plastica", 31 | "una foto del {c} bella", 32 | "una foto ravvicinata di {c}", 33 | "una foto in bianco e nero di {c}", 34 | "un dipinto di {c}", 35 | "una foto sgranata di {c}", 36 | "una foto ritagliata di {c}", 37 | "una foto sfocata di {c}", 38 | "una buona foto di {c}", 39 | "una riproduzione di {c}", 40 | "un rendering di {c}", 41 | "{c} in un video gioco", 42 | "uno scarabocchio di {c}", 43 | "un origami di {c}", 44 | "uno sketch di {c}", 45 | "una bozza di {c}", 46 | "una foto a bassa risoluzione di {c}", 47 | "un giocattolo di {c}", 48 | "una resa di {c}", 49 | "{c} come cartone animato", 50 | "un'opera di {c}", 51 | "un peluche di {c}" 52 | ] 53 | } -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/jp_zeroshot_classification_templates.json: -------------------------------------------------------------------------------- 1 | { 2 | "imagenet1k": [ 3 | "{c}\u306e\u60aa\u3044\u5199\u771f", 4 | "\u591a\u304f\u306e{c}\u306e\u5199\u771f", 5 | "{c}\u306e\u5f6b\u523b", 6 | "\u898b\u3065\u3089\u3044{c}\u306e\u5199\u771f", 7 | "{c}\u306e\u4f4e\u89e3\u50cf\u5ea6\u5199\u771f", 8 | "{c}\u306e\u30ec\u30f3\u30c0\u30ea\u30f3\u30b0", 9 | "{c}\u306e\u843d\u66f8\u304d", 10 | "{c}\u306e\u30c8\u30ea\u30df\u30f3\u30b0\u5199\u771f", 11 | "{c}\u306e\u30bf\u30c8\u30a5\u30fc", 12 | "\u523a\u7e4d\u3055\u308c\u305f{c}", 13 | "{c}\u306e\u660e\u308b\u3044\u5199\u771f", 14 | "\u304d\u308c\u3044\u306a{c}\u306e\u5199\u771f", 15 | "\u6c5a\u308c\u305f{c}\u306e\u5199\u771f", 16 | "{c}\u306e\u6697\u3044\u5199\u771f", 17 | "{c}\u306e\u7d75", 18 | "\u79c1\u306e{c}\u306e\u5199\u771f", 19 | "\u30d7\u30e9\u30b9\u30c1\u30c3\u30af\u88fd\u306e{c}", 20 | "\u304b\u3063\u3053\u3044\u3044{c}\u306e\u5199\u771f", 21 | "{c}\u306e\u30af\u30ed\u30fc\u30ba\u30a2\u30c3\u30d7\u5199\u771f", 22 | "{c}\u306e\u767d\u9ed2\u5199\u771f", 23 | "{c}\u306e\u30d4\u30af\u30bb\u30eb\u5199\u771f", 24 | "jpeg\u3067\u52a0\u5de5\u3057\u305f{c}\u306e\u5199\u771f", 25 | 
"{c}\u306e\u307c\u3084\u3051\u305f\u5199\u771f", 26 | "{c}\u306e\u5199\u771f", 27 | "{c}\u306e\u826f\u3044\u5199\u771f", 28 | "\u30b2\u30fc\u30e0\u306b\u767b\u5834\u3059\u308b{c}", 29 | "\u6298\u308a\u7d19\u3067\u4f5c\u3063\u305f{c}", 30 | "{c}\u306e\u30b9\u30b1\u30c3\u30c1", 31 | "\u304a\u3082\u3061\u3083\u306e{c}", 32 | "{c}\u306e\u6f14\u51fa", 33 | "\u5927\u304d\u306a{c}\u306e\u5199\u771f", 34 | "\u7d20\u6575\u306a{c}\u306e\u5199\u771f", 35 | "\u5947\u5999\u306a{c}\u306e\u5199\u771f", 36 | "\u6f2b\u753b\u306e{c}", 37 | "{c}\u306e\u82b8\u8853", 38 | "{c}\u306e\u306c\u3044\u3050\u308b\u307f", 39 | "\u5c0f\u3055\u306a{c}\u306e\u5199\u771f" 40 | ] 41 | } -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/multilingual_mscoco.py: -------------------------------------------------------------------------------- 1 | from subprocess import call 2 | import os, json 3 | 4 | from torchvision.datasets import VisionDataset 5 | from PIL import Image 6 | 7 | 8 | GITHUB_MAIN_ORIGINAL_ANNOTATION_PATH = 'https://github.com/mehdidc/retrieval_annotations/releases/download/1.0.0/coco_{}_karpathy.json' 9 | GITHUB_MAIN_PATH = 'https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/XTD10/' 10 | SUPPORTED_LANGUAGES = ['es', 'it', 'ko', 'pl', 'ru', 'tr', 'zh', 'en'] 11 | 12 | IMAGE_INDEX_FILE = 'mscoco-multilingual_index.json' 13 | IMAGE_INDEX_FILE_DOWNLOAD_NAME = 'test_image_names.txt' 14 | 15 | CAPTIONS_FILE_DOWNLOAD_NAME = 'test_1kcaptions_{}.txt' 16 | CAPTIONS_FILE_NAME = 'multilingual_mscoco_captions-{}.json' 17 | 18 | ORIGINAL_ANNOTATION_FILE_NAME = 'coco_{}_karpathy.json' 19 | 20 | 21 | 22 | class Multilingual_MSCOCO(VisionDataset): 23 | 24 | def __init__(self, root, ann_file, transform=None, target_transform=None): 25 | super().__init__(root, transform=transform, target_transform=target_transform) 26 | self.ann_file = os.path.expanduser(ann_file) 27 | with open(ann_file, 'r') as fp: 28 | data = json.load(fp) 29 | 30 | self.data = [(img_path, txt) for img_path, txt in zip(data['image_paths'], data['annotations'])] 31 | 32 | def __getitem__(self, index): 33 | img, captions = self.data[index] 34 | 35 | # Image 36 | img = Image.open(os.path.join(self.root, img)).convert("RGB") 37 | if self.transform is not None: 38 | img = self.transform(img) 39 | 40 | # Captions 41 | target = [captions, ] 42 | if self.target_transform is not None: 43 | target = self.target_transform(target) 44 | 45 | return img, target 46 | 47 | 48 | def __len__(self) -> int: 49 | return len(self.data) 50 | 51 | 52 | def _get_downloadable_file(filename, download_url, is_json=True): 53 | if (os.path.exists(filename) == False): 54 | print("Downloading", download_url) 55 | call("wget {} -O {}".format(download_url, filename), shell=True) 56 | with open(filename, 'r') as fp: 57 | if (is_json): 58 | return json.load(fp) 59 | return [line.strip() for line in fp.readlines()] 60 | 61 | 62 | def create_annotation_file(root, lang_code): 63 | print("Downloading multilingual_ms_coco index file") 64 | download_path = os.path.join(GITHUB_MAIN_PATH, IMAGE_INDEX_FILE_DOWNLOAD_NAME) 65 | target_images = _get_downloadable_file("multilingual_coco_images.txt", download_path, False) 66 | 67 | print("Downloading multilingual_ms_coco captions:", lang_code) 68 | download_path = os.path.join(GITHUB_MAIN_PATH, CAPTIONS_FILE_DOWNLOAD_NAME.format(lang_code)) 69 | target_captions = _get_downloadable_file('raw_multilingual_coco_captions_{}.txt'.format(lang_code), 
download_path, False) 70 | 71 | number_of_missing_images = 0 72 | valid_images, valid_annotations, valid_indicies = [], [], [] 73 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 74 | # Create a new file name that includes the root split 75 | root_split = 'val2014' if 'val' in img else 'train2014' 76 | filename_with_root_split = "{}/{}".format(root_split, img) 77 | 78 | if (os.path.exists(filename_with_root_split)): 79 | print("Missing image file", img) 80 | number_of_missing_images += 1 81 | continue 82 | 83 | valid_images.append(filename_with_root_split) 84 | valid_annotations.append(txt) 85 | valid_indicies.append(i) 86 | 87 | if (number_of_missing_images > 0): 88 | print("*** WARNING *** missing {} files.".format(number_of_missing_images)) 89 | 90 | with open(os.path.join(root, CAPTIONS_FILE_NAME.format(lang_code)), 'w') as fp: 91 | json.dump({'image_paths': valid_images, 'annotations': valid_annotations, 'indicies': valid_indicies}, fp) 92 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/objectnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code adapted from https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/objectnet.py 3 | Thanks to the authors of wise-ft 4 | """ 5 | 6 | import os 7 | import json 8 | from pathlib import Path 9 | import PIL 10 | 11 | import numpy as np 12 | 13 | import torch 14 | from torchvision import datasets 15 | from torchvision.transforms import Compose 16 | 17 | from pathlib import Path 18 | 19 | def get_metadata(folder): 20 | metadata = Path(folder) 21 | 22 | with open(metadata / 'folder_to_objectnet_label.json', 'r') as f: 23 | folder_map = json.load(f) 24 | folder_map = {v: k for k, v in folder_map.items()} 25 | with open(metadata / 'objectnet_to_imagenet_1k.json', 'r') as f: 26 | objectnet_map = json.load(f) 27 | 28 | with open(metadata / 'pytorch_to_imagenet_2012_id.json', 'r') as f: 29 | pytorch_map = json.load(f) 30 | pytorch_map = {v: k for k, v in pytorch_map.items()} 31 | 32 | with open(metadata / 'imagenet_to_label_2012_v2', 'r') as f: 33 | imagenet_map = {v.strip(): str(pytorch_map[i]) for i, v in enumerate(f)} 34 | 35 | folder_to_ids, class_sublist = {}, [] 36 | classnames = [] 37 | for objectnet_name, imagenet_names in objectnet_map.items(): 38 | imagenet_names = imagenet_names.split('; ') 39 | imagenet_ids = [int(imagenet_map[imagenet_name]) for imagenet_name in imagenet_names] 40 | class_sublist.extend(imagenet_ids) 41 | folder_to_ids[folder_map[objectnet_name]] = imagenet_ids 42 | 43 | class_sublist = sorted(class_sublist) 44 | class_sublist_mask = [(i in class_sublist) for i in range(1000)] 45 | classname_map = {v: k for k, v in folder_map.items()} 46 | return class_sublist, class_sublist_mask, folder_to_ids, classname_map 47 | 48 | class ObjectNetDataset(datasets.ImageFolder): 49 | 50 | def __init__(self, root, transform): 51 | (self._class_sublist, 52 | self.class_sublist_mask, 53 | self.folders_to_ids, 54 | self.classname_map) = get_metadata(root) 55 | subdir = os.path.join(root, "objectnet-1.0", "images") 56 | label_map = {name: idx for idx, name in enumerate(sorted(list(self.folders_to_ids.keys())))} 57 | self.label_map = label_map 58 | super().__init__(subdir, transform=transform) 59 | self.samples = [ 60 | d for d in self.samples 61 | if os.path.basename(os.path.dirname(d[0])) in self.label_map 62 | ] 63 | self.imgs = self.samples 64 | self.classes = 
sorted(list(self.folders_to_ids.keys())) 65 | self.classes = [self.classname_map[c].lower() for c in self.classes] 66 | 67 | def __len__(self): 68 | return len(self.samples) 69 | 70 | def __getitem__(self, index): 71 | path, target = self.samples[index] 72 | sample = self.loader(path) 73 | if self.transform is not None: 74 | sample = self.transform(sample) 75 | label = os.path.basename(os.path.dirname(path)) 76 | return sample, self.label_map[label] -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/sugar_crepe.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import Dataset 3 | from PIL import Image 4 | import json 5 | class SugarCrepe(Dataset): 6 | 7 | def __init__(self, root, ann_file, transform=None): 8 | self.root = root 9 | self.ann = json.load(open(ann_file)) 10 | self.transform = transform 11 | 12 | def __getitem__(self, idx): 13 | data = self.ann[str(idx)] 14 | img = Image.open(os.path.join(self.root, data['filename'])) 15 | if self.transform is not None: 16 | img = self.transform(img) 17 | caption = data['caption'] 18 | negative_caption = data['negative_caption'] 19 | return img, [caption, negative_caption] 20 | 21 | def __len__(self): 22 | return len(self.ann) -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/datasets/tfds.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | 4 | 5 | def download_tfds_dataset(name, data_dir=None): 6 | import tensorflow_datasets as tfds 7 | import timm 8 | builder = tfds.builder(name, data_dir=data_dir) 9 | builder.download_and_prepare() 10 | 11 | def disable_gpus_on_tensorflow(): 12 | import tensorflow as tf 13 | tf.config.set_visible_devices([], 'GPU') 14 | 15 | class VTABIterableDataset(torch.utils.data.IterableDataset): 16 | 17 | def __init__(self, tfds_dataset, split="test", input_name="image", label_name="label", input_mode="RGB", transform=None, target_transform=None, classes=None): 18 | self.tfds_dataset = tfds_dataset 19 | self.input_name = input_name 20 | self.label_name = label_name 21 | self.transform = transform 22 | self.target_transform = target_transform 23 | self.input_mode = input_mode 24 | self.num_examples = tfds_dataset.get_num_samples(split) 25 | self.split = split 26 | if classes is None: 27 | self.classes = tfds_dataset._dataset_builder.info.features['label'].names 28 | else: 29 | self.classes = classes 30 | def __iter__(self): 31 | worker_info = torch.utils.data.get_worker_info() 32 | iterator = self.tfds_dataset.get_tf_data(self.split, batch_size=1, epochs=1, for_eval=True) 33 | if worker_info is not None: 34 | iterator = iterator.shard(index=worker_info.id, num_shards=worker_info.num_workers) 35 | nb = 0 36 | for data in iterator: 37 | inputs = (data[self.input_name].numpy()) 38 | labels = data[self.label_name].numpy() 39 | for input, label in zip(inputs, labels): 40 | input = Image.fromarray(input, mode=self.input_mode) 41 | if self.transform is not None: 42 | input = self.transform(input) 43 | if self.target_transform is not None: 44 | label = self.target_transform(label) 45 | yield input, label 46 | 47 | def __len__(self): 48 | return self.num_examples 49 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__init__.py -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/captioning.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/captioning.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/image_caption_selection.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/image_caption_selection.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/linear_probe.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/linear_probe.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/zeroshot_classification.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/zeroshot_classification.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/__pycache__/zeroshot_retrieval.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/metrics/__pycache__/zeroshot_retrieval.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/captioning.py: -------------------------------------------------------------------------------- 1 | import json 2 | from open_clip import tokenize 3 | from tqdm.auto import tqdm 4 | from open_clip.tokenizer import _tokenizer 5 | 6 | 7 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from pycocoevalcap.meteor.meteor import Meteor 10 | from pycocoevalcap.rouge.rouge import Rouge 11 | from pycocoevalcap.cider.cider import Cider 12 | from pycocoevalcap.spice.spice import Spice 13 | 14 | 15 | """ 16 | Code adapted from https://github.com/salaniz/pycocoevalcap/blob/master/eval.py 17 | Thanks to @salaniz for the code! 
18 | """ 19 | class COCOEvalCap: 20 | def __init__(self, results): 21 | self.evalImgs = [] 22 | self.eval = {} 23 | self.imgToEval = {} 24 | self.results = results 25 | def evaluate(self): 26 | gts = {} 27 | res = {} 28 | for imgId, r in enumerate(self.results): 29 | gts[imgId] = r['true'] 30 | res[imgId] = r['gen'] 31 | # ================================================= 32 | # Set up scorers 33 | # ================================================= 34 | print('tokenization...') 35 | tokenizer = PTBTokenizer() 36 | gts = tokenizer.tokenize(gts) 37 | res = tokenizer.tokenize(res) 38 | 39 | # ================================================= 40 | # Set up scorers 41 | # ================================================= 42 | print('setting up scorers...') 43 | scorers = [ 44 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 45 | (Meteor(),"METEOR"), 46 | (Rouge(), "ROUGE_L"), 47 | (Cider(), "CIDEr"), 48 | (Spice(), "SPICE") 49 | ] 50 | 51 | # ================================================= 52 | # Compute scores 53 | # ================================================= 54 | for scorer, method in scorers: 55 | print('computing %s score...'%(scorer.method())) 56 | score, scores = scorer.compute_score(gts, res) 57 | if type(method) == list: 58 | for sc, scs, m in zip(score, scores, method): 59 | self.setEval(sc, m) 60 | self.setImgToEvalImgs(scs, gts.keys(), m) 61 | print("%s: %0.3f"%(m, sc)) 62 | else: 63 | self.setEval(score, method) 64 | self.setImgToEvalImgs(scores, gts.keys(), method) 65 | print("%s: %0.3f"%(method, score)) 66 | self.setEvalImgs() 67 | 68 | def setEval(self, score, method): 69 | self.eval[method] = score 70 | 71 | def setImgToEvalImgs(self, scores, imgIds, method): 72 | for imgId, score in zip(imgIds, scores): 73 | if not imgId in self.imgToEval: 74 | self.imgToEval[imgId] = {} 75 | self.imgToEval[imgId]["image_id"] = imgId 76 | self.imgToEval[imgId][method] = score 77 | 78 | def setEvalImgs(self): 79 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] 80 | 81 | def evaluate(model, dataloader, batch_size, device, transform, train_dataloader=None, num_workers=None, amp=True, verbose=False): 82 | results = [] 83 | image_id = 0 84 | gt = [] 85 | for idx, (img, captions) in enumerate(tqdm(dataloader)): 86 | out = model.generate(img.to(device)) 87 | decoded = [_tokenizer.decode(i).split("")[0].replace("", "").strip() for i in out.cpu().numpy()] 88 | for pred, true in zip(decoded, captions): 89 | true = [{'caption': t} for t in true] 90 | pred = [{'caption': pred}] 91 | results.append({"image_id":image_id, "gen":pred, "true": true}) 92 | image_id += 1 93 | coco_eval = COCOEvalCap(results) 94 | coco_eval.evaluate() 95 | metrics = coco_eval.eval 96 | # print output evaluation scores 97 | for metric, score in metrics.items(): 98 | print(f'{metric}: {score:.3f}') 99 | return metrics 100 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/metrics/image_caption_selection.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from tqdm import tqdm 7 | 8 | def evaluate(model, dataloader, tokenizer, device, amp=True, recall_k_list=[5]): 9 | """ 10 | Evaluate the model on the given dataset 11 | 12 | Parameters 13 | ---------- 14 | 15 | model: torch.nn,Module 16 | CLIP-like model with `encode_image` and `encode_text` 17 | 18 | dataloader: torch.utils.data.Dataloader 
19 | dataloader to use for evaluation 20 | 21 | tokenizer: 22 | text tokenizer, i.e. convert list of strings to torch.Tensor of integers 23 | 24 | device: cpu/cuda 25 | 26 | amp: whether to use automatic mixed precision 27 | 28 | Returns 29 | ------- 30 | 31 | dict of accuracy metric 32 | """ 33 | autocast = torch.cuda.amp.autocast if amp else suppress 34 | preds = [] 35 | for batch_images, batch_texts in tqdm(dataloader): 36 | batch_images = batch_images.to(device) 37 | # tokenize all texts in the batch 38 | batch_texts_tok = tokenizer([text for i, texts in enumerate(batch_texts) for text in texts]).to(device) 39 | nb_texts_for_each_image = [len(texts) for texts in batch_texts] 40 | 41 | # compute the embedding of images and texts 42 | with torch.no_grad(), autocast(): 43 | batch_images_emb = F.normalize(model.encode_image(batch_images), dim=-1).cpu() 44 | batch_texts_emb = F.normalize(model.encode_text(batch_texts_tok), dim=-1).cpu() 45 | start = 0 46 | for i, nb in enumerate(nb_texts_for_each_image): 47 | end = start + nb 48 | image_emb = batch_images_emb[i:i+1] 49 | texts_emb = batch_texts_emb[start:end] 50 | scores = image_emb @ texts_emb.t() 51 | scores = scores[0] 52 | pred = scores.argmax().item() 53 | start = end 54 | preds.append(pred) 55 | pred = torch.Tensor(preds).long() 56 | acc = (pred==0).float().mean().item() # 0 is the index of the caption, the rest (>0) are considered negative captions 57 | metrics = {} 58 | metrics[f"acc"] = acc 59 | return metrics -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/model_collection.py: -------------------------------------------------------------------------------- 1 | import open_clip 2 | 3 | def get_model_collection_from_file(path): 4 | return [l.strip().split(",") for l in open(path).readlines()] 5 | 6 | model_collection = { 7 | "openclip_base": [ 8 | ("ViT-B-32-quickgelu", "laion400m_e32"), 9 | ("ViT-B-32","laion2b_e16"), 10 | ("ViT-B-32","laion2b_s34b_b79k"), 11 | ("ViT-B-16","laion400m_e32"), 12 | ("ViT-B-16-plus-240","laion400m_e32"), 13 | ("ViT-L-14","laion400m_e32"), 14 | ("ViT-L-14","laion2b_s32b_b82k"), 15 | ("ViT-H-14","laion2b_s32b_b79k"), 16 | ("ViT-g-14","laion2b_s12b_b42k"), 17 | ], 18 | "openclip_multilingual":[ 19 | ("xlm-roberta-base-ViT-B-32", "laion5b_s13b_b90k"), 20 | ("xlm-roberta-large-ViT-H-14", "frozen_laion5b_s13b_b90k"), 21 | ], 22 | "openclip_all": open_clip.list_pretrained(), 23 | "openai": [ 24 | ("ViT-B-32","openai"), 25 | ("ViT-B-16","openai"), 26 | ("ViT-L-14", "openai"), 27 | ("ViT-L-14-336", "openai"), 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import torch 3 | from .open_clip import load_open_clip 4 | from .japanese_clip import load_japanese_clip 5 | 6 | # loading function must return (model, transform, tokenizer) 7 | TYPE2FUNC = { 8 | "open_clip": load_open_clip, 9 | "ja_clip": load_japanese_clip 10 | } 11 | MODEL_TYPES = list(TYPE2FUNC.keys()) 12 | 13 | 14 | def load_clip( 15 | model_type: str, 16 | model_name: str, 17 | pretrained: str, 18 | cache_dir: str, 19 | device: Union[str, torch.device] = "cuda", 20 | lora = False 21 | ): 22 | assert model_type in MODEL_TYPES, f"model_type={model_type} is invalid!" 
23 | load_func = TYPE2FUNC[model_type] 24 | return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device, lora=lora) 25 | -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/models/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/models/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/models/__pycache__/japanese_clip.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/models/__pycache__/japanese_clip.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/models/__pycache__/open_clip.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_benchmark/clip_benchmark/models/__pycache__/open_clip.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_benchmark/clip_benchmark/models/japanese_clip.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import torch 3 | 4 | 5 | class DictTensor: 6 | """ 7 | enable to do `tokenizer(texts).to(device)` 8 | """ 9 | def __init__(self, d: Dict[str, torch.Tensor]): 10 | self.d = d 11 | 12 | def to(self, device): 13 | return {k: v.to(device) for k, v in self.d.items()} 14 | 15 | 16 | class JaCLIPForBenchmark: 17 | """ 18 | enable to do model.encode_text(dict_tensor) 19 | """ 20 | def __init__(self, model): 21 | self.model = model 22 | 23 | def encode_text(self, dict_tensor): 24 | return self.model.get_text_features(**dict_tensor) 25 | 26 | def encode_image(self, image): 27 | return self.model.get_image_features(image) 28 | 29 | 30 | def load_japanese_clip(pretrained: str, device="cpu", **kwargs): 31 | """ 32 | Load Japanese CLIP/CLOOB by rinna (https://github.com/rinnakk/japanese-clip) 33 | Remarks: 34 | - You must input not only input_ids but also attention_masks and position_ids when doing `model.encode_text()` to make it work correctly. 
35 | """ 36 | try: 37 | import japanese_clip as ja_clip 38 | except ImportError: 39 | raise ImportError("Install `japanese_clip` by `pip install git+https://github.com/rinnakk/japanese-clip.git`") 40 | cache_dir = kwargs.pop("cache_dir", None) 41 | model, transform = ja_clip.load(pretrained, device=device, cache_dir=cache_dir) 42 | 43 | class JaTokenizerForBenchmark: 44 | def __init__(self, ): 45 | self.tokenizer = ja_clip.load_tokenizer() 46 | 47 | def __call__(self, texts) -> Dict[str, torch.Tensor]: 48 | inputs = ja_clip.tokenize(texts, tokenizer=self.tokenizer, device="cpu") 49 | return DictTensor(inputs) 50 | 51 | def __len__(self): 52 | return len(self.tokenizer) 53 | 54 | return JaCLIPForBenchmark(model), transform, JaTokenizerForBenchmark() 55 | -------------------------------------------------------------------------------- /CLIP_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_eval/__init__.py -------------------------------------------------------------------------------- /CLIP_eval/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_eval/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_eval/__pycache__/clip_robustbench.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_eval/__pycache__/clip_robustbench.cpython-311.pyc -------------------------------------------------------------------------------- /CLIP_eval/__pycache__/eval_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/CLIP_eval/__pycache__/eval_utils.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /LLaVA/llava/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/__pycache__/constants.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/__pycache__/constants.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/__pycache__/conversation.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/__pycache__/conversation.cpython-311.pyc -------------------------------------------------------------------------------- 
/LLaVA/llava/__pycache__/mm_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/__pycache__/mm_utils.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role =
rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 
| ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 
93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in 
answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g.
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = 
read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = 
tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 37 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 38 | answers_file = os.path.expanduser(args.answers_file) 39 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 40 | ans_file = open(answers_file, "w") 41 | for line in tqdm(questions): 42 | idx = line["question_id"] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | cur_prompt = qs 46 | if model.config.mm_use_im_start_end: 47 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 48 | else: 49 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 50 | 51 | conv = conv_templates[args.conv_mode].copy() 52 | conv.append_message(conv.roles[0], qs) 53 | conv.append_message(conv.roles[1], None) 54 | prompt = conv.get_prompt() 55 | 56 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 57 | 58 | 
image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 59 | image_tensor = process_images([image], image_processor, model.config)[0] 60 | 61 | with torch.inference_mode(): 62 | output_ids = model.generate( 63 | input_ids, 64 | images=image_tensor.unsqueeze(0).half().cuda(), 65 | image_sizes=[image.size], 66 | do_sample=True if args.temperature > 0 else False, 67 | temperature=args.temperature, 68 | top_p=args.top_p, 69 | num_beams=args.num_beams, 70 | # no_repeat_ngram_size=3, 71 | max_new_tokens=1024, 72 | use_cache=True) 73 | 74 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 75 | 76 | ans_id = shortuuid.uuid() 77 | ans_file.write(json.dumps({"question_id": idx, 78 | "prompt": cur_prompt, 79 | "text": outputs, 80 | "answer_id": ans_id, 81 | "model_id": model_name, 82 | "metadata": {}}) + "\n") 83 | ans_file.flush() 84 | ans_file.close() 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 89 | parser.add_argument("--model-base", type=str, default=None) 90 | parser.add_argument("--image-folder", type=str, default="") 91 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 92 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 93 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 94 | parser.add_argument("--num-chunks", type=int, default=1) 95 | parser.add_argument("--chunk-idx", type=int, default=0) 96 | parser.add_argument("--temperature", type=float, default=0.2) 97 | parser.add_argument("--top_p", type=float, default=None) 98 | parser.add_argument("--num_beams", type=int, default=1) 99 | args = parser.parse_args() 100 | 101 | eval_model(args) 102 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 
29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | 
scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /LLaVA/llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /LLaVA/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | #from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /LLaVA/llava/model/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/__pycache__/__init__.cpython-311.pyc 
-------------------------------------------------------------------------------- /LLaVA/llava/model/__pycache__/builder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/__pycache__/builder.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/__pycache__/llava_arch.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/__pycache__/llava_arch.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /LLaVA/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | 
auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/__pycache__/llava_llama.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/__pycache__/llava_llama.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/__pycache__/llava_mpt.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/__pycache__/llava_mpt.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/attention.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/attention.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/blocks.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/blocks.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-311.pyc -------------------------------------------------------------------------------- 
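The delta-merging loop in apply_delta.py above adds each base-model tensor back onto the matching delta tensor; for the size-mismatched cases it allows (model.embed_tokens.weight and lm_head.weight, where the delta presumably carries extra rows for newly added tokens), only the overlapping slice is updated. A toy, self-contained illustration of that slice-wise addition, with made-up shapes:

import torch

# Hypothetical shapes: the delta vocabulary is a few rows larger than the base one.
delta_embed = torch.zeros(32004, 4096)  # stand-in for embed_tokens / lm_head in the delta checkpoint
base_embed = torch.randn(32000, 4096)   # matching tensor in the base model

# Only the overlapping region receives the base weights; the trailing rows keep
# the delta's own values for the newly added tokens.
delta_embed[:base_embed.shape[0], :base_embed.shape[1]] += base_embed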
/LLaVA/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/__pycache__/norm.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/language_model/mpt/__pycache__/norm.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast 3 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 4 | NUM_SENTINEL_TOKENS: int = 100 5 | 6 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 7 | """Adds sentinel tokens and padding token (if missing). 8 | 9 | Expands the tokenizer vocabulary to include sentinel tokens 10 | used in mixture-of-denoiser tasks as well as a padding token. 11 | 12 | All added tokens are added as special tokens. No tokens are 13 | added if sentinel tokens and padding token already exist. 14 | """ 15 | sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)] 16 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 17 | if tokenizer.pad_token is None: 18 | tokenizer.add_tokens('<pad>', special_tokens=True) 19 | tokenizer.pad_token = '<pad>' 20 | assert tokenizer.pad_token_id is not None 21 | sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]) 22 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 23 | tokenizer.sentinel_token_ids = _sentinel_token_ids 24 | 25 | class AutoTokenizerForMOD(AutoTokenizer): 26 | """AutoTokenizer + Adaptation for MOD. 27 | 28 | A simple wrapper around AutoTokenizer to make instantiating 29 | an MOD-adapted tokenizer a bit easier. 30 | 31 | MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>), 32 | a padding token, and a property to get the token ids of the 33 | sentinel tokens. 
34 | """ 35 | 36 | @classmethod 37 | def from_pretrained(cls, *args, **kwargs): 38 | """See `AutoTokenizer.from_pretrained` docstring.""" 39 | tokenizer = super().from_pretrained(*args, **kwargs) 40 | adapt_tokenizer_for_denoising(tokenizer) 41 | return tokenizer -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | import torch 4 | import torch.nn as nn 5 | from .attention import ATTN_CLASS_REGISTRY 6 | from .norm import NORM_CLASS_REGISTRY 7 | 8 | class MPTMLP(nn.Module): 9 | 10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None): 11 | super().__init__() 12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 13 | self.act = nn.GELU(approximate='none') 14 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 15 | self.down_proj._is_residual = True 16 | 17 | def forward(self, x): 18 | return self.down_proj(self.act(self.up_proj(x))) 19 | 20 | class MPTBlock(nn.Module): 21 | 22 | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs): 23 | del kwargs 24 | super().__init__() 25 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 26 | attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']] 27 | self.norm_1 = norm_class(d_model, device=device) 28 | self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device) 29 | self.norm_2 = norm_class(d_model, device=device) 30 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) 31 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 32 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 33 | 34 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 35 | a = self.norm_1(x) 36 | (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) 37 | x = x + self.resid_attn_dropout(b) 38 | m = self.norm_2(x) 39 | n = self.ffn(m) 40 | x = x + self.resid_ffn_dropout(n) 41 | return (x, attn_weights, past_key_value) -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return 
super().forward(input) -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/meta_init_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import torch 3 | import torch.nn as nn 4 | 5 | @contextmanager 6 | def init_empty_weights(include_buffers: bool=False): 7 | """Meta initialization context manager. 8 | 9 | A context manager under which models are initialized with all parameters 10 | on the meta device, therefore creating an empty model. Useful when just 11 | initializing the model would blow the available RAM. 12 | 13 | Args: 14 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 15 | not to also put all buffers on the meta device while initializing. 16 | 17 | Example: 18 | ```python 19 | import torch.nn as nn 20 | 21 | # Initialize a model with 100 billion parameters in no time and without using any RAM. 22 | with init_empty_weights(): 23 | tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) 24 | ``` 25 | 26 | <Tip warning={true}> 27 | 28 | Any model created under this context manager has no weights. As such you can't do something like 29 | `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. 30 | 31 | </Tip> 32 | """ 33 | with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f: 34 | yield f 35 | 36 | @contextmanager 37 | def init_on_device(device: torch.device, include_buffers: bool=False): 38 | """Device initialization context manager. 39 | 40 | A context manager under which models are initialized with all parameters 41 | on the specified device. 42 | 43 | Args: 44 | device (`torch.device`): Device to initialize all parameters on. 45 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 46 | not to also put all buffers on the meta device while initializing. 
47 | 48 | Example: 49 | ```python 50 | import torch.nn as nn 51 | 52 | with init_on_device(device=torch.device("cuda")): 53 | tst = nn.Linear(100, 100) # on `cuda` device 54 | ``` 55 | """ 56 | old_register_parameter = nn.Module.register_parameter 57 | if include_buffers: 58 | old_register_buffer = nn.Module.register_buffer 59 | 60 | def register_empty_parameter(module, name, param): 61 | old_register_parameter(module, name, param) 62 | if param is not None: 63 | param_cls = type(module._parameters[name]) 64 | kwargs = module._parameters[name].__dict__ 65 | module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) 66 | 67 | def register_empty_buffer(module, name, buffer): 68 | old_register_buffer(module, name, buffer) 69 | if buffer is not None: 70 | module._buffers[name] = module._buffers[name].to(device) 71 | if include_buffers: 72 | tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']} 73 | else: 74 | tensor_constructors_to_patch = {} 75 | 76 | def patch_tensor_constructor(fn): 77 | 78 | def wrapper(*args, **kwargs): 79 | kwargs['device'] = device 80 | return fn(*args, **kwargs) 81 | return wrapper 82 | try: 83 | nn.Module.register_parameter = register_empty_parameter 84 | if include_buffers: 85 | nn.Module.register_buffer = register_empty_buffer 86 | for torch_function_name in tensor_constructors_to_patch.keys(): 87 | setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) 88 | yield 89 | finally: 90 | nn.Module.register_parameter = old_register_parameter 91 | if include_buffers: 92 | nn.Module.register_buffer = old_register_buffer 93 | for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items(): 94 | setattr(torch, torch_function_name, old_torch_function) -------------------------------------------------------------------------------- /LLaVA/llava/model/language_model/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def _cast_if_autocast_enabled(tensor): 4 | if torch.is_autocast_enabled(): 5 | if tensor.device.type == 'cuda': 6 | dtype = torch.get_autocast_gpu_dtype() 7 | elif tensor.device.type == 'cpu': 8 | dtype = torch.get_autocast_cpu_dtype() 9 | else: 10 | raise NotImplementedError() 11 | return tensor.to(dtype=dtype) 12 | return tensor 13 | 14 | class LPLayerNorm(torch.nn.LayerNorm): 15 | 16 | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): 17 | super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype) 18 | 19 | def forward(self, x): 20 | module_device = x.device 21 | downcast_x = _cast_if_autocast_enabled(x) 22 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 23 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 24 | with torch.autocast(enabled=False, device_type=module_device.type): 25 | return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) 26 | 27 | def rms_norm(x, weight=None, eps=1e-05): 28 | output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 29 | if weight is not None: 30 | return output * weight 31 | return output 32 | 33 | class RMSNorm(torch.nn.Module): 34 | 35 | def __init__(self, normalized_shape, eps=1e-05, weight=True, 
dtype=None, device=None): 36 | super().__init__() 37 | self.eps = eps 38 | if weight: 39 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) 40 | else: 41 | self.register_parameter('weight', None) 42 | 43 | def forward(self, x): 44 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 45 | 46 | class LPRMSNorm(RMSNorm): 47 | 48 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 49 | super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device) 50 | 51 | def forward(self, x): 52 | downcast_x = _cast_if_autocast_enabled(x) 53 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 54 | with torch.autocast(enabled=False, device_type=x.device.type): 55 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 56 | NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm} -------------------------------------------------------------------------------- /LLaVA/llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, 
args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /LLaVA/llava/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /LLaVA/llava/model/multimodal_projector/__pycache__/builder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/model/multimodal_projector/__pycache__/builder.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 
'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /LLaVA/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /LLaVA/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/serve/__init__.py -------------------------------------------------------------------------------- /LLaVA/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /LLaVA/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /LLaVA/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /LLaVA/llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /LLaVA/llava/train/__pycache__/llava_trainer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/train/__pycache__/llava_trainer.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/train/__pycache__/train.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/LLaVA/llava/train/__pycache__/train.cpython-311.pyc -------------------------------------------------------------------------------- /LLaVA/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from llava.train.train import train 3 | 4 | if __name__ == "__main__": 5 | train(attn_implementation="flash_attention_2") 6 | -------------------------------------------------------------------------------- /LLaVA/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /LLaVA/llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 
63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 
14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, 
eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_vizwiz_for_submission.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /LLaVA/scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, f) 57 | --------------------------------------------------------------------------------
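For orientation, the two converter scripts above share the same data flow: each reads a predictions file with one JSON object per line (the `question_id` and `text` fields written by the LLaVA eval scripts) and writes a single JSON list in the upload format expected by the EvalAI server. The snippet below is a minimal, self-contained sketch of that mapping for the VQAv2 case; the question id and answer text are made-up examples, and a simple normalizer stands in for `EvalAIAnswerProcessor`.

import json

# One prediction line as it appears in merge.jsonl (illustrative values only).
pred = json.loads('{"question_id": 262148000, "text": "Tennis."}')

def normalize(answer: str) -> str:
    # Stand-in for llava.eval.m4c_evaluator.EvalAIAnswerProcessor.
    return answer.rstrip('.').strip().lower()

# Shape of one entry in the JSON list that convert_vqav2_for_submission.py writes for upload.
upload_entry = {'question_id': pred['question_id'], 'answer': normalize(pred['text'])}
print(upload_entry)  # {'question_id': 262148000, 'answer': 'tennis'}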
/LLaVA/scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /LLaVA/scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /LLaVA/scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /LLaVA/scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /LLaVA/scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. 
\ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /LLaVA/scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /LLaVA/scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from llava.model.builder import load_pretrained_model 4 | from llava.mm_utils import get_model_name_from_path 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, dtype='float16', pretrained_rob_path=args.vision_tower, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | parser.add_argument("--vision_tower", type=str, required=True) 20 | 21 | args = parser.parse_args() 22 | 23 | merge_lora(args) 24 | -------------------------------------------------------------------------------- /LLaVA/scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /LLaVA/scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /LLaVA/scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /LLaVA/scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /LLaVA/scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path liuhaotian/llava-v1.5-13b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 
29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir 
./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 
9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 12 | --model-path liuhaotian/llava-v1.5-13b \ 13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 
30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl 39 | 40 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | 
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /LLaVA/scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | module load gcc/11.2.0 4 | export NCCL_P2P_DISABLE=1 5 | export NCCL_IB_DISABLE=1 6 | 7 | deepspeed llava/train/train_mem.py \ 8 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 9 | --deepspeed ./scripts/zero3.json \ 10 | --model_name_or_path liuhaotian/llava-v1.5-7b \ 11 | --version v1 \ 12 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 13 | --image_folder ./playground/data \ 14 | --vision_tower ../checkpoint/ViT-L-14-336_openai_imagenet_l2_imagenet_exp_1_pdAgu/checkpoints/final.pt \ 15 | --mm_projector_type mlp2x_gelu \ 16 | --mm_vision_select_layer -2 \ 17 | --mm_use_im_start_end False \ 18 | --mm_use_im_patch_token False \ 19 | --image_aspect_ratio pad \ 20 | --group_by_modality_length True \ 21 | --bf16 True \ 22 | --output_dir ./checkpoints/llava-v1.5-7b-336-1-lora-l \ 23 | --num_train_epochs 1 \ 24 | --per_device_train_batch_size 1 \ 25 | --per_device_eval_batch_size 4 \ 26 | --gradient_accumulation_steps 1 \ 27 | --evaluation_strategy "no" \ 28 | --save_strategy "steps" \ 29 | --save_steps 50000 \ 30 | --save_total_limit 1 \ 31 | --learning_rate 2e-4 \ 32 | --weight_decay 0. 
\ 33 | --warmup_ratio 0.03 \ 34 | --lr_scheduler_type "cosine" \ 35 | --logging_steps 1 \ 36 | --tf32 True \ 37 | --model_max_length 2048 \ 38 | --gradient_checkpointing True \ 39 | --dataloader_num_workers 4 \ 40 | --lazy_preprocess True \ 41 | --report_to none 42 | -------------------------------------------------------------------------------- /LLaVA/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /LLaVA/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /LLaVA/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } 
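The three DeepSpeed configurations above (`zero2.json`, `zero3.json`, `zero3_offload.json`) differ only in the ZeRO optimization stage and in whether optimizer states and parameters are offloaded to CPU; the training scripts in this repository select one of them through the `--deepspeed` flag. As a quick orientation aid, the following standard-library snippet (assumed to be run from the `LLaVA` directory) prints those differences:

```python
import json

# Summarize how the bundled DeepSpeed ZeRO configs differ (run from the LLaVA/ directory).
for path in ("scripts/zero2.json", "scripts/zero3.json", "scripts/zero3_offload.json"):
    with open(path) as f:
        zero = json.load(f)["zero_optimization"]
    offload = "offload_optimizer" in zero or "offload_param" in zero
    print(f"{path}: stage={zero['stage']}, cpu_offload={offload}")
```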
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kernel-based Unsupervised Embedding Alignment for Enhanced Visual Representation in Vision-language Models 2 | Implementation of the ICML 2025 paper [Kernel-based Unsupervised Embedding Alignment for Enhanced Visual Representation 3 | in Vision-language Models](https://arxiv.org/abs/2506.02557) 4 | by [Shizhan Gong](https://peterant330.github.io/), Yankai Jiang, [Qi Dou](https://www.cse.cuhk.edu.hk/~qdou/), 5 | and [Farzan Farnia](https://www.cse.cuhk.edu.hk/~farnia/) 6 | 7 | 8 | 9 | ## Setup 10 | We recommend installing the environment through conda: 11 | 12 | ``` 13 | cd KUEA 14 | conda create --name myenv python=3.11 15 | conda activate myenv 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | ## Alignment Fine-tuning 20 | Please use the following command for alignment fine-tuning. 21 | 22 | ```commandline 23 | python -m train.align_training_clip --clip_model_name ViT-L-14 --pretrained openai --dataset imagenet 24 | --imagenet_root /path/to/imagenet2012 --template std --output_normalize False --steps 40000 --warmup 2800 25 | --batch_size 64 --loss l2 --loss_clean l2 --opt adamw --lr 1e-5 --wd 1e-4 --inner_loss l2 --wandb False 26 | --output_dir /path/to/checkpoint --clean_weight 1. --penalty_weight 0.5 --kernel_dino polynomial 27 | --kernel_clip polynomial --gamma 0.0032 --coef0 0.191623 --experiment_name exp_1 --log_freq 1 --eval_freq 10 28 | ``` 29 | 30 | `--imagenet_root` should be adjusted to point to the directory of the ImageNet dataset. `--output_dir` specifies the 31 | directory to store the fine-tuned checkpoint. `--gamma` and `--coef0` are the initial parameters used to calculate the 32 | polynomial kernel of CLIP representations. We pre-calculate them by sampling several images from the training data and 33 | minimizing the L2 distance between the kernel matrices of CLIP and DINOv2 (a minimal sketch of this fitting step is shown further below). 34 | 35 | ## Evaluation 36 | We utilize [CLIP-Benchmark](https://github.com/LAION-AI/CLIP_benchmark) for evaluation of the fine-tuned models. 37 | 38 | To evaluate a model, first go to the `CLIP_benchmark` directory: 39 | 40 | ``` 41 | cd CLIP_benchmark 42 | ``` 43 | 44 | Edit the file `benchmark/models.txt` to include the model to evaluate: 45 | 46 | ```commandline 47 | ViT-L-14-336,openai 48 | ViT-L-14-336,directory/to/finetuned/models.pt 49 | ``` 50 | The first element specifies the architecture of the model, and the second element specifies the saved checkpoint. Use 51 | `openai` to evaluate the original CLIP model. Then run the corresponding bash command: 52 | ```commandline 53 | ./bash/run_benchmark_clean.sh # zero-shot classification 54 | ./bash/run_benchmark_lp.sh # linear probing 55 | ./bash/run_benchmark_rt.sh # image-text retrieval 56 | ``` 57 | Please edit the `SAVE_DIR` field of the corresponding files, which specifies the directory to save the evaluation results. 58 | 59 | ## Fine-tuning of LLaVA 60 | The script to fine-tune LLaVA is adapted from [LLaVA](https://github.com/haotian-liu/LLaVA). We use the following 61 | command to perform LoRA fine-tuning: 62 | ```commandline 63 | cd LLaVA 64 | ./scripts/v1_5/finetune_task_lora.sh 65 | ``` 66 | Be sure to edit the `--vision_tower` field of the script to point to the directory of the checkpoints produced by the alignment fine-tuning.
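The `--gamma` and `--coef0` values passed to the alignment command in the "Alignment Fine-tuning" section above are pre-calculated by sampling images and minimizing the L2 distance between the polynomial kernel matrices of CLIP and DINOv2 features. The repository's own pre-computation script is not shown in this dump, so the snippet below is only a minimal sketch of the idea: the random stand-in features, the fixed DINOv2 kernel parameters, the initial guess, and the use of `scipy.optimize.minimize` are all assumptions for illustration (both `scikit-learn` and `scipy` are already listed in `requirements.txt`).

```python
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics.pairwise import polynomial_kernel

# Sketch only: random stand-ins for L2-normalized embeddings of a few sampled training images.
rng = np.random.default_rng(0)
clip_feats = rng.normal(size=(256, 768))   # hypothetical CLIP image embeddings
dino_feats = rng.normal(size=(256, 1024))  # hypothetical DINOv2 embeddings
clip_feats /= np.linalg.norm(clip_feats, axis=1, keepdims=True)
dino_feats /= np.linalg.norm(dino_feats, axis=1, keepdims=True)

# Reference kernel on the DINOv2 side (parameters assumed, not taken from the paper).
k_dino = polynomial_kernel(dino_feats, degree=3, gamma=1.0, coef0=1.0)

def kernel_gap(params):
    """Frobenius (L2) distance between the CLIP and DINOv2 kernel matrices."""
    gamma, coef0 = params
    k_clip = polynomial_kernel(clip_feats, degree=3, gamma=gamma, coef0=coef0)
    return np.linalg.norm(k_clip - k_dino)

# Only two scalars to fit, so a derivative-free method is enough for this sketch.
res = minimize(kernel_gap, x0=np.array([0.01, 0.1]), method="Nelder-Mead")
print("fitted gamma, coef0:", res.x)  # values of this kind are then passed via --gamma / --coef0
```

With real CLIP and DINOv2 features in place of the random arrays, the fitted values play the role of the `--gamma 0.0032 --coef0 0.191623` arguments shown in the alignment command above.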
67 | 68 | ## Evaluation of LLaVA 69 | We utilize the tool provided by the [Prismatic library](https://github.com/TRI-ML/prismatic-vlms) for the evaluation of LLaVA. 70 | 71 | ## Pre-trained checkpoints 72 | The pre-trained checkpoints for the CLIP vision encoder can be downloaded from OneDrive: 73 | 74 | [ViT-L-14-224](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155187960_link_cuhk_edu_hk/Evj3UUqXLpRNjwQ0pQi-NugB7-JuKxU4xxGiqjrBH_MRDA?e=7SNAec) 75 | 76 | [ViT-L-14-336](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155187960_link_cuhk_edu_hk/Eh90Ji9PvF9Hk70NEa0pKcsBReM1UDIVm3fTNUNKB6pngQ?e=57emeO) 77 | 78 | ## Bibtex 79 | If you find this work helpful, you can cite our paper as follows: 80 | ```commandline 81 | @article{gong2025kernel, 82 | title={Kernel-based Unsupervised Embedding Alignment for Enhanced Visual Representation in Vision-language Models}, 83 | author={Gong, Shizhan and Jiang, Yankai and Dou, Qi and Farnia, Farzan}, 84 | journal={arXiv preprint arXiv:2506.02557}, 85 | year={2025} 86 | } 87 | ``` 88 | 89 | ## Contact 90 | For any questions, please contact [szgong22@cse.cuhk.edu.hk](mailto:szgong22@cse.cuhk.edu.hk). 91 | 92 | -------------------------------------------------------------------------------- /asset/method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/asset/method.png -------------------------------------------------------------------------------- /open_flamingo/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anas Awadalla, Irena Gao, Joshua Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Jenia Jitsev, Simon Kornblith, Pang Wei Koh, Gabriel Ilharco, Mitchell Wortsman, Ludwig Schmidt. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
-------------------------------------------------------------------------------- /open_flamingo/README.md: -------------------------------------------------------------------------------- 1 | # OpenFlamingo 2 | - Forked from [OpenFlamingo](https://github.com/mlfoundations/open_flamingo) -------------------------------------------------------------------------------- /open_flamingo/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.flamingo import Flamingo 2 | from .src.factory import create_model_and_transforms 3 | -------------------------------------------------------------------------------- /open_flamingo/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /open_flamingo/eval/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/eval/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/eval/coco_metric.py: -------------------------------------------------------------------------------- 1 | from pycocoevalcap.cider.cider import Cider 2 | from pycocoevalcap.eval import COCOEvalCap 3 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 4 | from pycocotools.coco import COCO 5 | 6 | 7 | def compute_cider( 8 | result_path, 9 | annotations_path, 10 | ): 11 | # create coco object and coco_result object 12 | coco = COCO(annotations_path) 13 | coco_result = coco.loadRes(result_path) 14 | 15 | # create coco_eval object by taking coco and coco_result 16 | coco_eval = COCOEvalCap(coco, coco_result) 17 | coco_eval.params["image_id"] = coco_result.getImgIds() 18 | coco_eval.evaluate() 19 | 20 | return coco_eval.eval 21 | 22 | def compute_cider_all_scores( 23 | result_path, 24 | annotations_path, 25 | return_img_ids=False, 26 | ): 27 | # create coco object and coco_result object 28 | coco = COCO(annotations_path) 29 | coco_result = coco.loadRes(result_path) 30 | 31 | cider_scorer = Cider() 32 | imgIds = coco_result.getImgIds() 33 | gts = {} 34 | res = {} 35 | for imgId in imgIds: 36 | gts[imgId] = coco.imgToAnns[imgId] 37 | res[imgId] = coco_result.imgToAnns[imgId] 38 | tokenizer = PTBTokenizer() 39 | gts = tokenizer.tokenize(gts) 40 | res = tokenizer.tokenize(res) 41 | score, scores = cider_scorer.compute_score(gts, res) 42 | scores *= 100 43 | if return_img_ids: 44 | return scores, imgIds 45 | else: 46 | return scores 47 | 48 | def postprocess_captioning_generation(predictions): 49 | return predictions.split("Output", 1)[0] 50 | 51 | if __name__ == '__main__': 52 | result_path = "/mnt/cschlarmann37/project_multimodal/llava-evals/captions-json/cocoresults_38eb6f53-71e4-469e-a864-cb64b1fdbbf4.json" 53 | annotations_path = "/mnt/datasets/coco/annotations/captions_val2014.json" 54 | print(f"\nresult_path: {result_path}\n") 55 | metrics = compute_cider(result_path, annotations_path) 56 | print(metrics) 57 | 
print(f"CIDER: {metrics['CIDEr']*100}") -------------------------------------------------------------------------------- /open_flamingo/eval/eval_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import argparse 3 | from typing import List 4 | from torch.nn.parallel import DistributedDataParallel as DDP 5 | from PIL import Image 6 | 7 | 8 | class BaseEvalModel(abc.ABC): 9 | """Base class encapsulating functionality needed to evaluate a model.""" 10 | 11 | def __init__(self, args: List[str]): 12 | """Initialize model. 13 | 14 | Args: 15 | args: arguments to model. These should be parsed, or if the model 16 | has no applicable arguments, an error should be thrown if `args` 17 | is non-empty. 18 | """ 19 | 20 | def init_distributed(self): 21 | """Wrap model as DDP.""" 22 | self.model = DDP(self.model, device_ids=[self.device]) 23 | 24 | def set_device(self, device): 25 | """Set device for model.""" 26 | self.device = device 27 | self.model = self.model.to(device) 28 | 29 | def get_outputs( 30 | self, 31 | batch_text: List[str], 32 | batch_images: List[List[Image.Image]], 33 | min_generation_length: int, 34 | max_generation_length: int, 35 | num_beams: int, 36 | length_penalty: float, 37 | ) -> List[str]: 38 | """Get outputs for a batch of images and text. 39 | 40 | Args: 41 | batch_text: list of text strings, with the text "" in place 42 | of any images to be included. 43 | batch_images: images to provide to model. Should be a list of lists, 44 | where each list contains the images for a single example. 45 | max_generation_length: maximum length of the generated caption. 46 | Defaults to 10. 47 | num_beams: number of beams to use for beam search. Defaults to 3. 48 | length_penalty: length penalty for beam search. Defaults to -2.0. 49 | 50 | Returns: 51 | List of decoded output strings. 52 | """ 53 | 54 | def vqa_prompt(self, question, answer=None) -> str: 55 | """Get the prompt to use for VQA evaluation. If the answer is not provided, it should be left blank to be generated by the model. 56 | 57 | Returns: 58 | The prompt to use for VQA. 59 | """ 60 | 61 | def caption_prompt(self, caption=None) -> str: 62 | """Get the prompt to use for caption evaluation. If the caption is not provided, it should be left blank to be generated by the model. 63 | 64 | Returns: 65 | The prompt to use for captioning. 66 | """ 67 | 68 | def classification_prompt(self, class_str=None) -> str: 69 | """Get the prompt to use for classification evaluation. If the class_str is not provided, it should be left blank to be generated by the model. 70 | 71 | Returns: 72 | The prompt to use for classification. 
73 | """ 74 | -------------------------------------------------------------------------------- /open_flamingo/eval/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/eval/models/__init__.py -------------------------------------------------------------------------------- /open_flamingo/eval/models/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/eval/models/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/eval/models/__pycache__/utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/eval/models/__pycache__/utils.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/eval/models/blip.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from PIL import Image 4 | import torch 5 | 6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration 7 | from open_flamingo.eval.eval_model import BaseEvalModel 8 | from open_flamingo.eval.models.utils import unwrap_model 9 | 10 | 11 | class EvalModel(BaseEvalModel): 12 | """BLIP-2 model evaluation. 13 | 14 | Attributes: 15 | model (nn.Module): Underlying Torch model. 16 | tokenizer (transformers.PreTrainedTokenizer): Tokenizer for model. 17 | device: Index of GPU to use, or the string "cpu" 18 | """ 19 | 20 | def __init__(self, model_args): 21 | assert ( 22 | "processor_path" in model_args 23 | and "lm_path" in model_args 24 | and "device" in model_args 25 | ), "BLIP-2 requires processor_path, lm_path, and device arguments to be specified" 26 | 27 | self.device = ( 28 | int(model_args["device"]) 29 | if ("device" in model_args and model_args["device"] >= 0) 30 | else "cpu" 31 | ) 32 | self.processor = Blip2Processor.from_pretrained(model_args["processor_path"]) 33 | self.model = Blip2ForConditionalGeneration.from_pretrained( 34 | model_args["lm_path"] 35 | ) 36 | self.model.to(self.device) 37 | self.model.eval() 38 | self.processor.tokenizer.padding_side = "left" 39 | 40 | def _prepare_images(self, batch: List[List[torch.Tensor]]) -> torch.Tensor: 41 | """Preprocess images and stack them. 42 | 43 | Args: 44 | batch: A list of lists of images. 45 | 46 | Returns: 47 | A Tensor of shape 48 | (batch_size, channels, height, width). 
49 | """ 50 | batch_images = None 51 | assert all( 52 | len(example) == 1 for example in batch 53 | ), "BLIP-2 only supports one image per example" 54 | 55 | for example in batch: 56 | assert len(example) == 1, "BLIP-2 only supports one image per example" 57 | batch_images = torch.cat( 58 | [ 59 | batch_images, 60 | self.processor.image_processor(example, return_tensors="pt")[ 61 | "pixel_values" 62 | ], 63 | ] 64 | if batch_images is not None 65 | else [ 66 | self.processor.image_processor(example, return_tensors="pt")[ 67 | "pixel_values" 68 | ] 69 | ], 70 | dim=0, 71 | ) 72 | return batch_images 73 | 74 | def get_outputs( 75 | self, 76 | batch_text: List[str], 77 | batch_images: List[List[Image.Image]], 78 | max_generation_length: int, 79 | num_beams: int, 80 | length_penalty: float, 81 | ) -> List[str]: 82 | encodings = self.processor.tokenizer( 83 | batch_text, 84 | padding="longest", 85 | truncation=True, 86 | return_tensors="pt", 87 | max_length=2000, 88 | ) 89 | input_ids = encodings["input_ids"] 90 | attention_mask = encodings["attention_mask"] 91 | 92 | with torch.inference_mode(): 93 | outputs = unwrap_model(self.model).generate( 94 | self._prepare_images(batch_images).to(self.device), 95 | input_ids.to(self.device), 96 | attention_mask=attention_mask.to(self.device), 97 | max_new_tokens=max_generation_length, 98 | min_new_tokens=8, 99 | num_beams=num_beams, 100 | length_penalty=length_penalty, 101 | ) 102 | 103 | return self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True) 104 | 105 | def get_vqa_prompt(self, question, answer=None) -> str: 106 | return ( 107 | f"Question:{question} Short answer:{answer if answer is not None else ''}" 108 | ) 109 | 110 | def get_caption_prompt(self, caption=None) -> str: 111 | return f"A photo of {caption if caption is not None else ''}" 112 | 113 | def get_classification_prompt(self, class_str=None) -> str: 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /open_flamingo/eval/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def unwrap_model(model): 5 | """ 6 | Unwrap a model from a DataParallel or DistributedDataParallel wrapper. 
7 | """ 8 | if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): 9 | return model.module 10 | else: 11 | return model 12 | 13 | 14 | def get_label(lang_x, tokenizer, mode='colon'): 15 | eoc_token = '<|endofchunk|>' 16 | media_token = '' 17 | colon_token_id = tokenizer.encode(':')[0] 18 | eoc_token_id = tokenizer.additional_special_tokens_ids[ 19 | tokenizer.additional_special_tokens.index(eoc_token) 20 | ] 21 | media_token_id = tokenizer.additional_special_tokens_ids[ 22 | tokenizer.additional_special_tokens.index(media_token) 23 | ] 24 | label = lang_x.clone() 25 | # compute context len, by getting the index of the last colon token 26 | for idx in range(len(label)): 27 | if mode == 'colon': 28 | # get the last occurence of the ':' token 29 | # get a tensor of True/False values, then use torch.nonzero to get the indices 30 | indices = (label[idx] == colon_token_id).nonzero().flatten() 31 | # Then get the last occurrence 32 | end_of_context = indices[-1].item() + 1 # +1 because we want to include the colon token 33 | elif isinstance(mode, int): 34 | end_of_context = -label[idx].tolist()[::-1].index(media_token_id) - 1 + mode 35 | label[idx, : end_of_context] = -100 36 | label[label == tokenizer.pad_token_id] = -100 37 | label[:, 0] = -100 38 | label[label == media_token_id] = -100 39 | label[label == eoc_token_id] = -100 40 | return label -------------------------------------------------------------------------------- /open_flamingo/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__init__.py -------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/factory.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/factory.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/flamingo.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/flamingo.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/flamingo_lm.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/flamingo_lm.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/helpers.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/helpers.cpython-311.pyc 
-------------------------------------------------------------------------------- /open_flamingo/src/__pycache__/utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/open_flamingo/src/__pycache__/utils.cpython-311.pyc -------------------------------------------------------------------------------- /open_flamingo/src/utils.py: -------------------------------------------------------------------------------- 1 | def extend_instance(obj, mixin): 2 | """Apply mixins to a class instance after creation""" 3 | base_cls = obj.__class__ 4 | base_cls_name = obj.__class__.__name__ 5 | obj.__class__ = type( 6 | base_cls_name, (mixin, base_cls), {} 7 | ) # mixin needs to go first for our forward() logic to work 8 | 9 | 10 | def getattr_recursive(obj, att): 11 | """ 12 | Return nested attribute of obj 13 | Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c 14 | """ 15 | if att == "": 16 | return obj 17 | i = att.find(".") 18 | if i < 0: 19 | return getattr(obj, att) 20 | else: 21 | return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :]) 22 | 23 | 24 | def setattr_recursive(obj, att, val): 25 | """ 26 | Set nested attribute of obj 27 | Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val 28 | """ 29 | if "." in att: 30 | obj = getattr_recursive(obj, ".".join(att.split(".")[:-1])) 31 | setattr(obj, att.split(".")[-1], val) 32 | 33 | 34 | def apply_with_stopping_condition( 35 | module, apply_fn, apply_condition=None, stopping_condition=None, **other_args 36 | ): 37 | if stopping_condition(module): 38 | return 39 | if apply_condition(module): 40 | apply_fn(module, **other_args) 41 | for child in module.children(): 42 | apply_with_stopping_condition( 43 | child, 44 | apply_fn, 45 | apply_condition=apply_condition, 46 | stopping_condition=stopping_condition, 47 | **other_args 48 | ) 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.24.0 2 | aiofiles==22.1.0 3 | aiohttp==3.8.4 4 | aiosignal==1.3.1 5 | aiosqlite==0.19.0 6 | anyio==3.6.2 7 | appdirs==1.4.4 8 | argon2-cffi==21.3.0 9 | argon2-cffi-bindings==21.2.0 10 | arrow==1.2.3 11 | asttokens==2.2.1 12 | async-timeout==4.0.2 13 | attrs==23.1.0 14 | Babel==2.12.1 15 | backcall==0.2.0 16 | beautifulsoup4==4.12.2 17 | bleach==6.0.0 18 | braceexpand==0.1.7 19 | certifi==2023.5.7 20 | cffi==1.15.1 21 | chardet==4.0.0 22 | charset-normalizer==3.1.0 23 | click==8.1.3 24 | cmake==3.26.3 25 | comm==0.1.3 26 | contourpy==1.0.7 27 | cycler==0.11.0 28 | datasets==2.12.0 29 | debugpy==1.6.7 30 | decorator==5.1.1 31 | defusedxml==0.7.1 32 | dill==0.3.6 33 | docker-pycreds==0.4.0 34 | einops==0.6.1 35 | einops-exts==0.0.4 36 | executing==1.2.0 37 | fastjsonschema==2.16.3 38 | filelock==3.12.0 39 | fonttools==4.39.3 40 | fqdn==1.5.1 41 | frozenlist==1.3.3 42 | fsspec==2023.5.0 43 | ftfy==6.1.1 44 | geotorch==0.3.0 45 | gitdb==4.0.10 46 | GitPython==3.1.31 47 | huggingface-hub==0.14.1 48 | idna==2.10 49 | inflection==0.5.1 50 | ipykernel==6.23.0 51 | ipython==8.13.2 52 | ipython-genutils==0.2.0 53 | isoduration==20.11.0 54 | jedi==0.18.2 55 | Jinja2==3.1.2 56 | joblib==1.2.0 57 | json5==0.9.11 58 | jsonpointer==2.3 59 | jsonschema==4.17.3 60 | kiwisolver==1.4.4 61 | lit==16.0.3 62 | MarkupSafe==2.1.2 63 | matplotlib==3.7.1 64 | 
matplotlib-inline==0.1.6 65 | mistune==2.0.5 66 | more-itertools==9.1.0 67 | mpmath==1.3.0 68 | multidict==6.0.4 69 | multiprocess==0.70.14 70 | nbclassic==1.0.0 71 | nbclient==0.7.4 72 | nbconvert==7.4.0 73 | nbformat==5.8.0 74 | nest-asyncio==1.5.6 75 | networkx==3.1 76 | nltk==3.8.1 77 | notebook==6.5.4 78 | notebook_shim==0.2.3 79 | numpy==1.24.2 80 | nvidia-cublas-cu11==11.10.3.66 81 | nvidia-cuda-cupti-cu11==11.7.101 82 | nvidia-cuda-nvrtc-cu11==11.7.99 83 | nvidia-cuda-runtime-cu11==11.7.99 84 | nvidia-cudnn-cu11==8.5.0.96 85 | nvidia-cufft-cu11==10.9.0.58 86 | nvidia-curand-cu11==10.2.10.91 87 | nvidia-cusolver-cu11==11.4.0.1 88 | nvidia-cusparse-cu11==11.7.4.91 89 | nvidia-nccl-cu11==2.14.3 90 | nvidia-nvtx-cu11==11.7.91 91 | open-clip-torch==2.19.0 92 | overrides==7.4.0 93 | packaging==23.1 94 | pandas==1.3.5 95 | pandocfilters==1.5.0 96 | parso==0.8.3 97 | pathtools==0.1.2 98 | pexpect==4.8.0 99 | pickleshare==0.7.5 100 | Pillow==9.5.0 101 | platformdirs==3.5.0 102 | prometheus-client==0.16.0 103 | prompt-toolkit==3.0.38 104 | protobuf==3.20.3 105 | psutil==5.9.5 106 | ptyprocess==0.7.0 107 | pure-eval==0.2.2 108 | pyarrow==12.0.0 109 | pycocoevalcap==1.2 110 | pycocotools==2.0.6 111 | pycparser==2.21 112 | Pygments==2.15.1 113 | pyparsing==3.0.9 114 | pyrsistent==0.19.3 115 | python-dateutil==2.8.2 116 | python-json-logger==2.0.7 117 | pytz==2023.3 118 | PyYAML==6.0 119 | pyzmq==25.0.2 120 | regex==2023.5.5 121 | requests==2.25.1 122 | responses==0.18.0 123 | rfc3339-validator==0.1.4 124 | rfc3986-validator==0.1.1 125 | robustbench @ git+https://github.com/RobustBench/robustbench.git@e67e4225facde47be6a41ed78b576076e8b90cc5 126 | scikit-learn==1.3.2 127 | scipy==1.10.1 128 | Send2Trash==1.8.2 129 | sentencepiece==0.1.98 130 | sentry-sdk==1.22.2 131 | setproctitle==1.3.2 132 | shortuuid==1.0.11 133 | six==1.16.0 134 | smmap==5.0.0 135 | sniffio==1.3.0 136 | soupsieve==2.4.1 137 | stack-data==0.6.2 138 | sympy==1.11.1 139 | terminado==0.17.1 140 | timm==0.6.13 141 | tinycss2==1.2.1 142 | tokenizers==0.13.3 143 | torch==2.0.1 144 | torchdiffeq==0.2.3 145 | torchvision==0.15.2 146 | tornado==6.3.1 147 | tqdm==4.65.0 148 | traitlets==5.9.0 149 | transformers @ git+https://github.com/huggingface/transformers@d3cbc997a231098cca81ac27fd3028a5536abe67 150 | triton==2.0.0 151 | typing_extensions==4.5.0 152 | tzdata==2023.3 153 | uri-template==1.2.0 154 | urllib3==1.26.15 155 | wandb==0.15.2 156 | wcwidth==0.2.6 157 | webcolors==1.13 158 | webdataset==0.2.48 159 | webencodings==0.5.1 160 | websocket-client==1.5.1 161 | xxhash==3.2.0 162 | y-py==0.5.9 163 | yarl==1.9.2 164 | ypy-websocket==0.8.2 -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterant330/KUEA/277499e4918a4c4a7b0261e6b51255b0b9a138ca/train/__init__.py -------------------------------------------------------------------------------- /train/datasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from PIL import Image 4 | from torch.utils.data import Dataset 5 | from torchvision.datasets import ImageFolder 6 | 7 | 8 | class COCOFlickrDataset(Dataset): 9 | def __init__( 10 | self, 11 | image_dir_path, 12 | annotations_path, 13 | transform=None, 14 | is_flickr=False, 15 | prefix=None, 16 | ): 17 | self.image_dir_path = image_dir_path 18 | self.annotations = 
json.load(open(annotations_path))["annotations"] 19 | self.is_flickr = is_flickr 20 | self.transform = transform 21 | self.prefix = prefix 22 | 23 | def __len__(self): 24 | return len(self.annotations) 25 | 26 | def get_img_path(self, idx): 27 | if self.is_flickr: 28 | return f"{self.image_dir_path}/{self.annotations[idx]['image_id']}.jpg" 29 | else: 30 | return f"{self.image_dir_path}/{self.prefix}{self.annotations[idx]['image_id']:012d}.jpg" 31 | 32 | def __getitem__(self, idx): 33 | image = Image.open(self.get_img_path(idx)) 34 | caption = self.annotations[idx]["caption"] 35 | return self.transform(image), caption 36 | 37 | 38 | class ImageNetDataset(ImageFolder): 39 | """Class to represent the ImageNet1k dataset.""" 40 | 41 | def __init__(self, root, **kwargs): 42 | super().__init__(root=root, **kwargs) 43 | 44 | def __getitem__(self, idx): 45 | sample, target = super().__getitem__(idx) 46 | # target_label = IMAGENET_1K_CLASS_ID_TO_LABEL[target] 47 | return sample, target 48 | -------------------------------------------------------------------------------- /train/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import wandb 4 | from time import sleep 5 | import os 6 | 7 | def init_wandb(project_name, model_name, config, **wandb_kwargs): 8 | os.environ['WANDB__SERVICE_WAIT'] = '300' 9 | while True: 10 | try: 11 | wandb_run = wandb.init( 12 | project=project_name, name=model_name, save_code=True, 13 | config=config, **wandb_kwargs, 14 | ) 15 | break 16 | except Exception as e: 17 | print('wandb connection error', file=sys.stderr) 18 | print(f'error: {e}', file=sys.stderr) 19 | sleep(1) 20 | print('retrying..', file=sys.stderr) 21 | return wandb_run 22 | 23 | def str2bool(v): 24 | if isinstance(v, bool): 25 | return v 26 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 27 | return True 28 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 29 | return False 30 | else: 31 | raise ValueError 32 | 33 | class AverageMeter(object): 34 | """Computes and stores the average and current value""" 35 | def __init__(self, name, fmt=':f'): 36 | self.name = name 37 | self.fmt = fmt 38 | self.reset() 39 | 40 | def reset(self): 41 | self.val = 0 42 | self.avg = 0 43 | self.sum = 0 44 | self.count = 0 45 | 46 | def update(self, val, n=1): 47 | self.val = val 48 | self.sum += val * n 49 | self.count += n 50 | self.avg = self.sum / self.count 51 | 52 | def __str__(self): 53 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 54 | return fmtstr.format(**self.__dict__) --------------------------------------------------------------------------------
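To make the role of the training helpers above concrete, here is a minimal, hypothetical loop that wires `train/datasets.py` and `train/utils.py` together. The ImageNet path, the use of `open_clip` for a frozen encoder, and the feature-norm statistic are assumptions for illustration only; the repository's actual entry point is `train.align_training_clip`, as described in the README.

```python
import torch
from torch.utils.data import DataLoader

import open_clip
from train.datasets import ImageNetDataset
from train.utils import AverageMeter

# Hypothetical path; point this at your local ImageNet validation split.
imagenet_val_dir = "/path/to/imagenet2012/val"

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("ViT-L-14", pretrained="openai")
model = model.to(device).eval()

dataset = ImageNetDataset(root=imagenet_val_dir, transform=preprocess)
loader = DataLoader(dataset, batch_size=64, num_workers=4)

# Track the mean CLIP feature norm over the dataset with the AverageMeter defined above.
feat_norm = AverageMeter("feature-norm", ":.3f")
with torch.no_grad():
    for images, _ in loader:
        feats = model.encode_image(images.to(device))
        feat_norm.update(feats.norm(dim=-1).mean().item(), n=images.size(0))

print(feat_norm)  # formatted as "feature-norm <last value> (<running average>)"
```

The same `AverageMeter` pattern is what a training loop would typically use to log per-batch losses.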