├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── chat.py ├── imgs ├── blackpink.jpg ├── camera_lens.jpg ├── car_speed.jpg ├── dog_with_horn.jpg ├── example1.jpg ├── example2.jpg ├── fig_overview.jpg ├── jackma.jpg ├── obama.jpg ├── stand_higher.jpg ├── table1.jpg ├── teaser.jpg ├── trump.jpg └── wash_hands.jpg ├── merge_lora_weights_and_save_hf_model.py ├── model ├── LISA.py ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ └── utils.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py │ └── utils.py └── segment_anything │ ├── __init__.py │ ├── automatic_mask_generator.py │ ├── build_sam.py │ ├── modeling │ ├── __init__.py │ ├── common.py │ ├── image_encoder.py │ ├── mask_decoder.py │ ├── prompt_encoder.py │ ├── sam.py │ └── transformer.py │ ├── predictor.py │ └── utils │ ├── __init__.py │ ├── amg.py │ ├── onnx.py │ └── transforms.py ├── requirements.txt ├── train_ds.py ├── utils ├── ade20k_classes.json ├── cocostuff_classes.txt ├── conversation.py ├── data_processing.py ├── dataset.py ├── grefcoco.py ├── grefer.py ├── reason_seg_dataset.py ├── refer.py ├── refer_seg_dataset.py ├── sem_seg_dataset.py ├── utils.py └── vqa_dataset.py └── vis_output ├── blackpink.jpg ├── camera_lens.jpg ├── dog_with_horn.jpg ├── example1_mask_0.jpg ├── example1_masked_img_0.jpg ├── example2_mask_0.jpg ├── example2_masked_img_0.jpg ├── jackma.jpg ├── obama.jpg ├── stand_higher.jpg ├── trump.jpg └── wash_hands.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | runs/ 3 | .vscode/ 4 | 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/README.md -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/app.py -------------------------------------------------------------------------------- /chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/chat.py -------------------------------------------------------------------------------- /imgs/blackpink.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/blackpink.jpg -------------------------------------------------------------------------------- /imgs/camera_lens.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/camera_lens.jpg -------------------------------------------------------------------------------- /imgs/car_speed.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/car_speed.jpg -------------------------------------------------------------------------------- /imgs/dog_with_horn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/dog_with_horn.jpg -------------------------------------------------------------------------------- /imgs/example1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/example1.jpg -------------------------------------------------------------------------------- /imgs/example2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/example2.jpg -------------------------------------------------------------------------------- /imgs/fig_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/fig_overview.jpg -------------------------------------------------------------------------------- /imgs/jackma.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/jackma.jpg -------------------------------------------------------------------------------- /imgs/obama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/obama.jpg -------------------------------------------------------------------------------- /imgs/stand_higher.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/stand_higher.jpg -------------------------------------------------------------------------------- /imgs/table1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/table1.jpg -------------------------------------------------------------------------------- /imgs/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/teaser.jpg -------------------------------------------------------------------------------- /imgs/trump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/trump.jpg -------------------------------------------------------------------------------- /imgs/wash_hands.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/imgs/wash_hands.jpg -------------------------------------------------------------------------------- /merge_lora_weights_and_save_hf_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/merge_lora_weights_and_save_hf_model.py -------------------------------------------------------------------------------- /model/LISA.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/LISA.py -------------------------------------------------------------------------------- /model/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /model/llava/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/constants.py -------------------------------------------------------------------------------- /model/llava/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/conversation.py -------------------------------------------------------------------------------- /model/llava/mm_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/mm_utils.py -------------------------------------------------------------------------------- /model/llava/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/__init__.py -------------------------------------------------------------------------------- /model/llava/model/apply_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/apply_delta.py -------------------------------------------------------------------------------- /model/llava/model/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/builder.py -------------------------------------------------------------------------------- /model/llava/model/consolidate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/consolidate.py -------------------------------------------------------------------------------- /model/llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/llava_llama.py -------------------------------------------------------------------------------- /model/llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/llava_mpt.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/adapt_tokenizer.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/attention.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/blocks.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/configuration_mpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/configuration_mpt.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/custom_embedding.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/flash_attn_triton.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/flash_attn_triton.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/hf_prefixlm_converter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/hf_prefixlm_converter.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/meta_init_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/meta_init_context.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/modeling_mpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/modeling_mpt.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/norm.py -------------------------------------------------------------------------------- /model/llava/model/language_model/mpt/param_init_fns.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/language_model/mpt/param_init_fns.py -------------------------------------------------------------------------------- /model/llava/model/llava_arch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/llava_arch.py -------------------------------------------------------------------------------- /model/llava/model/make_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/make_delta.py -------------------------------------------------------------------------------- /model/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/multimodal_encoder/builder.py -------------------------------------------------------------------------------- /model/llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/multimodal_encoder/clip_encoder.py -------------------------------------------------------------------------------- /model/llava/model/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/model/utils.py -------------------------------------------------------------------------------- /model/llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/train/llama_flash_attn_monkey_patch.py -------------------------------------------------------------------------------- /model/llava/train/llava_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/train/llava_trainer.py -------------------------------------------------------------------------------- /model/llava/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/train/train.py -------------------------------------------------------------------------------- /model/llava/train/train_mem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/train/train_mem.py -------------------------------------------------------------------------------- /model/llava/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/llava/utils.py -------------------------------------------------------------------------------- /model/segment_anything/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/__init__.py -------------------------------------------------------------------------------- /model/segment_anything/automatic_mask_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/automatic_mask_generator.py -------------------------------------------------------------------------------- /model/segment_anything/build_sam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/build_sam.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/__init__.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/common.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/image_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/image_encoder.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/mask_decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/mask_decoder.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/prompt_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/prompt_encoder.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/sam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/sam.py -------------------------------------------------------------------------------- /model/segment_anything/modeling/transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/modeling/transformer.py -------------------------------------------------------------------------------- /model/segment_anything/predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/predictor.py -------------------------------------------------------------------------------- /model/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/utils/__init__.py -------------------------------------------------------------------------------- /model/segment_anything/utils/amg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/utils/amg.py -------------------------------------------------------------------------------- /model/segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/utils/onnx.py -------------------------------------------------------------------------------- /model/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/model/segment_anything/utils/transforms.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/requirements.txt -------------------------------------------------------------------------------- /train_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/train_ds.py -------------------------------------------------------------------------------- /utils/ade20k_classes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/ade20k_classes.json -------------------------------------------------------------------------------- /utils/cocostuff_classes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/cocostuff_classes.txt -------------------------------------------------------------------------------- /utils/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/conversation.py -------------------------------------------------------------------------------- /utils/data_processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/data_processing.py -------------------------------------------------------------------------------- /utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/dataset.py -------------------------------------------------------------------------------- /utils/grefcoco.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/grefcoco.py -------------------------------------------------------------------------------- /utils/grefer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/grefer.py -------------------------------------------------------------------------------- /utils/reason_seg_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/reason_seg_dataset.py -------------------------------------------------------------------------------- /utils/refer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/refer.py -------------------------------------------------------------------------------- /utils/refer_seg_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/refer_seg_dataset.py -------------------------------------------------------------------------------- /utils/sem_seg_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/sem_seg_dataset.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/utils.py -------------------------------------------------------------------------------- /utils/vqa_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/utils/vqa_dataset.py -------------------------------------------------------------------------------- /vis_output/blackpink.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/blackpink.jpg -------------------------------------------------------------------------------- /vis_output/camera_lens.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/camera_lens.jpg -------------------------------------------------------------------------------- /vis_output/dog_with_horn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/dog_with_horn.jpg -------------------------------------------------------------------------------- /vis_output/example1_mask_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/example1_mask_0.jpg -------------------------------------------------------------------------------- /vis_output/example1_masked_img_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/example1_masked_img_0.jpg -------------------------------------------------------------------------------- /vis_output/example2_mask_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/example2_mask_0.jpg -------------------------------------------------------------------------------- /vis_output/example2_masked_img_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/example2_masked_img_0.jpg -------------------------------------------------------------------------------- /vis_output/jackma.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/jackma.jpg -------------------------------------------------------------------------------- /vis_output/obama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/obama.jpg -------------------------------------------------------------------------------- /vis_output/stand_higher.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/stand_higher.jpg -------------------------------------------------------------------------------- /vis_output/trump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/trump.jpg -------------------------------------------------------------------------------- /vis_output/wash_hands.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/LISA/HEAD/vis_output/wash_hands.jpg --------------------------------------------------------------------------------