├── Dataset.md ├── README.md ├── RUN_VideoGLaMM.md ├── Training.md ├── VideoGLaMM ├── .DS_Store ├── .gitignore ├── LICENSE ├── chat.py ├── eval_anet_entities_infer.py ├── eval_gcg_infer.py ├── eval_gcg_metrics.py ├── eval_grounding.py ├── eval_mevis.py ├── eval_referdavis_infer.py ├── eval_referdavis_metrics.py ├── gcg_data_gen │ ├── .DS_Store │ ├── anet_entities_gcg │ │ ├── 1_dev_anet_entities_for_gcg.py │ │ ├── 2_anet_entities_gcg_openai_refine.py │ │ └── 3_anet_entities_gcg_extract_masks.py │ ├── burst_ytvis_gcg │ │ ├── README.md │ │ ├── generate_annotations.py │ │ ├── generation.py │ │ ├── merge_b_y.py │ │ └── requirements.txt │ ├── dev_dataset_visualize.py │ ├── hcstvg_gcg │ │ ├── dev_hcstvg_2_gcg_captions.py │ │ └── dev_hcstvg_2_mask_gen.py │ ├── mevis_gcg │ │ └── dev_mevis_gcg.py │ ├── vidstg_gcg │ │ ├── dev_vidstg_gcg_captions.py │ │ └── dev_vidstg_gcg_mask_gen.py │ └── ytvos_gcg │ │ └── dev_ytvos_gcg.py ├── model │ ├── .DS_Store │ ├── VideoGLaMM.py │ ├── chatunivi │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── conversation.py │ │ ├── mm_utils.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── arch.py │ │ │ ├── builder.py │ │ │ ├── cluster.py │ │ │ ├── language_model │ │ │ │ └── llama.py │ │ │ └── multimodal_encoder │ │ │ │ ├── builder.py │ │ │ │ └── clip_encoder.py │ │ └── utils.py │ ├── llava │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── conversation.py │ │ ├── mm_utils.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── apply_delta.py │ │ │ ├── builder.py │ │ │ ├── consolidate.py │ │ │ ├── language_model │ │ │ │ ├── llava_llama.py │ │ │ │ ├── llava_mpt.py │ │ │ │ └── mpt │ │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── blocks.py │ │ │ │ │ ├── configuration_mpt.py │ │ │ │ │ ├── custom_embedding.py │ │ │ │ │ ├── flash_attn_triton.py │ │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ │ ├── meta_init_context.py │ │ │ │ │ ├── modeling_mpt.py │ │ │ │ │ ├── norm.py │ │ │ │ │ └── param_init_fns.py │ │ │ ├── llava_arch.py │ │ │ ├── make_delta.py │ │ │ ├── multimodal_encoder │ │ │ │ ├── builder.py │ │ │ │ └── clip_encoder.py │ │ │ └── utils.py │ │ ├── train │ │ │ ├── llama_flash_attn_monkey_patch.py │ │ │ ├── llava_trainer.py │ │ │ ├── train.py │ │ │ └── train_mem.py │ │ └── utils.py │ ├── segment_anything │ │ ├── __init__.py │ │ ├── automatic_mask_generator.py │ │ ├── build_sam.py │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── image_encoder.py │ │ │ ├── mask_decoder.py │ │ │ ├── prompt_encoder.py │ │ │ ├── sam.py │ │ │ └── transformer.py │ │ ├── predictor.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── amg.py │ │ │ ├── onnx.py │ │ │ └── transforms.py │ ├── segment_anything_2 │ │ ├── sam2 │ │ │ ├── __init__.py │ │ │ ├── automatic_mask_generator.py │ │ │ ├── build_sam.py │ │ │ ├── csrc │ │ │ │ └── connected_components.cu │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── backbones │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── hieradet.py │ │ │ │ │ ├── image_encoder.py │ │ │ │ │ └── utils.py │ │ │ │ ├── memory_attention.py │ │ │ │ ├── memory_encoder.py │ │ │ │ ├── position_encoding.py │ │ │ │ ├── sam │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── mask_decoder.py │ │ │ │ │ ├── prompt_encoder.py │ │ │ │ │ └── transformer.py │ │ │ │ ├── sam2_base.py │ │ │ │ └── sam2_utils.py │ │ │ ├── sam2_image_predictor.py │ │ │ ├── sam2_video_predictor.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── amg.py │ │ │ │ ├── misc.py │ │ │ │ └── transforms.py │ │ ├── sam2_configs │ │ │ ├── __init__.py │ │ │ ├── sam2_hiera_b+.yaml │ │ │ ├── sam2_hiera_l.yaml │ │ │ ├── 
sam2_hiera_s.yaml │ │ │ └── sam2_hiera_t.yaml │ │ └── setup.py │ └── videogpt_plus │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── conversation.py │ │ ├── mm_utils.py │ │ └── model │ │ ├── __init__.py │ │ ├── arch.py │ │ ├── builder.py │ │ ├── dataloader.py │ │ ├── internvideo │ │ ├── build_internvideo.py │ │ ├── config.py │ │ ├── easydict.py │ │ ├── flash_attention_class.py │ │ ├── internvideo2.py │ │ ├── internvideo2_stage2_config_vision.py │ │ ├── pos_embed.py │ │ └── utils.py │ │ ├── language_model │ │ ├── llama3_1.py │ │ └── phi3.py │ │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ └── processor.py │ │ └── multimodal_projector │ │ └── builder.py ├── requirements.txt ├── train_ds_with_videogptplus.py └── utils │ ├── .DS_Store │ ├── __init__.py │ ├── ade20k_classes.json │ ├── clair.py │ ├── cocostuff_classes.txt │ ├── conv_generator.py │ ├── conversation.py │ ├── data_processing.py │ ├── dataset.py │ ├── enc_preprocessors.py │ ├── grandf_dataset.py │ ├── grefcoco.py │ ├── grefer.py │ ├── grounded_video_qa.py │ ├── grounding_utils │ ├── __init__.py │ ├── box_ops.py │ ├── image_transforms.py │ └── misc.py │ ├── hcstvg_dataset.py │ ├── itm_transforms.py │ ├── mevis_dataset.py │ ├── mevis_gcg.py │ ├── misc.py │ ├── ordered_datasets │ ├── ordered_mevis.py │ └── ordered_rvos.py │ ├── preproc_hcstvgv2.py │ ├── preproc_vidstg.py │ ├── reason_seg_dataset.py │ ├── refer.py │ ├── refer_datasets │ ├── __init__.py │ ├── a2d.py │ ├── box_ops.py │ ├── davis.py │ ├── jhmdb.py │ ├── mevis.py │ ├── new │ │ ├── davis17.py │ │ └── ytvos.py │ ├── transforms.py │ └── ytvos.py │ ├── refer_seg_dataset.py │ ├── refer_vos_dataset.py │ ├── sam_transforms.py │ ├── sem_seg_dataset.py │ ├── temporal_grounding_datasets.py │ ├── trainer.py │ ├── utils.py │ ├── video_gcg_anet.py │ ├── video_gcg_dataset.py │ ├── video_vqa_dataset.py │ ├── vidstg_dataset.py │ ├── vidstg_hcstvg_gcg.py │ ├── vqa_dataset.py │ └── ytvos_gcg.py └── docs └── images ├── .DS_Store ├── figures ├── cvpr25-teaser.png ├── cvpr25_main_block_diagram-jpg.jpg ├── cvpr25_qualitative.png └── videoglamm_annotation_pipeline.png └── logos ├── IVAL_logo.png ├── MBZUAI_logo.png ├── Oryx_logo.png └── logo-videoglamm.png /Dataset.md: -------------------------------------------------------------------------------- 1 | ### **Datasets used for training VideoGLaMM** 2 | 3 | - **LISA datasets**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/Ed6NO_HzOtxHuLUtwU5llQoBNTKW-hWsat_ADhMPBhdrVA?e=eVFlLu) 4 | - **GranDf dataset**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EX4whuRa1NdEihJGAIL1j0MBvF8xvx22tX9D3g3lNhW_VQ?e=Y23PDA) 5 | - **Video datasets**: 6 | - **ActivityNet**: [Link](https://drive.google.com/file/d/1qW5bLQtOienpMjO7vkaghA3pCPXc1qj9/view?usp=sharing) 7 | - **ActivityNet Captions**: [Link](https://drive.google.com/file/d/13tlKsXTA7YLwN62h_OjxmNulWBjbweNF/view?usp=sharing) 8 | - **ActivityNet Entities**: [Link](https://drive.google.com/file/d/13uYeYjXNW9mvsLpuLcNCpp2Zle-HuaUS/view?usp=sharing) 9 | - **BURST**: [Link](https://drive.google.com/file/d/119syWknOhxX9HGedkerk9kQo6MHBBQVQ/view?usp=sharing) 10 | - **HC-STVG**: [Link](https://drive.google.com/file/d/1pzK3aP4bMfpUA1dzSXC9GCxypHgZA1aL/view?usp=sharing) 11 | - **MeViS**: [Link](https://drive.google.com/file/d/1uuE2IcD4UGpkFdD2MWrIlVVN48PVoXRT/view?usp=sharing) 12 | - **Processed**: [Link](https://drive.google.com/file/d/1Z16c1WgmoqsUa557ILIG2QBy0hit5Nhr/view?usp=sharing) 13 | - ActivityNet 
Entities 14 | - HC-STVG 15 | - Referring DAVIS 16 | - VideoInstruct100K 17 | - VidSTG 18 | - **Refer DAVIS**: [Link](https://drive.google.com/file/d/1B4uHyt3_KZIFs9bQobowkPg1y0tG0IuO/view?usp=sharing) 19 | - DAVIS 16 20 | - DAVIS 17 21 | - **Refer YouTube-VOS**: [Link](https://drive.google.com/file/d/1zApsra2fqGX8b3diSvhIfe7bBjZW9tJI/view?usp=sharing) 22 | - **VideoInstruct100K**: [Link](https://drive.google.com/file/d/1l6XKWbX40tGIG1K05iBW8q4QFfDA-2_1/view?usp=sharing) 23 | - **VidSTG**: [Link](https://drive.google.com/file/d/12INPWw_FAQcXkeIgGdm61vJ35tkJeGAF/view?usp=sharing) 24 | - **YTVIS**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EQIEnJmu1yhIhjkwRD8fVcYBRHaQFI9CmZDOoLRkmCXOBw?e=cbDr28) 25 | 26 | - **GCG Datasets**: 27 | - **ActivityNet Entities GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EaG0sNQ--y1CjVf7WeYdahEBy2l6LQOvo_shVZqY22YRHg?e=r9itQ5) 28 | - **Burst-YTVIS GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EThjSLl_aMhIka6S1KxiqEEBf9rUCKNbX9LVyg60rw6Urg?e=wyhMsC) 29 | - **Refer-YTVOS GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EYy0FJi1PCxBiyudE-z9M6UBu-Ceae-mpjQ8w7aQ7c6KAA?e=VKUcoP) 30 | - **VidSTG GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EQhWEdhvCX1OkSSYPcnX9KsBrlw1AeTSffUtiD8K7wsc8w?e=FIEqEA) 31 | - **HC-STVG GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EaVtsayKs9ZIg83K36F9YC8B-7HiPa-SW3AXDT3-28m_Zw?e=H3dWQK) 32 | - **MeViS GCG**: [Link](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/shehan_munasinghe_mbzuai_ac_ae/EcMipuuIMx9AofwShTCzAB8BVtLiRDJoFjPDTNnY48gv8Q?e=6tTTBx) 33 | 34 | 35 | ### **File Structure** 36 | 37 | Extract **LISA dataset** and **GranDf dataset** under `./dataset`: 38 | 39 | dataset/ 40 | ├── ade20k 41 | ├── coco 42 | ├── cocostuff 43 | ├── grandf_dataset 44 | ├── llava_dataset 45 | ├── mapillary 46 | ├── other 47 | ├── reason_seg 48 | ├── refer_seg 49 | └── vlpart 50 | 51 | Extract **other datasets** under `./video_dataset`: 52 | 53 | video_dataset/ 54 | ├── activitynet 55 | ├── activitynet_captions 56 | ├── activitynet_entities 57 | ├── activitynet_entities_gcg 58 | ├── burst 59 | ├── hcstvg 60 | ├── hcstvg_gcg 61 | ├── mevis 62 | ├── mevis_gcg 63 | ├── processed 64 | ├── refer_davis 65 | ├── refer_youtube_vos 66 | ├── video_gcg 67 | ├── video_instruct_100k 68 | ├── vidstg 69 | ├── vidstg_gcg 70 | ├── ytvis 71 | └── ytvos_gcg 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in Videos [CVPR 2025🔥] 2 | ![](https://i.imgur.com/waxVImv.png) 3 | 4 | [Shehan Munasinghe](https://github.com/shehanmunasinghe) , [Hanan Gani](https://github.com/hananshafi) , [Wenqi Zhu](#) , [Jiale Cao](https://jialecao001.github.io/), [Eric Xing](https://www.cs.cmu.edu/~epxing/), [Fahad Shahbaz Khan](https://scholar.google.es/citations?user=zvaeYnUAAAAJ&hl=en). 
[Salman Khan](https://salman-h-khan.github.io/), 5 | 6 | **Mohamed bin Zayed University of Artificial Intelligence, Tianjin University, 7 | Linköping University, Australian National University, Carnegie Mellon University** 8 | 9 | [![Website](https://img.shields.io/badge/Project-Website-87CEEB)](https://mbzuai-oryx.github.io/VideoGLaMM/) 10 | [![paper](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2411.04923) 11 | 12 | --- 13 | 14 | ## 📢 Latest Updates 15 | 16 | - **Feb-2025:** Video-GLaMM is accepted at CVPR 2025! 🎊🎊 17 | 18 | --- 19 | 20 | ## Overview 21 | 22 |

23 | *Figure: VideoGLaMM Architectural Overview* 24 |

25 | 26 | VideoGLaMM is a large multimodal video model capable of pixel-level visual grounding. The model responds to natural language queries from the user and intertwines spatio-temporal object masks in its generated textual responses to provide a detailed understanding of video content. VideoGLaMM seamlessly connects three key components: a Large Language Model (LLM); dual vision encoders; and a spatio-temporal pixel decoder. The dual vision encoders extract spatial and temporal features separately, which are jointly passed to the LLM to output responses rich in both spatial and temporal cues. This is facilitated by end-to-end training on our proposed benchmark Grounded Conversation Generation (GCG) dataset, featuring 38k video-QA triplets with 83k objects and 671k fine-grained masks. 27 | 28 | --- 29 | ## 🏆 Highlights 30 | 1. We introduce the Video Grounded Large Multi-modal Model (VideoGLaMM), a large multimodal video model capable of pixel-level visual grounding, featuring an end-to-end alignment mechanism. 31 | 32 | 2. To achieve fine-grained spatio-temporal alignment, we introduce a benchmark Grounded Conversation Generation (GCG) dataset consisting of 38k grounded video-QA triplets, 83k objects, and roughly 671k fine-grained spatio-temporal masks. 33 | 34 | 3. We evaluate VideoGLaMM across diverse tasks spanning grounded conversation generation, visual grounding, and referring video segmentation, where it achieves state-of-the-art performance. 35 | 36 | --- 37 | 38 | ## Architecture 39 | 40 |

41 | *Figure: VideoGLaMM Architecture* 42 |

43 | 44 | VideoGLaMM consists of the following key components: (i) Spatio-Temporal Dual Encoder, (ii) Dual Alignment V-L Adapters for image and video features, (iii) Large Language Model (LLM), (iv) L-V Adapter, and (v) Promptable Pixel Decoder. 45 | 46 | --- 47 | ## Benchmark and Annotation Pipeline 48 | 49 |

50 | *Figure: Annotation Pipeline* 51 |

52 | 53 | We propose a semi-automatic annotation pipeline for creating a grounded conversation generation (GCG) dataset for videos. 54 | 55 | 56 | --- 57 | ## Examples 🔍 58 | 59 | Given user queries, VideoGLaMM generates textual responses and grounds objects and phrases using pixel-level masks, showing its detailed understanding of the video. 60 | 61 |

62 | *Figure: VideoGLaMM qualitative examples* 63 |

64 | 65 | --- 66 | 67 | ## Running VideoGLaMM 🔧 68 | 69 | ### Environment setup 70 | 71 | conda create --name=videoglamm python=3.11 72 | 73 | conda activate videoglamm 74 | 75 | pip install torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121 76 | pip install transformers==4.41.0 77 | DS_BUILD_FUSED_ADAM=1 pip install deepspeed==0.14.0 78 | 79 | pip install -r VideoGLaMM/requirements_sam2_cluster.txt 80 | 81 | cd VideoGLaMM/model/segment_anything_2 82 | python setup.py build_ext --inplace 83 | cd ../../.. 84 | 85 | ### Training and Evaluation 86 | 87 | Please refer [here](RUN_VideoGLaMM.md) for instructions 88 | 89 | 90 | ## Citation 📜 91 | 92 | ```bibtex 93 | @article{munasinghe2024videoglamm, 94 | title={VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in Videos}, 95 | author={Shehan Munasinghe and Hanan Gani and Wenqi Zhu and Jiale Cao and Eric Xing and Fahad Khan and Salman Khan}, 96 | journal={ArXiv}, 97 | year={2024}, 98 | url={https://arxiv.org/abs/2411.04923} 99 | } 100 | ``` 101 | 102 | --- 103 | 104 | [](https://www.ival-mbzuai.com) 105 | [](https://github.com/mbzuai-oryx) 106 | [](https://mbzuai.ac.ae) -------------------------------------------------------------------------------- /RUN_VideoGLaMM.md: -------------------------------------------------------------------------------- 1 | # Checkpoints 2 | 3 | * Download SAM2 checkpoints from [here](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt) 4 | 5 | * Download InternVideo2 checkpoints from [here](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_1B-224p-f4) 6 | 7 | * Download VideoGLaMM checkpoints from [here](https://mbzuaiac-my.sharepoint.com/:f:/g/personal/shehan_munasinghe_mbzuai_ac_ae/Etucj3LuqdRDocrle_8eJbcB8C11u-020AX7fwIYWJh-dg?e=uPanYM) 8 | 9 | # Command Line Demo 10 | 11 | python chat.py \ 12 | --llava_version_or_path="" \ 13 | --use_sam2_video_branch \ 14 | --base_model_type="vgpt|phi3" 15 | 16 | # Evaluation 17 | 18 | 19 | ## GCG Task 20 | 21 | python eval_gcg_infer.py \ 22 | --llava_version_or_path="" \ 23 | --use_sam2_video_branch \ 24 | --base_model_type="vgpt|phi3" \ 25 | --dataset_name='video_gcg'\ 26 | --vis_save_path="./vis_output_path" 27 | 28 | export OPENAI_API_KEY='' 29 | 30 | python eval_gcg_metrics.py \ 31 | --vis_save_path="" \ 32 | --eval_miou --eval_recall --eval_caption --use_clair 33 | 34 | ## MeViS 35 | 36 | python eval_mevis.py \ 37 | --llava_version_or_path="" \ 38 | --use_sam2_video_branch \ 39 | --base_model_type="vgpt|phi3" \ 40 | --dataset_name="MEVIS|valid"\ 41 | --vis_save_path="./vis_output_path" 42 | 43 | You can use following command to prepare .zip submission file 44 | 45 | cd [vis_output_path] 46 | zip -r ../mevis_out.zip * 47 | 48 | ## VidSTG 49 | 50 | python eval_grounding.py \ 51 | --llava_version_or_path="" \ 52 | --use_sam2_video_branch \ 53 | --base_model_type="vgpt|phi3" \ 54 | --dataset_name="vidstg"\ 55 | --vis_save_path="./vis_output_path" 56 | 57 | 58 | ## HCSTVG 59 | 60 | 61 | python eval_grounding.py \ 62 | --llava_version_or_path="" \ 63 | --use_sam2_video_branch \ 64 | --base_model_type="vgpt|phi3" \ 65 | --dataset_name="hcstvg"\ 66 | --vis_save_path="./vis_output_path" 67 | 68 | ## ReferYTVOS 69 | 70 | python eval_mevis.py \ 71 | --llava_version_or_path="" \ 72 | --use_sam2_video_branch \ 73 | --base_model_type="vgpt|phi3" \ 74 | --dataset_name="ReferYouTubeVOS|valid" \ 75 | --vis_save_path="./vis_output_path" 76 | 77 | 78 | ## ReferDAVIS17 79 | 80 | 81 | python 
eval_referdavis_infer.py \ 82 | --llava_version_or_path="" \ 83 | --use_sam2_video_branch \ 84 | --base_model_type="vgpt|phi3" \ 85 | --dataset_name="ReferDAVIS|valid" \ 86 | --vis_save_path="./vis_output_path" 87 | 88 | python eval_referdavis_metrics.py --output_dir \ 89 | "./vis_output_path" -------------------------------------------------------------------------------- /Training.md: -------------------------------------------------------------------------------- 1 | # Training Instructions 2 | 3 | 4 | * Initial training 5 | 6 | 7 | deepspeed --master_port=29504 --num_gpus=4 train_ds_with_videogptplus.py \ 8 | --videogptplus_path="./checkpoints_hf/MBZUAI/VideoGPT-plus_Phi3-mini-4k/mvbench" \ 9 | --vision_tower="./OpenGVLab/InternVideo2-Stage2_1B-224p-f4/InternVideo2-stage2_1b-224p-f4.pt" \ 10 | --image_vision_tower="openai/clip-vit-large-patch14-336" \ 11 | --dataset_dir='./dataset' \ 12 | --video_dataset_dir='./video_dataset' \ 13 | --sam_pretrained_path="./checkpoints/sam2/sam2_hiera_large.pt" \ 14 | --exp_name="sam2_videogptplusphi3" \ 15 | --logs_base_dir "./runs/logs" \ 16 | --ckpt_base_dir "./runs/ckpts" \ 17 | --dataset="sem_seg||refer_seg||vqa||reason_seg||grandf||refer_vos||mevis||vidstg||video_vqa" \ 18 | --sample_rates_for_datasets="9,3,3,1,1,10,10,10,10,10" \ 19 | --train_mask_decoder=False \ 20 | --tune_mm_mlp_adapter=True \ 21 | --use_sam_version='v2' \ 22 | --precision='fp16' \ 23 | --num_frames_for_sam=8 \ 24 | --batch_size=1 \ 25 | --grad_accumulation_steps=10 \ 26 | --epochs=20 \ 27 | --auto_resume 28 | 29 | * Finetuning with video-GCG data 30 | 31 | deepspeed --master_port=29504 --num_gpus=4 train_ds_with_videogptplus.py \ 32 | --videogptplus_path="./checkpoints_hf/MBZUAI/VideoGPT-plus_Phi3-mini-4k/mvbench" \ 33 | --vision_tower="./OpenGVLab/InternVideo2-Stage2_1B-224p-f4/InternVideo2-stage2_1b-224p-f4.pt" \ 34 | --image_vision_tower="openai/clip-vit-large-patch14-336" \ 35 | --dataset_dir='./dataset' \ 36 | --video_dataset_dir='./video_dataset' \ 37 | --sam_pretrained_path="./checkpoints/sam2/sam2_hiera_large.pt" \ 38 | --exp_name="sam2_videogptplusphi3" \ 39 | --logs_base_dir "./runs/logs" \ 40 | --ckpt_base_dir "./runs/ckpts" \ 41 | --dataset="sem_seg||refer_seg||vqa||reason_seg||grandf||refer_vos||mevis||vidstg||video_vqa||anet_gcg||video_gcg||mevis_gcg||vidstg_gcg||hcstvg_gcg" \ 42 | --sample_rates_for_datasets="1,1,1,1,20,1,1,1,1,1,20,5,20,20,10" \ 43 | --train_mask_decoder=False \ 44 | --tune_mm_mlp_adapter=True \ 45 | --use_sam_version='v2' \ 46 | --precision='fp16' \ 47 | --num_frames_for_sam=8 \ 48 | --batch_size=1 \ 49 | --grad_accumulation_steps=10 \ 50 | --epochs=30 \ 51 | --auto_resume 52 | 53 | -------------------------------------------------------------------------------- /VideoGLaMM/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-oryx/VideoGLaMM/f8351c5f1fcda715c6afff66fa4a777e75d08e1b/VideoGLaMM/.DS_Store -------------------------------------------------------------------------------- /VideoGLaMM/.gitignore: -------------------------------------------------------------------------------- 1 | !aws_backup/ 2 | 3 | **/__pycache__ 4 | archive/ 5 | .runs/ 6 | runs 7 | .vscode/ 8 | 9 | dataset 10 | video_dataset 11 | checkpoints 12 | checkpoints/ 13 | checkpoints_hf 14 | checkpoints_hf/ 15 | 16 | .ipynb_checkpoints 17 | */.ipynb_checkpoints/* 18 | 19 | *.ipynb 20 | 21 | vis_output/ 22 | slurm_outputs/ 23 | 24 | scripts/ 25 | 26 | 27 | 
model/segment_anything_2/build/lib.linux-x86_64-cpython-310/sam2/_C.so 28 | model/segment_anything_2/build/lib.linux-x86_64-cpython-311/sam2/_C.so 29 | model/segment_anything_2/build/temp.linux-x86_64-cpython-310/.ninja_deps 30 | model/segment_anything_2/build/temp.linux-x86_64-cpython-310/.ninja_log 31 | model/segment_anything_2/build/temp.linux-x86_64-cpython-310/build.ninja 32 | model/segment_anything_2/build/temp.linux-x86_64-cpython-310/sam2/csrc/connected_components.o 33 | model/segment_anything_2/build/temp.linux-x86_64-cpython-311/.ninja_deps 34 | model/segment_anything_2/build/temp.linux-x86_64-cpython-311/.ninja_log 35 | model/segment_anything_2/build/temp.linux-x86_64-cpython-311/build.ninja 36 | model/segment_anything_2/build/temp.linux-x86_64-cpython-311/sam2/csrc/connected_components.o 37 | model/segment_anything_2/sam2/_C.so 38 | *.so 39 | *.ninja_log 40 | *.ninja_deps 41 | *.ninja 42 | *.o 43 | 44 | 45 | .gradio 46 | 47 | 48 | tools/stanford-corenlp-full-2018-02-27 -------------------------------------------------------------------------------- /VideoGLaMM/gcg_data_gen/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-oryx/VideoGLaMM/f8351c5f1fcda715c6afff66fa4a777e75d08e1b/VideoGLaMM/gcg_data_gen/.DS_Store -------------------------------------------------------------------------------- /VideoGLaMM/gcg_data_gen/burst_ytvis_gcg/README.md: -------------------------------------------------------------------------------- 1 | 2 | **Run the demo**
3 | Take YouTubeVIS2019 for example 4 | ```bash 5 | # Generation step1 --rough description of each object 6 | python generation.py --video_path video_data/youtube2019/train/JPEGImages --question These are frames from a video that I want to upload. What does the look like and what is the doing? --ann_path video_data/youtube2019/train.json --output_file generated_step1.txt --step step1 7 | 8 | 9 | # Generation step2 --corrected description of the object 10 | python generation.py --video_path video_data/youtube2019/train/JPEGImages --question These are frames from a video that I want to upload. Please modify this caption: The instance in the video is surrounded by a rectangular box with color number . The output caption must include what the looks like and what the is doing. Please do not mention any information about the bbox in the output. --ann_path video_data/youtube2019/train.json --output_file generated_step2.txt --step step2 --caption_file output/generated_step1.json 11 | 12 | # Generation step3 --comprehensive description of the video 13 | python generation.py --video_path video_data/youtube2019/train/JPEGImages --question These are frames from a video that I want to upload. In the video, the ID number of the box is on the top left of the box. There are some instance captions: '' Generate a dense caption that describes the video in detail based on the video and instance captions, including all of the instances mentioned in the instance captions and other instances in the video. Ensure that each instance mentioned in the instance caption appears exactly once in the dense caption, followed by the format {obj_} to indicate which instance caption the mentioned instance corresponds to. The {obj_} must directly follow the noun representing the instance.Please do not mention any information about the bbox in the output. 
--ann_path video_data/youtube2019/train.json --output_file generated_step3.txt --step step3 --caption_file output/generated_step2.json 14 | 15 | # Manually review the {obj_id} in the generated video captions based on the video content 16 | 17 | # Generate annotation file with caption 18 | python generate_annotations.py --ann_file video_data/youtube2019/train.json --obj_cap output/generated_step2.json --dense_cap output/manual_generated_step3.json --out_ann_file generated_annotation.json 19 | 20 | 21 | # Merge BURST and YouTubeVIS2019 annotation files 22 | python merge_b_y.py --burst_train video_data/burst/train/b2y_train_add_cap_del_filtered_ann.json --burst_val video_data/burst/val/b2y_val_add_cap_del_filtered_ann.json --yt19_train video_data/ytvis_2019/train_add_cap_filtered_ann.json --hq_ann_file video_data/ytvis_2019/ --out_ann_path output 23 | ``` 24 | -------------------------------------------------------------------------------- /VideoGLaMM/gcg_data_gen/burst_ytvis_gcg/generate_annotations.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import argparse 4 | 5 | 6 | def get_arguments(): 7 | parser = argparse.ArgumentParser(description="Inference parameters") 8 | 9 | parser.add_argument("--ann_file", type=str, help="path to annotations") 10 | parser.add_argument("--obj_cap", type=str, help="the generated object-level caption") 11 | parser.add_argument("--dense_cap", type=str, help="the generated video-level caption") 12 | parser.add_argument("--out_ann_file", type=str, help="path to the final annotations file") 13 | 14 | return parser.parse_args() 15 | 16 | if __name__ == "__main__": 17 | args=get_arguments() 18 | ann_file= json.load(open(args.ann_file)) 19 | obj_cap=json.load(open(args.obj_cap)) 20 | dense_cap=json.load(open(args.dense_cap)) 21 | out_ann_file= args.out_ann_file 22 | 23 | 24 | for ann in ann_file['annotations']: 25 | if str(ann['id']) in obj_cap.keys(): 26 | ann['cap']=obj_cap[str(ann['id'])] 27 | else: 28 | ann['cap']=None 29 | 30 | video_cap={} 31 | for ann in ann_file['annotations']: 32 | if ann['video_id'] not in video_cap.keys(): 33 | video_cap[ann['video_id']]=[] 34 | if str(ann['id']) in obj_cap.keys(): 35 | video_cap[ann['video_id']].append(dict(cls_id=ann['category_id'],seg=ann['segmentations'],bboxes=ann['bboxes'],cap=obj_cap[str(ann['id'])],obj_id=ann['id'],ann_id=len(video_cap[ann['video_id']]))) 36 | for vid,video in enumerate(ann_file['videos']): 37 | if str(vid) in dense_cap.keys(): 38 | if len(video_cap[video['id']])==0: 39 | video['dense_cap']={} 40 | video['dense_cap']['v_id2o_id']=None 41 | video['dense_cap']['token_pos']=None 42 | video['dense_cap']['mask_id']=None 43 | video['dense_cap']['caption']=None 44 | else: 45 | video['dense_cap']={} 46 | video['dense_cap']['v_id2o_id']={} 47 | video['dense_cap']['token_pos']=[] 48 | video['dense_cap']['mask_id']=[] 49 | for an in video_cap[video['id']]: 50 | video['dense_cap']['v_id2o_id'][an['ann_id']]=an['obj_id'] 51 | spl_dense_cap=dense_cap[str(vid)].split(' ') 52 | me_cap=[] 53 | for wid,word in enumerate(spl_dense_cap): 54 | if '{obj_' in word : 55 | video['dense_cap']['token_pos'].append(len(me_cap)-1) 56 | m_id=int(re.findall(r'\d+',word)[0]) 57 | if m_id')] 20 | 21 | def insert_separator(X, sep): 22 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 23 | 24 | input_ids = [] 25 | offset = 0 26 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 27 | 
offset = 1 28 | input_ids.append(prompt_chunks[0][0]) 29 | 30 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 31 | input_ids.extend(x[offset:]) 32 | 33 | if return_tensors is not None: 34 | if return_tensors == 'pt': 35 | return torch.tensor(input_ids, dtype=torch.long) 36 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 37 | 38 | return input_ids 39 | 40 | 41 | def get_model_name_from_path(model_path): 42 | model_path = model_path.strip("/") 43 | model_paths = model_path.split("/") 44 | if model_paths[-1].startswith('checkpoint-'): 45 | return model_paths[-2] + "_" + model_paths[-1] 46 | else: 47 | return model_paths[-1] 48 | 49 | 50 | class KeywordsStoppingCriteria(StoppingCriteria): 51 | def __init__(self, keywords, tokenizer, input_ids): 52 | self.keywords = keywords 53 | self.keyword_ids = [] 54 | for keyword in keywords: 55 | cur_keyword_ids = tokenizer(keyword).input_ids 56 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 57 | cur_keyword_ids = cur_keyword_ids[1:] 58 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 59 | self.tokenizer = tokenizer 60 | self.start_len = input_ids.shape[1] 61 | 62 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 63 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 64 | offset = min(output_ids.shape[1] - self.start_len, 3) 65 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 66 | for keyword_id in self.keyword_ids: 67 | if output_ids[0, -keyword_id.shape[0]:] == keyword_id: 68 | return True 69 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 70 | for keyword in self.keywords: 71 | if keyword in outputs: 72 | return True 73 | return False 74 | -------------------------------------------------------------------------------- /VideoGLaMM/model/chatunivi/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-oryx/VideoGLaMM/f8351c5f1fcda715c6afff66fa4a777e75d08e1b/VideoGLaMM/model/chatunivi/model/__init__.py -------------------------------------------------------------------------------- /VideoGLaMM/model/chatunivi/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .clip_encoder import CLIPVisionTower 2 | # from .eva_encoder import EVAVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | if vision_tower.startswith("openai") or vision_tower.startswith("laion"): 8 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 9 | 10 | # elif vision_tower.startswith("eva_vit_g"): 11 | # return EVAVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | 13 | raise ValueError(f'Unknown vision tower: {vision_tower}') 14 | -------------------------------------------------------------------------------- /VideoGLaMM/model/chatunivi/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args=None, delay_load=False): 9 | super().__init__() 
10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | if args is None: 15 | self.select_layer = -2 16 | self.select_feature = 'patch' 17 | else: 18 | self.select_layer = args.mm_vision_select_layer 19 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 20 | 21 | if not delay_load: 22 | self.load_model() 23 | else: 24 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 25 | 26 | def load_model(self): 27 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 28 | self.image_eval_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 29 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 30 | self.vision_tower.requires_grad_(False) 31 | 32 | self.is_loaded = True 33 | 34 | def feature_select(self, image_forward_outs, select_feature='patch'): 35 | image_features = image_forward_outs.hidden_states[self.select_layer] 36 | if select_feature == 'patch': 37 | image_features = image_features[:, 1:] 38 | elif select_feature == 'cls_patch': 39 | image_features = image_features 40 | else: 41 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 42 | return image_features 43 | 44 | @torch.no_grad() 45 | def forward(self, images, select_feature='patch'): 46 | if type(images) is list: 47 | image_features = [] 48 | for image in images: 49 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 50 | image_feature = self.feature_select(image_forward_out, select_feature).to(image.dtype) 51 | image_features.append(image_feature) 52 | else: 53 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 54 | image_features = self.feature_select(image_forward_outs, select_feature).to(images.dtype) 55 | 56 | return image_features 57 | 58 | @property 59 | def dummy_feature(self): 60 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 61 | 62 | @property 63 | def dtype(self): 64 | return self.vision_tower.dtype 65 | 66 | @property 67 | def device(self): 68 | return self.vision_tower.device 69 | 70 | @property 71 | def config(self): 72 | if self.is_loaded: 73 | return self.vision_tower.config 74 | else: 75 | return self.cfg_only 76 | 77 | @property 78 | def hidden_size(self): 79 | return self.config.hidden_size 80 | 81 | @property 82 | def num_patches(self): 83 | return (self.config.image_size // self.config.patch_size) ** 2 84 | -------------------------------------------------------------------------------- /VideoGLaMM/model/chatunivi/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from ChatUniVi.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/mm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | 4 | import torch 5 | from PIL import Image 6 | from transformers import StoppingCriteria 7 | 8 | from .constants import IMAGE_TOKEN_INDEX 9 | 10 | 11 | def load_image_from_base64(image): 12 | return Image.open(BytesIO(base64.b64decode(image))) 13 | 14 | 15 | def process_images(images, image_processor, model_cfg): 16 | return image_processor(images, return_tensors="pt")["pixel_values"] 17 | 18 | 19 | def tokenizer_image_token( 20 | prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None 21 | ): 22 | ''' 23 | This function is used to tokenize a prompt string containing the token. 24 | 25 | - The input string is tokenized into a list of token IDs. 26 | - The image_token_index is inserted between the chunks of the prompt where was present. 27 | - The resulting result is either a list of token IDs or a PyTorch tensor, depending on the specified return_tensors parameter. 28 | 29 | ''' 30 | # splits the prompt string into chunks based on the token and tokenizes each chunk separately using the provided tokenizer. 31 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] 32 | 33 | 34 | # helper function that takes a list X and a separator sep, and it interleaves the elements of X with the separator. 35 | # It is used to insert the image_token_index between the tokenized chunks of the prompt. 36 | def insert_separator(X, sep): 37 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 38 | 39 | input_ids = [] 40 | offset = 0 41 | # If the first chunk of the prompt starts with the tokenizer's beginning-of-sequence (BOS) token, 42 | # it increments the offset and appends the BOS token to the input_ids list. 
43 | if ( 44 | len(prompt_chunks) > 0 45 | and len(prompt_chunks[0]) > 0 46 | and prompt_chunks[0][0] == tokenizer.bos_token_id 47 | ): 48 | offset = 1 49 | input_ids.append(prompt_chunks[0][0]) 50 | 51 | # uses the insert_separator function to insert the image_token_index between the tokenized chunks of the prompt. 52 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 53 | input_ids.extend(x[offset:]) 54 | 55 | # The resulting input_ids list contains the tokenized prompt with the image token index appropriately inserted. 56 | 57 | if return_tensors is not None: 58 | if return_tensors == "pt": 59 | return torch.tensor(input_ids, dtype=torch.long) 60 | raise ValueError(f"Unsupported tensor type: {return_tensors}") 61 | return input_ids 62 | 63 | 64 | def get_model_name_from_path(model_path): 65 | model_path = model_path.strip("/") 66 | model_paths = model_path.split("/") 67 | if model_paths[-1].startswith("checkpoint-"): 68 | return model_paths[-2] + "_" + model_paths[-1] 69 | else: 70 | return model_paths[-1] 71 | 72 | 73 | class KeywordsStoppingCriteria(StoppingCriteria): 74 | def __init__(self, keywords, tokenizer, input_ids): 75 | self.keywords = keywords 76 | self.keyword_ids = [] 77 | for keyword in keywords: 78 | cur_keyword_ids = tokenizer(keyword).input_ids 79 | if ( 80 | len(cur_keyword_ids) > 1 81 | and cur_keyword_ids[0] == tokenizer.bos_token_id 82 | ): 83 | cur_keyword_ids = cur_keyword_ids[1:] 84 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 85 | self.tokenizer = tokenizer 86 | self.start_len = input_ids.shape[1] 87 | 88 | def __call__( 89 | self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs 90 | ) -> bool: 91 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 92 | offset = min(output_ids.shape[1] - self.start_len, 3) 93 | self.keyword_ids = [ 94 | keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids 95 | ] 96 | for keyword_id in self.keyword_ids: 97 | if output_ids[0, -keyword_id.shape[0] :] == keyword_id: 98 | return True 99 | outputs = self.tokenizer.batch_decode( 100 | output_ids[:, -offset:], skip_special_tokens=True 101 | )[0] 102 | for keyword in self.keywords: 103 | if keyword in outputs: 104 | return True 105 | return False 106 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM 2 | from .language_model.llava_mpt import LlavaMPTConfig, LlavaMPTForCausalLM 3 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from llava import LlavaLlamaForCausalLM 9 | from tqdm import tqdm 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 17 | ) 18 | 19 | print("Loading delta") 20 | delta = LlavaLlamaForCausalLM.from_pretrained( 21 | delta_path, 
torch_dtype=torch.float16, low_cpu_mem_usage=True 22 | ) 23 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 24 | 25 | print("Applying delta") 26 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 27 | if name not in base.state_dict(): 28 | assert name in [ 29 | "model.mm_projector.weight", 30 | "model.mm_projector.bias", 31 | ], f"{name} not in base model" 32 | continue 33 | if param.data.shape == base.state_dict()[name].shape: 34 | param.data += base.state_dict()[name] 35 | else: 36 | assert name in [ 37 | "model.embed_tokens.weight", 38 | "lm_head.weight", 39 | ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 40 | bparam = base.state_dict()[name] 41 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 42 | 43 | print("Saving target model") 44 | delta.save_pretrained(target_model_path) 45 | delta_tokenizer.save_pretrained(target_model_path) 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--base-model-path", type=str, required=True) 51 | parser.add_argument("--target-model-path", type=str, required=True) 52 | parser.add_argument("--delta-path", type=str, required=True) 53 | 54 | args = parser.parse_args() 55 | 56 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 57 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from llava.model import * 9 | from llava.model.utils import auto_upgrade 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained( 17 | src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 18 | ) 19 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 20 | src_model.save_pretrained(dst_path) 21 | src_tokenizer.save_pretrained(dst_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--src", type=str, required=True) 27 | parser.add_argument("--dst", type=str, required=True) 28 | 29 | args = parser.parse_args() 30 | 31 | consolidate_ckpt(args.src, args.dst) 32 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/language_model/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from transformers import (AutoTokenizer, PreTrainedTokenizer, 4 | PreTrainedTokenizerFast) 5 | 6 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 7 | NUM_SENTINEL_TOKENS: int = 100 8 | 9 | 10 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 11 | """Adds sentinel tokens and padding token (if missing). 12 | 13 | Expands the tokenizer vocabulary to include sentinel tokens 14 | used in mixture-of-denoiser tasks as well as a padding token. 15 | 16 | All added tokens are added as special tokens. No tokens are 17 | added if sentinel tokens and padding token already exist. 
18 | """ 19 | sentinels_to_add = [f"" for i in range(NUM_SENTINEL_TOKENS)] 20 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 21 | if tokenizer.pad_token is None: 22 | tokenizer.add_tokens("", special_tokens=True) 23 | tokenizer.pad_token = "" 24 | assert tokenizer.pad_token_id is not None 25 | sentinels = "".join([f"" for i in range(NUM_SENTINEL_TOKENS)]) 26 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 27 | tokenizer.sentinel_token_ids = _sentinel_token_ids 28 | 29 | 30 | class AutoTokenizerForMOD(AutoTokenizer): 31 | """AutoTokenizer + Adaptation for MOD. 32 | 33 | A simple wrapper around AutoTokenizer to make instantiating 34 | an MOD-adapted tokenizer a bit easier. 35 | 36 | MOD-adapted tokenizers have sentinel tokens (e.g., ), 37 | a padding token, and a property to get the token ids of the 38 | sentinel tokens. 39 | """ 40 | 41 | @classmethod 42 | def from_pretrained(cls, *args, **kwargs): 43 | """See `AutoTokenizer.from_pretrained` docstring.""" 44 | tokenizer = super().from_pretrained(*args, **kwargs) 45 | adapt_tokenizer_for_denoising(tokenizer) 46 | return tokenizer 47 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/language_model/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from .attention import ATTN_CLASS_REGISTRY 8 | from .norm import NORM_CLASS_REGISTRY 9 | 10 | 11 | class MPTMLP(nn.Module): 12 | def __init__( 13 | self, d_model: int, expansion_ratio: int, device: Optional[str] = None 14 | ): 15 | super().__init__() 16 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 17 | self.act = nn.GELU(approximate="none") 18 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 19 | self.down_proj._is_residual = True 20 | 21 | def forward(self, x): 22 | return self.down_proj(self.act(self.up_proj(x))) 23 | 24 | 25 | class MPTBlock(nn.Module): 26 | def __init__( 27 | self, 28 | d_model: int, 29 | n_heads: int, 30 | expansion_ratio: int, 31 | attn_config: Dict = { 32 | "attn_type": "multihead_attention", 33 | "attn_pdrop": 0.0, 34 | "attn_impl": "triton", 35 | "qk_ln": False, 36 | "clip_qkv": None, 37 | "softmax_scale": None, 38 | "prefix_lm": False, 39 | "attn_uses_sequence_id": False, 40 | "alibi": False, 41 | "alibi_bias_max": 8, 42 | }, 43 | resid_pdrop: float = 0.0, 44 | norm_type: str = "low_precision_layernorm", 45 | verbose: int = 0, 46 | device: Optional[str] = None, 47 | **kwargs 48 | ): 49 | del kwargs 50 | super().__init__() 51 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 52 | attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]] 53 | self.norm_1 = norm_class(d_model, device=device) 54 | self.attn = attn_class( 55 | attn_impl=attn_config["attn_impl"], 56 | clip_qkv=attn_config["clip_qkv"], 57 | qk_ln=attn_config["qk_ln"], 58 | softmax_scale=attn_config["softmax_scale"], 59 | attn_pdrop=attn_config["attn_pdrop"], 60 | d_model=d_model, 61 | n_heads=n_heads, 62 | verbose=verbose, 63 | device=device, 64 | ) 65 | self.norm_2 = norm_class(d_model, device=device) 66 | self.ffn = MPTMLP( 67 | d_model=d_model, expansion_ratio=expansion_ratio, device=device 68 | ) 69 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 70 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 71 | 72 | def forward( 73 | self, 74 | x: 
torch.Tensor, 75 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 76 | attn_bias: Optional[torch.Tensor] = None, 77 | attention_mask: Optional[torch.ByteTensor] = None, 78 | is_causal: bool = True, 79 | ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 80 | a = self.norm_1(x) 81 | (b, attn_weights, past_key_value) = self.attn( 82 | a, 83 | past_key_value=past_key_value, 84 | attn_bias=attn_bias, 85 | attention_mask=attention_mask, 86 | is_causal=is_causal, 87 | ) 88 | x = x + self.resid_attn_dropout(b) 89 | m = self.norm_2(x) 90 | n = self.ffn(m) 91 | x = x + self.resid_ffn_dropout(n) 92 | return (x, attn_weights, past_key_value) 93 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | 7 | class SharedEmbedding(nn.Embedding): 8 | def forward(self, input: Tensor, unembed: bool = False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) 12 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/language_model/mpt/meta_init_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | @contextmanager 8 | def init_empty_weights(include_buffers: bool = False): 9 | """Meta initialization context manager. 10 | 11 | A context manager under which models are initialized with all parameters 12 | on the meta device, therefore creating an empty model. Useful when just 13 | initializing the model would blow the available RAM. 14 | 15 | Args: 16 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 17 | not to also put all buffers on the meta device while initializing. 18 | 19 | Example: 20 | ```python 21 | import torch.nn as nn 22 | 23 | # Initialize a model with 100 billions parameters in no time and without using any RAM. 24 | with init_empty_weights(): 25 | tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) 26 | ``` 27 | 28 | 29 | 30 | Any model created under this context manager has no weights. As such you can't do something like 31 | `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. 32 | 33 | 34 | """ 35 | with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f: 36 | yield f 37 | 38 | 39 | @contextmanager 40 | def init_on_device(device: torch.device, include_buffers: bool = False): 41 | """Device initialization context manager. 42 | 43 | A context manager under which models are initialized with all parameters 44 | on the specified device. 45 | 46 | Args: 47 | device (`torch.device`): Device to initialize all parameters on. 48 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 49 | not to also put all buffers on the meta device while initializing. 
50 | 51 | Example: 52 | ```python 53 | import torch.nn as nn 54 | 55 | with init_on_device(device=torch.device("cuda")): 56 | tst = nn.Liner(100, 100) # on `cuda` device 57 | ``` 58 | """ 59 | old_register_parameter = nn.Module.register_parameter 60 | if include_buffers: 61 | old_register_buffer = nn.Module.register_buffer 62 | 63 | def register_empty_parameter(module, name, param): 64 | old_register_parameter(module, name, param) 65 | if param is not None: 66 | param_cls = type(module._parameters[name]) 67 | kwargs = module._parameters[name].__dict__ 68 | module._parameters[name] = param_cls( 69 | module._parameters[name].to(device), **kwargs 70 | ) 71 | 72 | def register_empty_buffer(module, name, buffer): 73 | old_register_buffer(module, name, buffer) 74 | if buffer is not None: 75 | module._buffers[name] = module._buffers[name].to(device) 76 | 77 | if include_buffers: 78 | tensor_constructors_to_patch = { 79 | torch_function_name: getattr(torch, torch_function_name) 80 | for torch_function_name in ["empty", "zeros", "ones", "full"] 81 | } 82 | else: 83 | tensor_constructors_to_patch = {} 84 | 85 | def patch_tensor_constructor(fn): 86 | def wrapper(*args, **kwargs): 87 | kwargs["device"] = device 88 | return fn(*args, **kwargs) 89 | 90 | return wrapper 91 | 92 | try: 93 | nn.Module.register_parameter = register_empty_parameter 94 | if include_buffers: 95 | nn.Module.register_buffer = register_empty_buffer 96 | for torch_function_name in tensor_constructors_to_patch.keys(): 97 | setattr( 98 | torch, 99 | torch_function_name, 100 | patch_tensor_constructor(getattr(torch, torch_function_name)), 101 | ) 102 | yield 103 | finally: 104 | nn.Module.register_parameter = old_register_parameter 105 | if include_buffers: 106 | nn.Module.register_buffer = old_register_buffer 107 | for ( 108 | torch_function_name, 109 | old_torch_function, 110 | ) in tensor_constructors_to_patch.items(): 111 | setattr(torch, torch_function_name, old_torch_function) 112 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/language_model/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _cast_if_autocast_enabled(tensor): 5 | if torch.is_autocast_enabled(): 6 | if tensor.device.type == "cuda": 7 | dtype = torch.get_autocast_gpu_dtype() 8 | elif tensor.device.type == "cpu": 9 | dtype = torch.get_autocast_cpu_dtype() 10 | else: 11 | raise NotImplementedError() 12 | return tensor.to(dtype=dtype) 13 | return tensor 14 | 15 | 16 | class LPLayerNorm(torch.nn.LayerNorm): 17 | def __init__( 18 | self, 19 | normalized_shape, 20 | eps=1e-05, 21 | elementwise_affine=True, 22 | device=None, 23 | dtype=None, 24 | ): 25 | super().__init__( 26 | normalized_shape=normalized_shape, 27 | eps=eps, 28 | elementwise_affine=elementwise_affine, 29 | device=device, 30 | dtype=dtype, 31 | ) 32 | 33 | def forward(self, x): 34 | module_device = x.device 35 | downcast_x = _cast_if_autocast_enabled(x) 36 | downcast_weight = ( 37 | _cast_if_autocast_enabled(self.weight) 38 | if self.weight is not None 39 | else self.weight 40 | ) 41 | downcast_bias = ( 42 | _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 43 | ) 44 | with torch.autocast(enabled=False, device_type=module_device.type): 45 | return torch.nn.functional.layer_norm( 46 | downcast_x, 47 | self.normalized_shape, 48 | downcast_weight, 49 | downcast_bias, 50 | self.eps, 51 | ) 52 | 53 | 54 | def rms_norm(x, weight=None, 
eps=1e-05): 55 | output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 56 | if weight is not None: 57 | return output * weight 58 | return output 59 | 60 | 61 | class RMSNorm(torch.nn.Module): 62 | def __init__( 63 | self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None 64 | ): 65 | super().__init__() 66 | self.eps = eps 67 | if weight: 68 | self.weight = torch.nn.Parameter( 69 | torch.ones(normalized_shape, dtype=dtype, device=device) 70 | ) 71 | else: 72 | self.register_parameter("weight", None) 73 | 74 | def forward(self, x): 75 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 76 | 77 | 78 | class LPRMSNorm(RMSNorm): 79 | def __init__( 80 | self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None 81 | ): 82 | super().__init__( 83 | normalized_shape=normalized_shape, 84 | eps=eps, 85 | weight=weight, 86 | dtype=dtype, 87 | device=device, 88 | ) 89 | 90 | def forward(self, x): 91 | downcast_x = _cast_if_autocast_enabled(x) 92 | downcast_weight = ( 93 | _cast_if_autocast_enabled(self.weight) 94 | if self.weight is not None 95 | else self.weight 96 | ) 97 | with torch.autocast(enabled=False, device_type=x.device.type): 98 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 99 | 100 | 101 | NORM_CLASS_REGISTRY = { 102 | "layernorm": torch.nn.LayerNorm, 103 | "low_precision_layernorm": LPLayerNorm, 104 | "rmsnorm": RMSNorm, 105 | "low_precision_rmsnorm": LPRMSNorm, 106 | } 107 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from llava.model.utils import auto_upgrade 9 | from tqdm import tqdm 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 17 | ) 18 | 19 | print("Loading target model") 20 | auto_upgrade(target_model_path) 21 | target = AutoModelForCausalLM.from_pretrained( 22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 23 | ) 24 | 25 | print("Calculating delta") 26 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 27 | if name not in base.state_dict(): 28 | assert name in [ 29 | "model.mm_projector.weight", 30 | "model.mm_projector.bias", 31 | ], f"{name} not in base model" 32 | continue 33 | if param.data.shape == base.state_dict()[name].shape: 34 | param.data -= base.state_dict()[name] 35 | else: 36 | assert name in [ 37 | "model.embed_tokens.weight", 38 | "lm_head.weight", 39 | ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 40 | bparam = base.state_dict()[name] 41 | param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam 42 | 43 | print("Saving delta") 44 | if hub_repo_id: 45 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 46 | else: 47 | kwargs = {} 48 | target.save_pretrained(delta_path, **kwargs) 49 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 50 | target_tokenizer.save_pretrained(delta_path, 
**kwargs) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("--base-model-path", type=str, required=True) 56 | parser.add_argument("--target-model-path", type=str, required=True) 57 | parser.add_argument("--delta-path", type=str, required=True) 58 | parser.add_argument("--hub-repo-id", type=str, default=None) 59 | args = parser.parse_args() 60 | 61 | make_delta( 62 | args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id 63 | ) 64 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .clip_encoder import CLIPVisionTower 2 | 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr( 6 | vision_tower_cfg, 7 | "mm_vision_tower", 8 | getattr(vision_tower_cfg, "vision_tower", None), 9 | ) 10 | if ( 11 | vision_tower.startswith("openai") 12 | or vision_tower.startswith("laion") 13 | or "clip" in vision_tower 14 | ): 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | 17 | raise ValueError(f"Unknown vision tower: {vision_tower}") 18 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel 4 | 5 | 6 | class CLIPVisionTower(nn.Module): 7 | def __init__(self, vision_tower, args, delay_load=False): 8 | super().__init__() 9 | 10 | self.is_loaded = False 11 | 12 | self.vision_tower_name = vision_tower 13 | self.select_layer = args.mm_vision_select_layer 14 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 15 | 16 | if not delay_load: 17 | self.load_model() 18 | else: 19 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 20 | 21 | def load_model(self): 22 | self.image_processor = CLIPImageProcessor.from_pretrained( 23 | self.vision_tower_name 24 | ) 25 | self.vision_tower = CLIPVisionModel.from_pretrained( 26 | self.vision_tower_name, low_cpu_mem_usage=True 27 | ) 28 | self.vision_tower.requires_grad_(False) 29 | self.is_loaded = True 30 | 31 | def feature_select(self, image_forward_outs): 32 | image_features = image_forward_outs.hidden_states[self.select_layer] 33 | if self.select_feature == "patch": 34 | image_features = image_features[:, 1:] 35 | elif self.select_feature == "cls_patch": 36 | image_features = image_features 37 | else: 38 | raise ValueError(f"Unexpected select feature: {self.select_feature}") 39 | return image_features 40 | 41 | @torch.no_grad() 42 | def forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower( 47 | image.to(device=self.device, dtype=self.dtype).unsqueeze(0), 48 | output_hidden_states=True, 49 | ) 50 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 51 | image_features.append(image_feature) 52 | else: 53 | image_forward_outs = self.vision_tower( 54 | images.to(device=self.device, dtype=self.dtype), 55 | output_hidden_states=True, 56 | ) 57 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 58 | 59 | torch.cuda.empty_cache() 60 | return image_features 61 | 62 | @property 63 | def 
dummy_feature(self): 64 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 65 | 66 | @property 67 | def dtype(self): 68 | return self.vision_tower.dtype 69 | 70 | @property 71 | def device(self): 72 | return self.vision_tower.device 73 | 74 | @property 75 | def config(self): 76 | if self.is_loaded: 77 | return self.vision_tower.config 78 | else: 79 | return self.cfg_only 80 | 81 | @property 82 | def hidden_size(self): 83 | return self.config.hidden_size 84 | 85 | @property 86 | def num_patches(self): 87 | return (self.config.image_size // self.config.patch_size) ** 2 88 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print( 9 | "You are using newer LLaVA code base, while the checkpoint of v0 is from older code base." 10 | ) 11 | print( 12 | "You must upgrade the checkpoint to the new code base (this can be done automatically)." 13 | ) 14 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 15 | if confirm.lower() in ["y", "yes"]: 16 | print("Upgrading checkpoint...") 17 | assert len(cfg.architectures) == 1 18 | setattr(cfg.__class__, "model_type", "llava") 19 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 20 | cfg.save_pretrained(config) 21 | print("Checkpoint upgraded.") 22 | else: 23 | print("Checkpoint upgrade aborted.") 24 | exit(1) 25 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Optional, Tuple 3 | 4 | import torch 5 | import transformers 6 | from einops import rearrange 7 | from torch import nn 8 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 9 | 10 | try: 11 | from flash_attn.flash_attn_interface import \ 12 | flash_attn_unpadded_qkvpacked_func 13 | except ImportError: 14 | from flash_attn.flash_attn_interface import ( 15 | flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func, 16 | ) 17 | 18 | from flash_attn.bert_padding import pad_input, unpad_input 19 | 20 | 21 | def forward( 22 | self, 23 | hidden_states: torch.Tensor, 24 | attention_mask: Optional[torch.Tensor] = None, 25 | position_ids: Optional[torch.Tensor] = None, 26 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 27 | output_attentions: bool = False, 28 | use_cache: bool = False, 29 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 30 | """Input shape: Batch x Time x Channel 31 | 32 | attention_mask: [bsz, q_len] 33 | """ 34 | bsz, q_len, _ = hidden_states.size() 35 | 36 | query_states = ( 37 | self.q_proj(hidden_states) 38 | .view(bsz, q_len, self.num_heads, self.head_dim) 39 | .transpose(1, 2) 40 | ) 41 | key_states = ( 42 | self.k_proj(hidden_states) 43 | .view(bsz, q_len, self.num_heads, self.head_dim) 44 | .transpose(1, 2) 45 | ) 46 | value_states = ( 47 | self.v_proj(hidden_states) 48 | .view(bsz, q_len, self.num_heads, self.head_dim) 49 | .transpose(1, 2) 50 | ) 51 | # [bsz, q_len, nh, hd] 52 | # [bsz, nh, q_len, hd] 53 | 54 | kv_seq_len = key_states.shape[-2] 55 | 
assert past_key_value is None, "past_key_value is not supported" 56 | 57 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 58 | query_states, key_states = apply_rotary_pos_emb( 59 | query_states, key_states, cos, sin, position_ids 60 | ) 61 | # [bsz, nh, t, hd] 62 | assert not output_attentions, "output_attentions is not supported" 63 | assert not use_cache, "use_cache is not supported" 64 | 65 | # Flash attention codes from 66 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py 67 | 68 | # transform the data into the format required by flash attention 69 | qkv = torch.stack( 70 | [query_states, key_states, value_states], dim=2 71 | ) # [bsz, nh, 3, q_len, hd] 72 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] 73 | # We have disabled _prepare_decoder_attention_mask in LlamaModel 74 | # the attention_mask should be the same as the key_padding_mask 75 | key_padding_mask = attention_mask 76 | 77 | if key_padding_mask is None: 78 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 79 | max_s = q_len 80 | cu_q_lens = torch.arange( 81 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 82 | ) 83 | output = flash_attn_unpadded_qkvpacked_func( 84 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 85 | ) 86 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz) 87 | else: 88 | nheads = qkv.shape[-2] 89 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 90 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 91 | x_unpad = rearrange( 92 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads 93 | ) 94 | output_unpad = flash_attn_unpadded_qkvpacked_func( 95 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 96 | ) 97 | output = rearrange( 98 | pad_input( 99 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len 100 | ), 101 | "b s (h d) -> b s h d", 102 | h=nheads, 103 | ) 104 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None 105 | 106 | 107 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 108 | # requires the attention mask to be the same as the key_padding_mask 109 | def _prepare_decoder_attention_mask( 110 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 111 | ): 112 | # [bsz, seq_len] 113 | return attention_mask 114 | 115 | 116 | def replace_llama_attn_with_flash_attn(): 117 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 118 | if cuda_major < 8: 119 | logging.warning( 120 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
121 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 122 | ) 123 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 124 | _prepare_decoder_attention_mask 125 | ) 126 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 127 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/train/llava_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | import torch 5 | from transformers import Trainer 6 | 7 | 8 | def maybe_zero_3(param, ignore_status=False, name=None): 9 | from deepspeed import zero 10 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 11 | 12 | if hasattr(param, "ds_id"): 13 | if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: 14 | if not ignore_status: 15 | print(name, "no ignore status") 16 | with zero.GatheredParameters([param]): 17 | param = param.data.detach().cpu().clone() 18 | else: 19 | param = param.detach().cpu().clone() 20 | return param 21 | 22 | 23 | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): 24 | to_return = { 25 | k: t 26 | for k, t in named_params 27 | if any(key_match in k for key_match in keys_to_match) 28 | } 29 | to_return = { 30 | k: maybe_zero_3(v, ignore_status=True, name=k).cpu() 31 | for k, v in to_return.items() 32 | } 33 | return to_return 34 | 35 | 36 | class LLaVATrainer(Trainer): 37 | def _save_checkpoint(self, model, trial, metrics=None): 38 | if getattr(self.args, "tune_mm_mlp_adapter", False): 39 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 40 | 41 | checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" 42 | 43 | run_dir = self._get_output_dir(trial=trial) 44 | output_dir = os.path.join(run_dir, checkpoint_folder) 45 | 46 | # Only save Adapter 47 | keys_to_match = ["mm_projector"] 48 | if getattr(self.args, "use_im_start_end", False): 49 | keys_to_match.extend(["embed_tokens", "embed_in"]) 50 | 51 | weight_to_save = get_mm_adapter_state_maybe_zero_3( 52 | self.model.named_parameters(), keys_to_match 53 | ) 54 | 55 | if self.args.local_rank == 0 or self.args.local_rank == -1: 56 | self.model.config.save_pretrained(output_dir) 57 | torch.save( 58 | weight_to_save, os.path.join(output_dir, f"mm_projector.bin") 59 | ) 60 | else: 61 | super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) 62 | 63 | def _save(self, output_dir: Optional[str] = None, state_dict=None): 64 | if getattr(self.args, "tune_mm_mlp_adapter", False): 65 | pass 66 | else: 67 | super(LLaVATrainer, self)._save(output_dir, state_dict) 68 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 
6 | from llava.train.llama_flash_attn_monkey_patch import \ 7 | replace_llama_attn_with_flash_attn 8 | 9 | replace_llama_attn_with_flash_attn() 10 | 11 | from llava.train.train import train 12 | 13 | if __name__ == "__main__": 14 | train() 15 | -------------------------------------------------------------------------------- /VideoGLaMM/model/llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | from llava.constants import LOGDIR 9 | 10 | server_error_msg = ( 11 | "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | ) 13 | moderation_msg = ( 14 | "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 15 | ) 16 | 17 | handler = None 18 | 19 | 20 | def build_logger(logger_name, logger_filename): 21 | global handler 22 | 23 | formatter = logging.Formatter( 24 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 25 | datefmt="%Y-%m-%d %H:%M:%S", 26 | ) 27 | 28 | # Set the format of root handlers 29 | if not logging.getLogger().handlers: 30 | logging.basicConfig(level=logging.INFO) 31 | logging.getLogger().handlers[0].setFormatter(formatter) 32 | 33 | # Redirect stdout and stderr to loggers 34 | stdout_logger = logging.getLogger("stdout") 35 | stdout_logger.setLevel(logging.INFO) 36 | sl = StreamToLogger(stdout_logger, logging.INFO) 37 | sys.stdout = sl 38 | 39 | stderr_logger = logging.getLogger("stderr") 40 | stderr_logger.setLevel(logging.ERROR) 41 | sl = StreamToLogger(stderr_logger, logging.ERROR) 42 | sys.stderr = sl 43 | 44 | # Get logger 45 | logger = logging.getLogger(logger_name) 46 | logger.setLevel(logging.INFO) 47 | 48 | # Add a file handler for all loggers 49 | if handler is None: 50 | os.makedirs(LOGDIR, exist_ok=True) 51 | filename = os.path.join(LOGDIR, logger_filename) 52 | handler = logging.handlers.TimedRotatingFileHandler( 53 | filename, when="D", utc=True 54 | ) 55 | handler.setFormatter(formatter) 56 | 57 | for name, item in logging.root.manager.loggerDict.items(): 58 | if isinstance(item, logging.Logger): 59 | item.addHandler(handler) 60 | 61 | return logger 62 | 63 | 64 | class StreamToLogger(object): 65 | """ 66 | Fake file-like stream object that redirects writes to a logger instance. 67 | """ 68 | 69 | def __init__(self, logger, log_level=logging.INFO): 70 | self.terminal = sys.stdout 71 | self.logger = logger 72 | self.log_level = log_level 73 | self.linebuf = "" 74 | 75 | def __getattr__(self, attr): 76 | return getattr(self.terminal, attr) 77 | 78 | def write(self, buf): 79 | temp_linebuf = self.linebuf + buf 80 | self.linebuf = "" 81 | for line in temp_linebuf.splitlines(True): 82 | # From the io.TextIOWrapper docs: 83 | # On output, if newline is None, any '\n' characters written 84 | # are translated to the system default line separator. 85 | # By default sys.stdout.write() expects '\n' newlines and then 86 | # translates them so this is still cross platform. 87 | if line[-1] == "\n": 88 | self.logger.log(self.log_level, line.rstrip()) 89 | else: 90 | self.linebuf += line 91 | 92 | def flush(self): 93 | if self.linebuf != "": 94 | self.logger.log(self.log_level, self.linebuf.rstrip()) 95 | self.linebuf = "" 96 | 97 | 98 | def disable_torch_init(): 99 | """ 100 | Disable the redundant torch default initialization to accelerate model creation. 
101 | """ 102 | import torch 103 | 104 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 105 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 106 | 107 | 108 | def violates_moderation(text): 109 | """ 110 | Check whether the text violates OpenAI moderation API. 111 | """ 112 | url = "https://api.openai.com/v1/moderations" 113 | headers = { 114 | "Content-Type": "application/json", 115 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"], 116 | } 117 | text = text.replace("\n", "") 118 | data = "{" + '"input": ' + f'"{text}"' + "}" 119 | data = data.encode("utf-8") 120 | try: 121 | ret = requests.post(url, headers=headers, data=data, timeout=5) 122 | flagged = ret.json()["results"][0]["flagged"] 123 | except requests.exceptions.RequestException as e: 124 | flagged = False 125 | except KeyError as e: 126 | flagged = False 127 | 128 | return flagged 129 | 130 | 131 | def pretty_print_semaphore(semaphore): 132 | if semaphore is None: 133 | return "None" 134 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 135 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .automatic_mask_generator import SamAutomaticMaskGenerator 8 | from .build_sam import (build_sam, build_sam_vit_b, build_sam_vit_h, 9 | build_sam_vit_l, sam_model_registry) 10 | from .predictor import SamPredictor 11 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from functools import partial 8 | 9 | import torch 10 | 11 | from .modeling import (ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, 12 | TwoWayTransformer, 13 | CustomMaskDecoder) 14 | 15 | 16 | def build_sam_vit_h(checkpoint=None, with_itm=False): 17 | if with_itm: 18 | return _build_sam( 19 | encoder_embed_dim=1280, 20 | encoder_depth=32, 21 | encoder_num_heads=16, 22 | encoder_global_attn_indexes=[7, 15, 23, 31], 23 | checkpoint=checkpoint, 24 | with_itm=True 25 | ) 26 | return _build_sam( 27 | encoder_embed_dim=1280, 28 | encoder_depth=32, 29 | encoder_num_heads=16, 30 | encoder_global_attn_indexes=[7, 15, 23, 31], 31 | checkpoint=checkpoint, 32 | ) 33 | 34 | 35 | build_sam = build_sam_vit_h 36 | 37 | 38 | def build_sam_vit_l(checkpoint=None): 39 | return _build_sam( 40 | encoder_embed_dim=1024, 41 | encoder_depth=24, 42 | encoder_num_heads=16, 43 | encoder_global_attn_indexes=[5, 11, 17, 23], 44 | checkpoint=checkpoint, 45 | ) 46 | 47 | 48 | def build_sam_vit_b(checkpoint=None): 49 | return _build_sam( 50 | encoder_embed_dim=768, 51 | encoder_depth=12, 52 | encoder_num_heads=12, 53 | encoder_global_attn_indexes=[2, 5, 8, 11], 54 | checkpoint=checkpoint, 55 | ) 56 | 57 | 58 | sam_model_registry = { 59 | "default": build_sam_vit_h, 60 | "vit_h": build_sam_vit_h, 61 | "vit_l": build_sam_vit_l, 62 | "vit_b": build_sam_vit_b, 63 | } 64 | 65 | 66 | def _build_sam( 67 | encoder_embed_dim, 68 | encoder_depth, 69 | encoder_num_heads, 70 | encoder_global_attn_indexes, 71 | checkpoint=None, 72 | with_itm=False, 73 | ): 74 | prompt_embed_dim = 256 75 | image_size = 1024 76 | vit_patch_size = 16 77 | image_embedding_size = image_size // vit_patch_size 78 | sam = Sam( 79 | image_encoder=ImageEncoderViT( 80 | depth=encoder_depth, 81 | embed_dim=encoder_embed_dim, 82 | img_size=image_size, 83 | mlp_ratio=4, 84 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 85 | num_heads=encoder_num_heads, 86 | patch_size=vit_patch_size, 87 | qkv_bias=True, 88 | use_rel_pos=True, 89 | global_attn_indexes=encoder_global_attn_indexes, 90 | window_size=14, 91 | out_chans=prompt_embed_dim, 92 | ), 93 | prompt_encoder=PromptEncoder( 94 | embed_dim=prompt_embed_dim, 95 | image_embedding_size=(image_embedding_size, image_embedding_size), 96 | input_image_size=(image_size, image_size), 97 | mask_in_chans=16, 98 | ), 99 | 100 | mask_decoder= CustomMaskDecoder( #NOTE: replace with CustomMaskDecoder 101 | num_multimask_outputs=3, 102 | transformer=TwoWayTransformer( 103 | depth=2, 104 | embedding_dim=prompt_embed_dim, 105 | mlp_dim=2048, 106 | num_heads=8, 107 | ), 108 | transformer_dim=prompt_embed_dim, 109 | iou_head_depth=3, 110 | iou_head_hidden_dim=256, 111 | ) if with_itm else 112 | MaskDecoder( 113 | num_multimask_outputs=3, 114 | transformer=TwoWayTransformer( 115 | depth=2, 116 | embedding_dim=prompt_embed_dim, 117 | mlp_dim=2048, 118 | num_heads=8, 119 | ), 120 | transformer_dim=prompt_embed_dim, 121 | iou_head_depth=3, 122 | iou_head_hidden_dim=256, 123 | ), 124 | 125 | pixel_mean=[123.675, 116.28, 103.53], 126 | pixel_std=[58.395, 57.12, 57.375], 127 | ) 128 | sam.eval() 129 | if checkpoint is not None: 130 | with open(checkpoint, "rb") as f: 131 | state_dict = torch.load(f) 132 | sam.load_state_dict(state_dict, strict=False) 133 | return sam 134 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, 
Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .image_encoder import ImageEncoderViT 8 | from .mask_decoder import MaskDecoder, CustomMaskDecoder 9 | from .prompt_encoder import PromptEncoder 10 | from .sam import Sam 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Type 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from copy import deepcopy 8 | from typing import Tuple 9 | 10 | import numpy as np 11 | import torch 12 | from torch.nn import functional as F 13 | from torchvision.transforms.functional import resize # type: ignore 14 | from torchvision.transforms.functional import to_pil_image 15 | 16 | 17 | class ResizeLongestSide: 18 | """ 19 | Resizes images to the longest side 'target_length', as well as provides 20 | methods for resizing coordinates and boxes. 
Provides methods for 21 | transforming both numpy array and batched torch tensors. 22 | """ 23 | 24 | def __init__(self, target_length: int) -> None: 25 | self.target_length = target_length 26 | 27 | def apply_image(self, image: np.ndarray) -> np.ndarray: 28 | """ 29 | Expects a numpy array with shape HxWxC in uint8 format. 30 | """ 31 | target_size = self.get_preprocess_shape( 32 | image.shape[0], image.shape[1], self.target_length 33 | ) 34 | return np.array(resize(to_pil_image(image), target_size)) 35 | 36 | def apply_coords( 37 | self, coords: np.ndarray, original_size: Tuple[int, ...] 38 | ) -> np.ndarray: 39 | """ 40 | Expects a numpy array of length 2 in the final dimension. Requires the 41 | original image size in (H, W) format. 42 | """ 43 | old_h, old_w = original_size 44 | new_h, new_w = self.get_preprocess_shape( 45 | original_size[0], original_size[1], self.target_length 46 | ) 47 | coords = deepcopy(coords).astype(float) 48 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 49 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 50 | return coords 51 | 52 | def apply_boxes( 53 | self, boxes: np.ndarray, original_size: Tuple[int, ...] 54 | ) -> np.ndarray: 55 | """ 56 | Expects a numpy array shape Bx4. Requires the original image size 57 | in (H, W) format. 58 | """ 59 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 60 | return boxes.reshape(-1, 4) 61 | 62 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 63 | """ 64 | Expects batched images with shape BxCxHxW and float format. This 65 | transformation may not exactly match apply_image. apply_image is 66 | the transformation expected by the model. 67 | """ 68 | # Expects an image in BCHW format. May not exactly match apply_image. 69 | target_size = self.get_preprocess_shape( 70 | image.shape[0], image.shape[1], self.target_length 71 | ) 72 | return F.interpolate( 73 | image, target_size, mode="bilinear", align_corners=False, antialias=True 74 | ) 75 | 76 | def apply_coords_torch( 77 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 78 | ) -> torch.Tensor: 79 | """ 80 | Expects a torch tensor with length 2 in the last dimension. Requires the 81 | original image size in (H, W) format. 82 | """ 83 | old_h, old_w = original_size 84 | new_h, new_w = self.get_preprocess_shape( 85 | original_size[0], original_size[1], self.target_length 86 | ) 87 | coords = deepcopy(coords).to(torch.float) 88 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 89 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 90 | return coords 91 | 92 | def apply_boxes_torch( 93 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 94 | ) -> torch.Tensor: 95 | """ 96 | Expects a torch tensor with shape Bx4. Requires the original image 97 | size in (H, W) format. 98 | """ 99 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 100 | return boxes.reshape(-1, 4) 101 | 102 | @staticmethod 103 | def get_preprocess_shape( 104 | oldh: int, oldw: int, long_side_length: int 105 | ) -> Tuple[int, int]: 106 | """ 107 | Compute the output size given input size and target long side length. 
108 | """ 109 | scale = long_side_length * 1.0 / max(oldh, oldw) 110 | newh, neww = oldh * scale, oldw * scale 111 | neww = int(neww + 0.5) 112 | newh = int(newh + 0.5) 113 | return (newh, neww) 114 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from hydra import initialize_config_module 9 | 10 | initialize_config_module("model/segment_anything_2/sam2_configs", version_base="1.2") 11 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | import torch 10 | from hydra import compose 11 | from hydra.utils import instantiate 12 | from omegaconf import OmegaConf 13 | 14 | def build_sam2( 15 | config_file, 16 | ckpt_path=None, 17 | device="cuda", 18 | mode="eval", 19 | hydra_overrides_extra=[], 20 | apply_postprocessing=True, 21 | ): 22 | 23 | if apply_postprocessing: 24 | hydra_overrides_extra = hydra_overrides_extra.copy() 25 | hydra_overrides_extra += [ 26 | # dynamically fall back to multi-mask if the single mask is not stable 27 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true", 28 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05", 29 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98", 30 | ] 31 | # Read config and init model 32 | cfg = compose(config_name=config_file, overrides=hydra_overrides_extra) 33 | OmegaConf.resolve(cfg) 34 | model = instantiate(cfg.model, _recursive_=True) 35 | _load_checkpoint(model, ckpt_path) 36 | if device: 37 | model = model.to(device) 38 | if mode == "eval": 39 | model.eval() 40 | return model 41 | 42 | 43 | def build_sam2_video_predictor( 44 | config_file, 45 | ckpt_path=None, 46 | device="cuda", 47 | mode="eval", 48 | hydra_overrides_extra=[], 49 | apply_postprocessing=True, 50 | ): 51 | hydra_overrides = [ 52 | "++model._target_=model.segment_anything_2.sam2.sam2_video_predictor.SAM2VideoPredictor", 53 | ] 54 | if apply_postprocessing: 55 | hydra_overrides_extra = hydra_overrides_extra.copy() 56 | hydra_overrides_extra += [ 57 | # dynamically fall back to multi-mask if the single mask is not stable 58 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true", 59 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05", 60 | "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98", 61 | # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking 62 | "++model.binarize_mask_from_pts_for_mem_enc=true", 63 | # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution) 64 | "++model.fill_hole_area=8", 65 | ] 66 | hydra_overrides.extend(hydra_overrides_extra) 67 | 68 | # Read 
config and init model 69 | cfg = compose(config_name=config_file, overrides=hydra_overrides) 70 | OmegaConf.resolve(cfg) 71 | model = instantiate(cfg.model, _recursive_=True) 72 | _load_checkpoint(model, ckpt_path) 73 | if device: 74 | model = model.to(device) 75 | if mode == "eval": 76 | model.eval() 77 | return model 78 | 79 | 80 | # def _load_checkpoint(model, ckpt_path): 81 | # if ckpt_path is not None: 82 | # sd = torch.load(ckpt_path, map_location="cpu")["model"] 83 | # missing_keys, unexpected_keys = model.load_state_dict(sd) 84 | # if missing_keys: 85 | # logging.error(missing_keys) 86 | # raise RuntimeError() 87 | # if unexpected_keys: 88 | # logging.error(unexpected_keys) 89 | # raise RuntimeError() 90 | # logging.info("Loaded checkpoint sucessfully") 91 | 92 | def _load_checkpoint(model, ckpt_path): 93 | ''' 94 | load checkpoint from ckpt_path to model, while renaming 'gamma' to 'weight' in the state dict 95 | ''' 96 | if ckpt_path is not None: 97 | sd = torch.load(ckpt_path, map_location="cpu")["model"] 98 | 99 | # Rename 'gamma' to 'weight' in the state dict 100 | sd = {key.replace('.gamma', '.weight'): value for key, value in sd.items()} 101 | 102 | missing_keys, unexpected_keys = model.load_state_dict(sd) 103 | 104 | if missing_keys: 105 | logging.error(f"Missing keys: {missing_keys}") 106 | raise RuntimeError("Missing keys found in the state dict.") 107 | 108 | if unexpected_keys: 109 | logging.error(f"Unexpected keys: {unexpected_keys}") 110 | raise RuntimeError("Unexpected keys found in the state dict.") 111 | 112 | logging.info("Loaded checkpoint successfully.") 113 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/modeling/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/modeling/backbones/image_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import List, Optional 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | class ImageEncoder(nn.Module): 15 | def __init__( 16 | self, 17 | trunk: nn.Module, 18 | neck: nn.Module, 19 | scalp: int = 0, 20 | ): 21 | super().__init__() 22 | self.trunk = trunk 23 | self.neck = neck 24 | self.scalp = scalp 25 | assert ( 26 | self.trunk.channel_list == self.neck.backbone_channel_list 27 | ), f"Channel dims of trunk and neck do not match. 
Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}" 28 | 29 | def forward(self, sample: torch.Tensor): 30 | # Forward through backbone 31 | features, pos = self.neck(self.trunk(sample)) 32 | if self.scalp > 0: 33 | # Discard the lowest resolution features 34 | features, pos = features[: -self.scalp], pos[: -self.scalp] 35 | 36 | src = features[-1] 37 | output = { 38 | "vision_features": src, 39 | "vision_pos_enc": pos, 40 | "backbone_fpn": features, 41 | } 42 | return output 43 | 44 | 45 | class FpnNeck(nn.Module): 46 | """ 47 | A modified variant of Feature Pyramid Network (FPN) neck 48 | (we remove output conv and also do bicubic interpolation similar to ViT 49 | pos embed interpolation) 50 | """ 51 | 52 | def __init__( 53 | self, 54 | position_encoding: nn.Module, 55 | d_model: int, 56 | backbone_channel_list: List[int], 57 | kernel_size: int = 1, 58 | stride: int = 1, 59 | padding: int = 0, 60 | fpn_interp_model: str = "bilinear", 61 | fuse_type: str = "sum", 62 | fpn_top_down_levels: Optional[List[int]] = None, 63 | ): 64 | """Initialize the neck 65 | :param trunk: the backbone 66 | :param position_encoding: the positional encoding to use 67 | :param d_model: the dimension of the model 68 | :param neck_norm: the normalization to use 69 | """ 70 | super().__init__() 71 | self.position_encoding = position_encoding 72 | self.convs = nn.ModuleList() 73 | self.backbone_channel_list = backbone_channel_list 74 | for dim in backbone_channel_list: 75 | current = nn.Sequential() 76 | current.add_module( 77 | "conv", 78 | nn.Conv2d( 79 | in_channels=dim, 80 | out_channels=d_model, 81 | kernel_size=kernel_size, 82 | stride=stride, 83 | padding=padding, 84 | ), 85 | ) 86 | 87 | self.convs.append(current) 88 | self.fpn_interp_model = fpn_interp_model 89 | assert fuse_type in ["sum", "avg"] 90 | self.fuse_type = fuse_type 91 | 92 | # levels to have top-down features in its outputs 93 | # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3 94 | # have top-down propagation, while outputs of level 0 and level 1 have only 95 | # lateral features from the same backbone level. 
96 | if fpn_top_down_levels is None: 97 | # default is to have top-down features on all levels 98 | fpn_top_down_levels = range(len(self.convs)) 99 | self.fpn_top_down_levels = list(fpn_top_down_levels) 100 | 101 | def forward(self, xs: List[torch.Tensor]): 102 | 103 | out = [None] * len(self.convs) 104 | pos = [None] * len(self.convs) 105 | assert len(xs) == len(self.convs) 106 | # fpn forward pass 107 | # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py 108 | prev_features = None 109 | # forward in top-down order (from low to high resolution) 110 | n = len(self.convs) - 1 111 | for i in range(n, -1, -1): 112 | x = xs[i] 113 | lateral_features = self.convs[n - i](x) 114 | if i in self.fpn_top_down_levels and prev_features is not None: 115 | top_down_features = F.interpolate( 116 | prev_features.to(dtype=torch.float32), 117 | scale_factor=2.0, 118 | mode=self.fpn_interp_model, 119 | align_corners=( 120 | None if self.fpn_interp_model == "nearest" else False 121 | ), 122 | antialias=False, 123 | ) 124 | prev_features = lateral_features + top_down_features 125 | if self.fuse_type == "avg": 126 | prev_features /= 2 127 | else: 128 | prev_features = lateral_features 129 | x_out = prev_features 130 | out[i] = x_out 131 | pos[i] = self.position_encoding(x_out).to(x_out.dtype) 132 | 133 | return out, pos 134 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/modeling/backbones/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Some utilities for backbones, in particular for windowing""" 8 | 9 | from typing import Tuple 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def window_partition(x, window_size): 17 | """ 18 | Partition into non-overlapping windows with padding if needed. 19 | Args: 20 | x (tensor): input tokens with [B, H, W, C]. 21 | window_size (int): window size. 22 | Returns: 23 | windows: windows after partition with [B * num_windows, window_size, window_size, C]. 24 | (Hp, Wp): padded height and width before partition 25 | """ 26 | B, H, W, C = x.shape 27 | 28 | pad_h = (window_size - H % window_size) % window_size 29 | pad_w = (window_size - W % window_size) % window_size 30 | if pad_h > 0 or pad_w > 0: 31 | x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) 32 | Hp, Wp = H + pad_h, W + pad_w 33 | 34 | x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) 35 | windows = ( 36 | x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) 37 | ) 38 | return windows, (Hp, Wp) 39 | 40 | 41 | def window_unpartition(windows, window_size, pad_hw, hw): 42 | """ 43 | Window unpartition into original sequences and removing padding. 44 | Args: 45 | x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. 46 | window_size (int): window size. 47 | pad_hw (Tuple): padded height and width (Hp, Wp). 48 | hw (Tuple): original height and width (H, W) before padding. 49 | Returns: 50 | x: unpartitioned sequences with [B, H, W, C]. 
51 | """ 52 | Hp, Wp = pad_hw 53 | H, W = hw 54 | B = windows.shape[0] // (Hp * Wp // window_size // window_size) 55 | x = windows.view( 56 | B, Hp // window_size, Wp // window_size, window_size, window_size, -1 57 | ) 58 | x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) 59 | 60 | if Hp > H or Wp > W: 61 | x = x[:, :H, :W, :].contiguous() 62 | return x 63 | 64 | 65 | class PatchEmbed(nn.Module): 66 | """ 67 | Image to Patch Embedding. 68 | """ 69 | 70 | def __init__( 71 | self, 72 | kernel_size: Tuple[int, ...] = (7, 7), 73 | stride: Tuple[int, ...] = (4, 4), 74 | padding: Tuple[int, ...] = (3, 3), 75 | in_chans: int = 3, 76 | embed_dim: int = 768, 77 | ): 78 | """ 79 | Args: 80 | kernel_size (Tuple): kernel size of the projection layer. 81 | stride (Tuple): stride of the projection layer. 82 | padding (Tuple): padding size of the projection layer. 83 | in_chans (int): Number of input image channels. 84 | embed_dim (int): embed_dim (int): Patch embedding dimension. 85 | """ 86 | super().__init__() 87 | self.proj = nn.Conv2d( 88 | in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding 89 | ) 90 | 91 | def forward(self, x: torch.Tensor) -> torch.Tensor: 92 | x = self.proj(x) 93 | # B C H W -> B H W C 94 | x = x.permute(0, 2, 3, 1) 95 | return x 96 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/modeling/sam/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torchvision.transforms import Normalize, Resize, ToTensor 11 | 12 | 13 | class SAM2Transforms(nn.Module): 14 | def __init__( 15 | self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0 16 | ): 17 | """ 18 | Transforms for SAM2. 
19 | """ 20 | super().__init__() 21 | self.resolution = resolution 22 | self.mask_threshold = mask_threshold 23 | self.max_hole_area = max_hole_area 24 | self.max_sprinkle_area = max_sprinkle_area 25 | self.mean = [0.485, 0.456, 0.406] 26 | self.std = [0.229, 0.224, 0.225] 27 | self.to_tensor = ToTensor() 28 | self.transforms = torch.jit.script( 29 | nn.Sequential( 30 | Resize((self.resolution, self.resolution)), 31 | Normalize(self.mean, self.std), 32 | ) 33 | ) 34 | 35 | def __call__(self, x): 36 | x = self.to_tensor(x) 37 | return self.transforms(x) 38 | 39 | def forward_batch(self, img_list): 40 | img_batch = [self.transforms(self.to_tensor(img)) for img in img_list] 41 | img_batch = torch.stack(img_batch, dim=0) 42 | return img_batch 43 | 44 | def transform_coords( 45 | self, coords: torch.Tensor, normalize=False, orig_hw=None 46 | ) -> torch.Tensor: 47 | """ 48 | Expects a torch tensor with length 2 in the last dimension. The coordinates can be in absolute image or normalized coordinates, 49 | If the coords are in absolute image coordinates, normalize should be set to True and original image size is required. 50 | 51 | Returns 52 | Un-normalized coordinates in the range of [0, 1] which is expected by the SAM2 model. 53 | """ 54 | if normalize: 55 | assert orig_hw is not None 56 | h, w = orig_hw 57 | coords = coords.clone() 58 | coords[..., 0] = coords[..., 0] / w 59 | coords[..., 1] = coords[..., 1] / h 60 | 61 | coords = coords * self.resolution # unnormalize coords 62 | return coords 63 | 64 | def transform_boxes( 65 | self, boxes: torch.Tensor, normalize=False, orig_hw=None 66 | ) -> torch.Tensor: 67 | """ 68 | Expects a tensor of shape Bx4. The coordinates can be in absolute image or normalized coordinates, 69 | if the coords are in absolute image coordinates, normalize should be set to True and original image size is required. 70 | """ 71 | boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw) 72 | return boxes 73 | 74 | def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor: 75 | """ 76 | Perform PostProcessing on output masks. 77 | """ 78 | from model.segment_anything_2.sam2.utils.misc import get_connected_components 79 | 80 | masks = masks.float() 81 | if self.max_hole_area > 0: 82 | # Holes are those connected components in background with area <= self.fill_hole_area 83 | # (background regions are those with mask scores <= self.mask_threshold) 84 | mask_flat = masks.flatten(0, 1).unsqueeze(1) # flatten as 1-channel image 85 | labels, areas = get_connected_components(mask_flat <= self.mask_threshold) 86 | is_hole = (labels > 0) & (areas <= self.max_hole_area) 87 | is_hole = is_hole.reshape_as(masks) 88 | # We fill holes with a small positive mask score (10.0) to change them to foreground. 89 | masks = torch.where(is_hole, self.mask_threshold + 10.0, masks) 90 | 91 | if self.max_sprinkle_area > 0: 92 | labels, areas = get_connected_components(mask_flat > self.mask_threshold) 93 | is_hole = (labels > 0) & (areas <= self.max_sprinkle_area) 94 | is_hole = is_hole.reshape_as(masks) 95 | # We fill holes with negative mask score (-10.0) to change them to background. 
96 | masks = torch.where(is_hole, self.mask_threshold - 10.0, masks) 97 | 98 | masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False) 99 | return masks 100 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2_configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /VideoGLaMM/model/segment_anything_2/sam2_configs/sam2_hiera_b+.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Model 4 | model: 5 | _target_: model.segment_anything_2.sam2.modeling.sam2_base.SAM2Base 6 | image_encoder: 7 | _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.ImageEncoder 8 | scalp: 1 9 | trunk: 10 | _target_: model.segment_anything_2.sam2.modeling.backbones.hieradet.Hiera 11 | embed_dim: 112 12 | num_heads: 2 13 | neck: 14 | _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.FpnNeck 15 | position_encoding: 16 | _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine 17 | num_pos_feats: 256 18 | normalize: true 19 | scale: null 20 | temperature: 10000 21 | d_model: 256 22 | backbone_channel_list: [896, 448, 224, 112] 23 | fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features 24 | fpn_interp_model: nearest 25 | 26 | memory_attention: 27 | _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttention 28 | d_model: 256 29 | pos_enc_at_input: true 30 | layer: 31 | _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttentionLayer 32 | activation: relu 33 | dim_feedforward: 2048 34 | dropout: 0.1 35 | pos_enc_at_attn: false 36 | self_attention: 37 | _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention 38 | rope_theta: 10000.0 39 | feat_sizes: [32, 32] 40 | embedding_dim: 256 41 | num_heads: 1 42 | downsample_rate: 1 43 | dropout: 0.1 44 | d_model: 256 45 | pos_enc_at_cross_attn_keys: true 46 | pos_enc_at_cross_attn_queries: false 47 | cross_attention: 48 | _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention 49 | rope_theta: 10000.0 50 | feat_sizes: [32, 32] 51 | rope_k_repeat: True 52 | embedding_dim: 256 53 | num_heads: 1 54 | downsample_rate: 1 55 | dropout: 0.1 56 | kv_in_dim: 64 57 | num_layers: 4 58 | 59 | memory_encoder: 60 | _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MemoryEncoder 61 | out_dim: 64 62 | position_encoding: 63 | _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine 64 | num_pos_feats: 64 65 | normalize: true 66 | scale: null 67 | temperature: 10000 68 | mask_downsampler: 69 | _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MaskDownSampler 70 | kernel_size: 3 71 | stride: 2 72 | padding: 1 73 | fuser: 74 | _target_: model.segment_anything_2.sam2.modeling.memory_encoder.Fuser 75 | layer: 76 | _target_: model.segment_anything_2.sam2.modeling.memory_encoder.CXBlock 77 | dim: 256 78 | kernel_size: 7 79 | padding: 3 80 | layer_scale_init_value: 1e-6 81 | use_dwconv: True # depth-wise convs 82 | num_layers: 2 83 | 84 | num_maskmem: 7 
85 |   image_size: 1024
86 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
87 |   sigmoid_scale_for_mem_enc: 20.0
88 |   sigmoid_bias_for_mem_enc: -10.0
89 |   use_mask_input_as_output_without_sam: true
90 |   # Memory
91 |   directly_add_no_mem_embed: true
92 |   # use high-resolution feature map in the SAM mask decoder
93 |   use_high_res_features_in_sam: true
94 |   # output 3 masks on the first click on initial conditioning frames
95 |   multimask_output_in_sam: true
96 |   # SAM heads
97 |   iou_prediction_use_sigmoid: True
98 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
99 |   use_obj_ptrs_in_encoder: true
100 |   add_tpos_enc_to_obj_ptrs: false
101 |   only_obj_ptrs_in_the_past_for_eval: true
102 |   # object occlusion prediction
103 |   pred_obj_scores: true
104 |   pred_obj_scores_mlp: true
105 |   fixed_no_obj_ptr: true
106 |   # multimask tracking settings
107 |   multimask_output_for_tracking: true
108 |   use_multimask_token_for_obj_ptr: true
109 |   multimask_min_pt_num: 0
110 |   multimask_max_pt_num: 1
111 |   use_mlp_for_obj_ptr_proj: true
112 |   # Compilation flag
113 |   compile_image_encoder: False
114 | 
--------------------------------------------------------------------------------
/VideoGLaMM/model/segment_anything_2/sam2_configs/sam2_hiera_l.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # Model
4 | model:
5 |   _target_: model.segment_anything_2.sam2.modeling.sam2_base.SAM2Base
6 |   image_encoder:
7 |     _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.ImageEncoder
8 |     scalp: 1
9 |     trunk:
10 |       _target_: model.segment_anything_2.sam2.modeling.backbones.hieradet.Hiera
11 |       embed_dim: 144
12 |       num_heads: 2
13 |       stages: [2, 6, 36, 4]
14 |       global_att_blocks: [23, 33, 43]
15 |       window_pos_embed_bkg_spatial_size: [7, 7]
16 |       window_spec: [8, 4, 16, 8]
17 |     neck:
18 |       _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.FpnNeck
19 |       position_encoding:
20 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
21 |         num_pos_feats: 256
22 |         normalize: true
23 |         scale: null
24 |         temperature: 10000
25 |       d_model: 256
26 |       backbone_channel_list: [1152, 576, 288, 144]
27 |       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
28 |       fpn_interp_model: nearest
29 | 
30 |   memory_attention:
31 |     _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttention
32 |     d_model: 256
33 |     pos_enc_at_input: true
34 |     layer:
35 |       _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttentionLayer
36 |       activation: relu
37 |       dim_feedforward: 2048
38 |       dropout: 0.1
39 |       pos_enc_at_attn: false
40 |       self_attention:
41 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
42 |         rope_theta: 10000.0
43 |         feat_sizes: [32, 32]
44 |         embedding_dim: 256
45 |         num_heads: 1
46 |         downsample_rate: 1
47 |         dropout: 0.1
48 |       d_model: 256
49 |       pos_enc_at_cross_attn_keys: true
50 |       pos_enc_at_cross_attn_queries: false
51 |       cross_attention:
52 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
53 |         rope_theta: 10000.0
54 |         feat_sizes: [32, 32]
55 |         rope_k_repeat: True
56 |         embedding_dim: 256
57 |         num_heads: 1
58 |         downsample_rate: 1
59 |         dropout: 0.1
60 |         kv_in_dim: 64
61 |     num_layers: 4
62 | 
63 |   memory_encoder:
64 |       _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MemoryEncoder
65 |       out_dim: 64
66 |       position_encoding:
67 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
68 |         num_pos_feats: 64
69 |         normalize: true
70 |         scale: null
71 |         temperature: 10000
72 |       mask_downsampler:
73 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MaskDownSampler
74 |         kernel_size: 3
75 |         stride: 2
76 |         padding: 1
77 |       fuser:
78 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.Fuser
79 |         layer:
80 |           _target_: model.segment_anything_2.sam2.modeling.memory_encoder.CXBlock
81 |           dim: 256
82 |           kernel_size: 7
83 |           padding: 3
84 |           layer_scale_init_value: 1e-6
85 |           use_dwconv: True # depth-wise convs
86 |         num_layers: 2
87 | 
88 |   num_maskmem: 7
89 |   image_size: 1024
90 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
91 |   sigmoid_scale_for_mem_enc: 20.0
92 |   sigmoid_bias_for_mem_enc: -10.0
93 |   use_mask_input_as_output_without_sam: true
94 |   # Memory
95 |   directly_add_no_mem_embed: true
96 |   # use high-resolution feature map in the SAM mask decoder
97 |   use_high_res_features_in_sam: true
98 |   # output 3 masks on the first click on initial conditioning frames
99 |   multimask_output_in_sam: true
100 |   # SAM heads
101 |   iou_prediction_use_sigmoid: True
102 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103 |   use_obj_ptrs_in_encoder: true
104 |   add_tpos_enc_to_obj_ptrs: false
105 |   only_obj_ptrs_in_the_past_for_eval: true
106 |   # object occlusion prediction
107 |   pred_obj_scores: true
108 |   pred_obj_scores_mlp: true
109 |   fixed_no_obj_ptr: true
110 |   # multimask tracking settings
111 |   multimask_output_for_tracking: true
112 |   use_multimask_token_for_obj_ptr: true
113 |   multimask_min_pt_num: 0
114 |   multimask_max_pt_num: 1
115 |   use_mlp_for_obj_ptr_proj: true
116 |   # Compilation flag
117 |   compile_image_encoder: False
118 | 
--------------------------------------------------------------------------------
/VideoGLaMM/model/segment_anything_2/sam2_configs/sam2_hiera_s.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # Model
4 | model:
5 |   _target_: model.segment_anything_2.sam2.modeling.sam2_base.SAM2Base
6 |   image_encoder:
7 |     _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.ImageEncoder
8 |     scalp: 1
9 |     trunk:
10 |       _target_: model.segment_anything_2.sam2.modeling.backbones.hieradet.Hiera
11 |       embed_dim: 96
12 |       num_heads: 1
13 |       stages: [1, 2, 11, 2]
14 |       global_att_blocks: [7, 10, 13]
15 |       window_pos_embed_bkg_spatial_size: [7, 7]
16 |     neck:
17 |       _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.FpnNeck
18 |       position_encoding:
19 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
20 |         num_pos_feats: 256
21 |         normalize: true
22 |         scale: null
23 |         temperature: 10000
24 |       d_model: 256
25 |       backbone_channel_list: [768, 384, 192, 96]
26 |       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27 |       fpn_interp_model: nearest
28 | 
29 |   memory_attention:
30 |     _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttention
31 |     d_model: 256
32 |     pos_enc_at_input: true
33 |     layer:
34 |       _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttentionLayer
35 |       activation: relu
36 |       dim_feedforward: 2048
37 |       dropout: 0.1
38 |       pos_enc_at_attn: false
39 |       self_attention:
40 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
41 |         rope_theta: 10000.0
42 |         feat_sizes: [32, 32]
43 |         embedding_dim: 256
44 |         num_heads: 1
45 |         downsample_rate: 1
46 |         dropout: 0.1
47 |       d_model: 256
48 |       pos_enc_at_cross_attn_keys: true
49 |       pos_enc_at_cross_attn_queries: false
50 |       cross_attention:
51 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
52 |         rope_theta: 10000.0
53 |         feat_sizes: [32, 32]
54 |         rope_k_repeat: True
55 |         embedding_dim: 256
56 |         num_heads: 1
57 |         downsample_rate: 1
58 |         dropout: 0.1
59 |         kv_in_dim: 64
60 |     num_layers: 4
61 | 
62 |   memory_encoder:
63 |       _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MemoryEncoder
64 |       out_dim: 64
65 |       position_encoding:
66 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
67 |         num_pos_feats: 64
68 |         normalize: true
69 |         scale: null
70 |         temperature: 10000
71 |       mask_downsampler:
72 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MaskDownSampler
73 |         kernel_size: 3
74 |         stride: 2
75 |         padding: 1
76 |       fuser:
77 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.Fuser
78 |         layer:
79 |           _target_: model.segment_anything_2.sam2.modeling.memory_encoder.CXBlock
80 |           dim: 256
81 |           kernel_size: 7
82 |           padding: 3
83 |           layer_scale_init_value: 1e-6
84 |           use_dwconv: True # depth-wise convs
85 |         num_layers: 2
86 | 
87 |   num_maskmem: 7
88 |   image_size: 1024
89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90 |   sigmoid_scale_for_mem_enc: 20.0
91 |   sigmoid_bias_for_mem_enc: -10.0
92 |   use_mask_input_as_output_without_sam: true
93 |   # Memory
94 |   directly_add_no_mem_embed: true
95 |   # use high-resolution feature map in the SAM mask decoder
96 |   use_high_res_features_in_sam: true
97 |   # output 3 masks on the first click on initial conditioning frames
98 |   multimask_output_in_sam: true
99 |   # SAM heads
100 |   iou_prediction_use_sigmoid: True
101 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
102 |   use_obj_ptrs_in_encoder: true
103 |   add_tpos_enc_to_obj_ptrs: false
104 |   only_obj_ptrs_in_the_past_for_eval: true
105 |   # object occlusion prediction
106 |   pred_obj_scores: true
107 |   pred_obj_scores_mlp: true
108 |   fixed_no_obj_ptr: true
109 |   # multimask tracking settings
110 |   multimask_output_for_tracking: true
111 |   use_multimask_token_for_obj_ptr: true
112 |   multimask_min_pt_num: 0
113 |   multimask_max_pt_num: 1
114 |   use_mlp_for_obj_ptr_proj: true
115 |   # Compilation flag
116 |   compile_image_encoder: False
117 | 
--------------------------------------------------------------------------------
/VideoGLaMM/model/segment_anything_2/sam2_configs/sam2_hiera_t.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # Model
4 | model:
5 |   _target_: model.segment_anything_2.sam2.modeling.sam2_base.SAM2Base
6 |   image_encoder:
7 |     _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.ImageEncoder
8 |     scalp: 1
9 |     trunk:
10 |       _target_: model.segment_anything_2.sam2.modeling.backbones.hieradet.Hiera
11 |       embed_dim: 96
12 |       num_heads: 1
13 |       stages: [1, 2, 7, 2]
14 |       global_att_blocks: [5, 7, 9]
15 |       window_pos_embed_bkg_spatial_size: [7, 7]
16 |     neck:
17 |       _target_: model.segment_anything_2.sam2.modeling.backbones.image_encoder.FpnNeck
18 |       position_encoding:
19 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
20 |         num_pos_feats: 256
21 |         normalize: true
22 |         scale: null
23 |         temperature: 10000
24 |       d_model: 256
25 |       backbone_channel_list: [768, 384, 192, 96]
26 |       fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27 |       fpn_interp_model: nearest
28 | 
29 |   memory_attention:
30 |     _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttention
31 |     d_model: 256
32 |     pos_enc_at_input: true
33 |     layer:
34 |       _target_: model.segment_anything_2.sam2.modeling.memory_attention.MemoryAttentionLayer
35 |       activation: relu
36 |       dim_feedforward: 2048
37 |       dropout: 0.1
38 |       pos_enc_at_attn: false
39 |       self_attention:
40 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
41 |         rope_theta: 10000.0
42 |         feat_sizes: [32, 32]
43 |         embedding_dim: 256
44 |         num_heads: 1
45 |         downsample_rate: 1
46 |         dropout: 0.1
47 |       d_model: 256
48 |       pos_enc_at_cross_attn_keys: true
49 |       pos_enc_at_cross_attn_queries: false
50 |       cross_attention:
51 |         _target_: model.segment_anything_2.sam2.modeling.sam.transformer.RoPEAttention
52 |         rope_theta: 10000.0
53 |         feat_sizes: [32, 32]
54 |         rope_k_repeat: True
55 |         embedding_dim: 256
56 |         num_heads: 1
57 |         downsample_rate: 1
58 |         dropout: 0.1
59 |         kv_in_dim: 64
60 |     num_layers: 4
61 | 
62 |   memory_encoder:
63 |       _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MemoryEncoder
64 |       out_dim: 64
65 |       position_encoding:
66 |         _target_: model.segment_anything_2.sam2.modeling.position_encoding.PositionEmbeddingSine
67 |         num_pos_feats: 64
68 |         normalize: true
69 |         scale: null
70 |         temperature: 10000
71 |       mask_downsampler:
72 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.MaskDownSampler
73 |         kernel_size: 3
74 |         stride: 2
75 |         padding: 1
76 |       fuser:
77 |         _target_: model.segment_anything_2.sam2.modeling.memory_encoder.Fuser
78 |         layer:
79 |           _target_: model.segment_anything_2.sam2.modeling.memory_encoder.CXBlock
80 |           dim: 256
81 |           kernel_size: 7
82 |           padding: 3
83 |           layer_scale_init_value: 1e-6
84 |           use_dwconv: True # depth-wise convs
85 |         num_layers: 2
86 | 
87 |   num_maskmem: 7
88 |   image_size: 1024
89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90 |   # SAM decoder
91 |   sigmoid_scale_for_mem_enc: 20.0
92 |   sigmoid_bias_for_mem_enc: -10.0
93 |   use_mask_input_as_output_without_sam: true
94 |   # Memory
95 |   directly_add_no_mem_embed: true
96 |   # use high-resolution feature map in the SAM mask decoder
97 |   use_high_res_features_in_sam: true
98 |   # output 3 masks on the first click on initial conditioning frames
99 |   multimask_output_in_sam: true
100 |   # SAM heads
101 |   iou_prediction_use_sigmoid: True
102 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103 |   use_obj_ptrs_in_encoder: true
104 |   add_tpos_enc_to_obj_ptrs: false
105 |   only_obj_ptrs_in_the_past_for_eval: true
106 |   # object occlusion prediction
107 |   pred_obj_scores: true
108 |   pred_obj_scores_mlp: true
109 |   fixed_no_obj_ptr: true
110 |   # multimask tracking settings
111 |   multimask_output_for_tracking: true
112 |   use_multimask_token_for_obj_ptr: true
113 |   multimask_min_pt_num: 0
114 |   multimask_max_pt_num: 1
115 |   use_mlp_for_obj_ptr_proj: true
116 |   # Compilation flag
117 |   # HieraT does not currently support compilation, should always be set to False
118 |   compile_image_encoder: False
119 | 
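The four `sam2_hiera_*.yaml` configs above differ only in the Hiera trunk (its `embed_dim`, `num_heads`, `stages`, global-attention blocks, and window sizes) and the matching FPN `backbone_channel_list`; the memory attention, memory encoder, and SAM-head settings are identical across variants. Each `_target_` entry is a Hydra target, so building a model is recursive instantiation of the `model` node. The sketch below illustrates that flow only; the config-module path, checkpoint filename, and `"model"` checkpoint key are assumptions, and in practice the builder under `model/segment_anything_2/sam2/build_sam.py` should be preferred.

```python
# Illustrative sketch (not the repo's own API): resolve one of the sam2_configs
# YAML files into a SAM2Base instance via Hydra. The config-module path and
# checkpoint name below are assumptions for demonstration.
import torch
from hydra import compose, initialize_config_module
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Make the sam2_configs package (it ships an __init__.py) visible to Hydra.
with initialize_config_module(
    config_module="model.segment_anything_2.sam2_configs", version_base="1.2"
):
    cfg = compose(config_name="sam2_hiera_t.yaml")  # tiny variant as an example

OmegaConf.resolve(cfg)
model = instantiate(cfg.model, _recursive_=True)  # builds SAM2Base and all submodules

# Optionally load pretrained weights (checkpoint path/key are hypothetical here).
state = torch.load("checkpoints/sam2_hiera_tiny.pt", map_location="cpu")
model.load_state_dict(state["model"], strict=False)
model.eval()
```

Because Hydra passes every sibling key of `_target_` straight to the named class as a keyword argument, each YAML block mirrors the constructor signature of the class it instantiates (e.g. `scalp: 1` becomes `ImageEncoder(scalp=1, ...)`).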
--------------------------------------------------------------------------------
/VideoGLaMM/model/segment_anything_2/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from setuptools import find_packages, setup
8 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
9 | 
10 | def get_extensions():
11 |     srcs = ["sam2/csrc/connected_components.cu"]
12 |     compile_args = {
13 |         "cxx": [],
14 |         "nvcc": [
15 |             "-DCUDA_HAS_FP16=1",
16 |             "-D__CUDA_NO_HALF_OPERATORS__",
17 |             "-D__CUDA_NO_HALF_CONVERSIONS__",
18 |             "-D__CUDA_NO_HALF2_OPERATORS__",
19 |         ],
20 |     }
21 |     ext_modules = [CUDAExtension("sam2._C", srcs, extra_compile_args=compile_args)]
22 |     return ext_modules
23 | 
24 | 
25 | # Setup configuration
26 | setup(
27 |     ext_modules=get_extensions(),
28 |     cmdclass={"build_ext": BuildExtension.with_options(no_python_abi_suffix=True)},
29 | )
30 | 
--------------------------------------------------------------------------------
/VideoGLaMM/model/videogpt_plus/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import VideoGPTPlusPhi3ForCausalLM
2 | from .model import VideoGPTPlusLlamaForCausalLM
3 | 
--------------------------------------------------------------------------------
/VideoGLaMM/model/videogpt_plus/constants.py:
--------------------------------------------------------------------------------
1 | import os
2 | from distutils.util import strtobool
3 | 
4 | # Configuration Constants
5 | # TODO: Change the chunk size if you use any other video encoder accordingly
6 | CHUNK_SIZE = 4  # Video chunk size for InternVideo2-Stage2_1B-224p-f4 which is trained using 4 frames per video
7 | NUM_FRAMES = int(os.environ.get("NUM_FRAMES", 16))  # Number of video frames (if using video)
8 | NUM_CONTEXT_IMAGES = int(os.environ.get("NUM_CONTEXT_IMAGES", 16))  # Number of context images for video
9 | 
10 | # Model Constants
11 | IGNORE_INDEX = -100
12 | IMAGE_TOKEN_INDEX = -200
13 | DEFAULT_IMAGE_TOKEN = "<image>"
14 | DEFAULT_VIDEO_TOKEN = "