├── .gitattributes ├── .github ├── issue_template.md ├── pull_request_template.md └── workflows │ └── black.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── conftest.py ├── docs ├── OtterHD.md ├── benchmark_eval.md ├── credits.md ├── huggingface_compatible.md ├── mimicit_format.md └── server_host.md ├── environment.yml ├── mimic-it ├── README.md ├── convert-it │ ├── README.md │ ├── __init__.py │ ├── abstract_dataset.py │ ├── datasets │ │ ├── 2d.py │ │ ├── 3d.py │ │ ├── __init__.py │ │ ├── change.py │ │ ├── fpv.py │ │ ├── utils │ │ │ ├── scene_navigation_utils.py │ │ │ └── visual_story_telling_utils.py │ │ └── video.py │ ├── image_utils.py │ └── main.py └── syphus │ ├── abstract_dataset.py │ ├── datasets │ ├── 3d.py │ ├── __init__.py │ ├── change.py │ ├── fpv.py │ ├── funqa.py │ ├── translate.py │ └── video.py │ ├── file_utils.py │ ├── main.py │ └── prompts │ ├── coco_spot_the_difference_prompt.py │ ├── dense_captions.json │ ├── ego4d.json │ ├── funqa_dia.json │ ├── funqa_mcqa.json │ ├── funqa_translation.json │ ├── scene_navigation.json │ ├── spot_the_difference.json │ ├── translation_prompt.py │ ├── tv_captions.json │ └── visual_story_telling.json ├── pipeline ├── accelerate_configs │ ├── accelerate_config_ddp.yaml │ ├── accelerate_config_fsdp.yaml │ ├── accelerate_config_zero1.yaml │ ├── accelerate_config_zero2.yaml │ ├── accelerate_config_zero2_slurm.yaml │ ├── accelerate_config_zero3.yaml │ ├── accelerate_config_zero3_offload.yaml │ ├── accelerate_config_zero3_slurm.yaml │ └── ds_zero3_config.json ├── benchmarks │ ├── .gitignore │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── base_eval_dataset.py │ │ ├── magnifierbench.py │ │ ├── mathvista.py │ │ ├── mmbench.py │ │ ├── mme.py │ │ ├── mmvet.py │ │ ├── pope.py │ │ ├── scienceqa.py │ │ └── seedbench.py │ ├── evaluate.py │ ├── models │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── frozen_bilm.py │ │ ├── fuyu.py │ │ ├── gpt4v.py │ │ ├── idefics.py │ │ ├── instructblip.py │ │ ├── llama_adapter.py │ │ ├── llava_model.py │ │ ├── mplug_owl.py │ │ ├── otter_image.py │ │ ├── otter_video.py │ │ ├── otterhd.py │ │ ├── qwen_vl.py │ │ ├── video_chat.py │ │ └── video_chatgpt.py │ └── public_datasets_suite │ │ ├── README.md │ │ ├── __init__.py │ │ ├── classification_utils.py │ │ ├── coco_metric.py │ │ ├── config.yaml │ │ ├── eval_datasets.py │ │ ├── eval_model.py │ │ ├── evaluate.py │ │ ├── get_args.ipynb │ │ ├── models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── idefics.py │ │ ├── open_flamingo.py │ │ ├── otter.py │ │ └── utils.py │ │ ├── ok_vqa_utils.py │ │ ├── run_eval_coco_idefics.sh │ │ ├── run_eval_coco_otter.sh │ │ ├── run_eval_otter.sh │ │ ├── run_eval_otter_slurm.sh │ │ └── vqa_metric.py ├── demos │ ├── __init__.py │ ├── demo_models.py │ ├── demo_utils.py │ ├── inference.py │ └── interactive │ │ ├── otter_image.ipynb │ │ ├── otter_image.py │ │ ├── otter_image_incontext.py │ │ ├── otter_video.ipynb │ │ └── otter_video.py ├── mimicit_utils │ ├── data.py │ ├── mimicit_dataset.py │ └── transforms.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── conversation.py │ ├── deploy │ │ ├── conversation.py │ │ ├── deploy.py │ │ ├── otterhd_endpoint.py │ │ └── utils.py │ ├── gradio_css.py │ ├── gradio_patch.py │ ├── gradio_web_server.py │ ├── gradio_web_server_video.py │ ├── model_worker.py │ ├── multiplex_script │ │ └── otter_image_server.py │ ├── register_worker.py │ ├── serving_utils.py │ └── test_message.py ├── train │ ├── .gitignore │ ├── __init__.py │ ├── config.yaml │ ├── distributed.py │ ├── 
instruction_following.py │ ├── pretraining.py │ ├── pretraining_cc3m.py │ ├── train_args.py │ └── train_utils.py └── utils │ ├── __init__.py │ ├── apply_delta.py │ ├── convert_laion400m-tsv_to_laion400m-tar_mp_shard.py │ ├── convert_mmc4_to_wds.py │ ├── convert_to_parquet.py │ ├── general.py │ ├── make_a_train.py │ └── modeling_value_head.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.py ├── shared_scripts ├── Demo_Data.yaml ├── Demo_OtterHD.sh ├── Demo_OtterMPT.sh └── Otter_MPT7B_Train_Decoder.json ├── src └── otter_ai │ ├── __init__.py │ └── models │ ├── __init__.py │ ├── falcon │ ├── __init__.py │ ├── configuration_RW.py │ └── modelling_RW.py │ ├── flamingo │ ├── __init__.py │ ├── config.json │ ├── configuration_flamingo.py │ ├── converting_flamingo_to_bf16.py │ ├── converting_flamingo_to_hf.py │ ├── converting_flamingo_to_lora.py │ ├── flamingo-falcon-7B.json │ ├── flamingo-llama2-chat-13B.json │ ├── flamingo-llama2-chat-7B.json │ ├── flamingo-mpt-1B-redpajama.json │ ├── flamingo-mpt-30B-bf16.json │ ├── flamingo-mpt-30B.json │ ├── flamingo-mpt-7B.json │ ├── flamingo-vicuna-33B-v1.3.json │ ├── flamingo-vicuna-7B-v1.3.json │ ├── injecting_falcon_into_flamingo.py │ ├── injecting_llama2_into_flamingo.py │ ├── injecting_mpt-1B-redpajama_into_flamingo.py │ ├── injecting_mpt_into_flamingo.py │ ├── injecting_vicuna_into_flamingo.py │ ├── modeling_flamingo.py │ ├── utils.py │ └── utils │ │ ├── converting_flamingo_to_bf16.py │ │ ├── converting_flamingo_to_hf.py │ │ ├── converting_flamingo_to_lora.py │ │ ├── flamingo-falcon-7B.json │ │ ├── flamingo-llama2-chat-13B.json │ │ ├── flamingo-llama2-chat-7B.json │ │ ├── flamingo-mpt-1B-redpajama.json │ │ ├── flamingo-mpt-30B-bf16.json │ │ ├── flamingo-mpt-30B.json │ │ ├── flamingo-mpt-7B.json │ │ ├── flamingo-vicuna-33B-v1.3.json │ │ ├── flamingo-vicuna-7B-v1.3.json │ │ ├── injecting_falcon_into_flamingo.py │ │ ├── injecting_llama2_into_flamingo.py │ │ ├── injecting_mpt-1B-redpajama_into_flamingo.py │ │ ├── injecting_mpt_into_flamingo.py │ │ └── injecting_vicuna_into_flamingo.py │ ├── fuyu │ ├── modeling_fuyu.py │ ├── modeling_persimmon.py │ └── processing_fuyu.py │ ├── mpt │ ├── __init__.py │ ├── adapt_tokenizer.py │ ├── attention.py │ ├── blocks.py │ ├── configuration_mpt.py │ ├── custom_embedding.py │ ├── flash_attn_triton.py │ ├── hf_prefixlm_converter.py │ ├── meta_init_context.py │ ├── modeling_mpt.py │ ├── norm.py │ └── param_init_fns.py │ ├── mpt_redpajama │ ├── __init__.py │ ├── attention.py │ ├── configuration_mosaic_gpt.py │ ├── gpt_blocks.py │ ├── low_precision_layernorm.py │ ├── mosaic_gpt.py │ └── param_init_fns.py │ └── otter │ ├── Otter-MPT7B-config.json │ ├── Otter-MPT7B-config.json~0d12192f665f5e9da1ecb2f23d6a360eb7753771 │ ├── Otter-MPT7B-config.json~HEAD │ ├── __init__.py │ ├── config.json │ ├── configuration_otter.py │ ├── converting_flamingo_to_otter.py │ ├── converting_otter_fp32_to_fp16.py │ ├── converting_otter_pt_to_hf.py │ ├── converting_otter_to_lora.py │ ├── flamingo_pt2otter_hf.py │ ├── modeling_otter.py │ └── utils │ ├── Otter-MPT7B-config.json │ ├── config.json │ ├── converting_flamingo_to_otter.py │ ├── converting_otter_fp32_to_fp16.py │ ├── converting_otter_pt_to_hf.py │ ├── converting_otter_to_lora.py │ └── flamingo_pt2otter_hf.py ├── unit_tests ├── __init__.py ├── test_mmc4_dataset.py └── test_prerun.py └── xformers_model ├── __init__.py ├── clip.py └── llama.py /.gitattributes: -------------------------------------------------------------------------------- 1 | 
example_unified_data/negative_sample/all_captions.txt filter=lfs diff=lfs merge=lfs -text 2 | LAVIS/coco-caption/annotations/captions_val2014.json filter=lfs diff=lfs merge=lfs -text 3 | LAVIS/coco-caption/annotations/caption_flickr30k.json filter=lfs diff=lfs merge=lfs -text -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | Before you open an issue, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open an issue, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | - [ ] Assign an issue type tag (label): 8 | - `dataset` (mimic-it download, usage, etc.), 9 | - `demo` (online demo), `doc` (readme, wiki, paper, video etc.), 10 | - `evaluation` (evaluation result, performance of Otter etc.), 11 | - `model` (model configuration, components, etc.), 12 | - `train` (training configuration, process, code, etc.) 13 | 14 | Thank you for your contributions! 15 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Before you open a pull-request, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open a pull-request, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | - [ ] Assign an issue type tag (label): 8 | - `dataset` (mimic-it download, usage, etc.), 9 | - `demo` (online demo), 10 | - `doc` (readme, wiki, paper, video, etc.), 11 | - `evaluation` (evaluation result, performance of Otter, etc.), 12 | - `model` (model configuration, components, etc.), 13 | - `train` (training configuration, process, code, etc.) 14 | 15 | Thank you for your contributions! 
16 | -------------------------------------------------------------------------------- /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: psf/black@stable -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *.py[cod] 3 | __pycache__/ 4 | *$py.class 5 | *.sage.py 6 | 7 | # Special and Backup files 8 | *.bak 9 | *.log 10 | *.tsv 11 | *.gz 12 | *.zip 13 | *.dat 14 | *.dir 15 | *.html 16 | *.mp4 17 | *.MP4 18 | *.png 19 | *.pt 20 | *.bin 21 | *.mo 22 | *.pot 23 | *.manifest 24 | *.spec 25 | *.egg 26 | *.so 27 | checkpoint 28 | LICENSE 29 | 30 | # Editor and IDE configurations 31 | .vscode/ 32 | .spyderproject 33 | .spyproject 34 | .ropeproject 35 | 36 | # Packaging and Distribution 37 | *.egg-info/ 38 | dist/ 39 | build/ 40 | develop-eggs/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib/ 45 | lib64/ 46 | sdist/ 47 | wheels/ 48 | parts/ 49 | share/python-wheels/ 50 | .installed.cfg 51 | MANIFEST 52 | 53 | # Unit Test and Coverage 54 | .coverage* 55 | .cache 56 | *.cover 57 | *.py,cover 58 | .htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | mypy_cache/ 65 | .dmypy.json 66 | dmypy.json 67 | .pyre/ 68 | .pytype/ 69 | cython_debug/ 70 | */__pycache__/** 71 | */.pytest_cache/** 72 | */.mypy_cache/** 73 | */.cache/** 74 | */.coverage.* 75 | */.c 76 | 77 | # Environments and Dependencies 78 | .env 79 | .venv 80 | .env.bak/ 81 | .venv.bak/ 82 | env/ 83 | venv/ 84 | ENV/ 85 | pip-log.txt 86 | pip-delete-this-directory.txt 87 | Pipfile.lock 88 | poetry.lock 89 | __pypackages__/ 90 | 91 | # Web Frameworks 92 | local_settings.py 93 | db.sqlite3 94 | db.sqlite3-journal 95 | instance/ 96 | .webassets-cache 97 | .scrapy 98 | .site 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # Documentation and Notebooks 103 | docs/_build/ 104 | .ipynb_checkpoints 105 | profile_default/ 106 | ipython_config.py 107 | 108 | # Project specific 109 | output/ 110 | debug*/ 111 | wandb/ 112 | archived/ 113 | amlt/ 114 | scripts/ 115 | nginx/ 116 | logs/ 117 | ofa_compress/ 118 | train_*.sh 119 | gpt_playground/ 120 | data/ 121 | azure/ 122 | .deepspeed_env 123 | checkpoints/ 124 | pipeline/serve/examples/ 125 | mimic-it/syphus/annotations/** 126 | tools/ 127 | otter9B-mpt7b-0705/ 128 | tokenizer_checklist.chk 129 | tokenizer.model 130 | download.sh 131 | USE_POLICY.md 132 | 133 | # Miscellaneous 134 | *.pyc 135 | open_flamingo.egg-info 136 | llama-7b-hf/* 137 | cache/ 138 | *.code-workspace 139 | pipeline/benchmarks/ckpts/* 140 | pipeline/benchmarks/models/Ask_Anything 141 | pipeline/benchmarks/models/FrozenBiLM 142 | pipeline/benchmarks/models/LLaMA_Adapter 143 | 144 | shared_scripts/gcp_instance/** 145 | shared_scripts/shai_instance/** 146 | pipeline/benchmarks/models/Video_ChatGPT 147 | pipeline/benchmarks/models/LLaVA 148 | pipeline/benchmarks/models/llava 149 | pipeline/benchmarks/evaluation_result 150 | pipeline/serve/user_logs 151 | config.yaml 152 | 153 | azure_storage/ 154 | checkpoints/ 155 | *.ttf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 4 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--yaml-path", 7 | action="store", 8 | default="default_yaml_path.yaml", 9 | help="Path to the YAML file", 10 | ) 11 | 12 | 13 | @pytest.fixture 14 | def yaml_path(request): 15 | return request.config.getoption("--yaml-path") 16 | -------------------------------------------------------------------------------- /docs/benchmark_eval.md: -------------------------------------------------------------------------------- 1 | # Welcome to the benchmark evaluation page! 2 | 3 | The evaluation pipeline is designed to be one-click and easy to use. However, you may encounter some problems when running models (e.g., LLaVA, LLaMA-Adapter) that require you to clone their repos to a local path. Please feel free to contact us if you have any questions. 4 | 5 | We support the following benchmarks: 6 | - MagnifierBench 7 | - MMBench 8 | - MM-VET 9 | - MathVista 10 | - POPE 11 | - MME 12 | - ScienceQA 13 | - SeedBench 14 | 15 | And the following models: 16 | - LLaVA 17 | - Fuyu 18 | - OtterHD 19 | - Otter-Image 20 | - Otter-Video 21 | - Idefics 22 | - LLaMA-Adapter 23 | - Qwen-VL 24 | 25 | Many more are supported; see `/pipeline/benchmarks/models`: 26 | 27 | https://github.com/Luodian/Otter/tree/main/pipeline/benchmarks/models 28 | 29 | Create a YAML file `benchmark.yaml` with the content below: 30 | ```yaml 31 | datasets: 32 | - name: magnifierbench 33 | split: test 34 | data_path: Otter-AI/MagnifierBench 35 | prompt: Answer with the option letter from the given choices directly.
36 | api_key: [You GPT-4 API] 37 | - name: mme 38 | split: test 39 | - name: pope 40 | split: test 41 | default_output_path: ./logs 42 | - name: mmvet 43 | split: test 44 | api_key: [You GPT-4 API] 45 | gpt_model: gpt-4-0613 46 | - name: mathvista 47 | split: test 48 | api_key: [You GPT-4 API] 49 | gpt_model: gpt-4-0613 50 | - name: mmbench 51 | split: test 52 | models: 53 | - name: fuyu 54 | model_path: adept/fuyu-8b 55 | ``` 56 | 57 | Then run 58 | 59 | ```python 60 | python -m pipeline.benchmarks.evaluate --confg benchmark.yaml 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/credits.md: -------------------------------------------------------------------------------- 1 | # Credits 🌟 2 | 3 | The Otter/MIMIC-IT/OtterHD project is a collaborative effort involving many distinguished contributors. Below are their contributions across various aspects of the project. 4 | 5 | ## Otter/MIMIC-IT 🚀 6 | - **[Otter Team Lead/Model/Training]**: [Bo Li](https://brianboli.com/) 🧠💻, [Yuanhan Zhang](https://zhangyuanhan-ai.github.io/) 🧠💻 7 | - **[MIMIC-IT Dataset Construction]**: [Liangyu Chen](https://cliangyu.com/) 📊, [Jinghao Wang](https://king159.github.io/) 📊🛠️, [Fanyi Pu](https://pufanyi.github.io/) 📊🛠️, [Jingkang Yang](https://jingkang50.github.io/) 📝 8 | - **[Training Dataset Maintenance]**: [Fanyi Pu](https://pufanyi.github.io/) 📚, [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🛠️ 9 | - **[Evaluation Suite Maintenance]**: [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🖥️🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 10 | 11 | ## OtterHD 🎥 12 | - **[OtterHD Model/Training/Optimization]**: [Bo Li](https://brianboli.com/) 🧠💻, [Peiyuan Zhang](https://www.linkedin.com/in/lance-peiyuan-zhang-5b2886194/?originalSubdomain=sg) 🧠⚙️ 13 | - **[Magnifier Dataset Construction]**: [Jingkang Yang](https://jingkang50.github.io/) 🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 14 | - **[Evaluation Suite Maintenance]**: [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🖥️🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 15 | 16 | ## Academic/Directional Guidance 📚 17 | - [Ziwei Liu](https://liuziwei7.github.io/) 👨‍🏫, [Chunyuan Li](https://chunyuan.li/) 👨‍🏫. 18 | 19 | --- 20 | 21 | Emojis to symbolize the type of contribution (e.g., brain for leadership, tools for construction, books for maintenance, etc.), and should make the document more visually appealing and enjoyable to read. Remember to check for any potential accessibility issues that might arise from using emojis, as some readers might rely on screen readers or other assistive technologies. 22 | -------------------------------------------------------------------------------- /docs/huggingface_compatible.md: -------------------------------------------------------------------------------- 1 | ## 🤗 Hugging Face Model 2 | 3 | You can use the 🦩 Flamingo model / 🦦 Otter model as a 🤗 Hugging Face model with only a few lines! One-click and then model configs/weights are downloaded automatically. 
4 | 5 | ``` python 6 | from flamingo import FlamingoModel 7 | flamingo_model = FlamingoModel.from_pretrained("luodian/openflamingo-9b-hf", device_map="auto") 8 | 9 | from otter import OtterModel 10 | otter_model = OtterModel.from_pretrained("luodian/otter-9b-hf", device_map="auto") 11 | ``` 12 | 13 | The original [OpenFlamingo](https://github.com/mlfoundations/open_flamingo) was developed with [DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) (DDP) on an A100 cluster. Loading OpenFlamingo-9B onto a GPU requires **at least 33G of GPU memory**, which is only available on A100 GPUs. 14 | 15 | In order to allow more researchers without access to A100 machines to try training OpenFlamingo, we wrap the OpenFlamingo model into a 🤗 Hugging Face model ([Jinghao](https://king159.github.io/) has submitted a [PR](https://github.com/huggingface/transformers/pull/23063) to huggingface/transformers!). Via `device_map="auto"`, the large model is sharded across multiple GPUs when loading and training. This can help researchers who do not have access to A100-80G GPUs achieve similar training throughput; we tested training on 4x RTX-3090-24G GPUs and model deployment on 2x RTX-3090-24G GPUs. Specific details are below (they may vary depending on CPU and disk performance, as we conducted training on different machines). 16 | 17 |
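Below is a minimal sketch (not part of the original docs) of how to cap per-GPU memory and inspect the resulting placement when loading with `device_map="auto"`. The `max_memory` argument and the `hf_device_map` attribute are standard 🤗 Transformers/Accelerate features; the `OtterModel` import and the memory figures are assumptions based on the example above.

``` python
import torch
from otter import OtterModel  # assumed importable, as in the example above

# Shard the checkpoint across the visible GPUs, capping each one at ~22GiB
# (e.g. two RTX-3090-24G cards) and spilling any remainder to CPU RAM.
otter_model = OtterModel.from_pretrained(
    "luodian/otter-9b-hf",
    device_map="auto",
    max_memory={0: "22GiB", 1: "22GiB", "cpu": "64GiB"},
    torch_dtype=torch.bfloat16,  # roughly halves memory versus fp32
)

# Inspect which device (GPU index, "cpu", or "disk") each submodule landed on.
print(otter_model.hf_device_map)
```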
18 | 19 |
20 | 21 | 25 | 26 | Our Otter model is also developed in this way and it's deployed on the 🤗 Hugging Face model hub. Our model can be hosted on two RTX-3090-24G GPUs and achieve a similar speed to one A100-80G machine. -------------------------------------------------------------------------------- /docs/server_host.md: -------------------------------------------------------------------------------- 1 | ## 🪩 Serving Demo 2 | 3 | We will show you how to host a demo on your own computer using gradio. 4 | 5 | ## Preparation 6 | 7 | ### Warnings: Newest `gradio` and `gradio_client` versions may cause errors ❗❗❗ 8 | 9 | Please keep the packages fixed with the following versions (my local verified model serving environment). 10 | ``` 11 | braceexpand==0.1.7 12 | einops==0.7.0 13 | fastapi==0.104.1 14 | gradio==4.7.1 15 | horovod==0.27.0 16 | huggingface_hub==0.14.0 17 | ijson==3.2.3 18 | importlib_metadata==6.6.0 19 | inflection==0.5.1 20 | markdown2==2.4.8 21 | natsort==8.4.0 22 | nltk==3.8.1 23 | numpy==1.26.2 24 | openai==1.3.7 25 | opencv_python==4.8.1.78 26 | opencv_python_headless==4.8.1.78 27 | orjson==3.9.10 28 | packaging==23.2 29 | Pillow==10.1.0 30 | pycocoevalcap==1.2 31 | pycocotools==2.0.7 32 | Requests==2.31.0 33 | tqdm==4.65.0 34 | transformers==4.35.0 35 | uvicorn==0.24.0.post1 36 | webdataset==0.2.79 37 | ``` 38 | 39 | ### Download the checkpoints 40 | 41 | The 🦦 Otter checkpoint and the 🦩 Open Flamingo checkpoint can be auto-downloaded with the code below. 42 | 43 | ## Start Demo 44 | 45 | ### Launch a controller 46 | 47 | ```Shell 48 | python -m pipeline.serve.controller --host 0.0.0.0 --port 10000 49 | ``` 50 | 51 | ### Launch a model worker 52 | 53 | ```Shell 54 | # Init our 🦦 Otter model on GPU 55 | CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model_name otter --checkpoint_path luodian/otter-9b-hf --num_gpus 2 --limit_model_concurrency 200 56 | # Init our 🦦 Otter video model on CPU 57 | CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40002 --worker http://localhost:40002 --model_name otter_video --checkpoint_path checkpoint/otter9B_DC_fullset_16frames/ --num_gpus 2 --limit_model_concurrency 200 --load_bit 16 58 | # Init original open flamingo model on GPU 59 | CUDA_VISIBLE_DEVICES=2,3 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model_name open_flamingo --checkpoint_path luodian/openflamingo-9b-hf --num_gpus 2 --limit_model_concurrency 200 60 | 61 | # Init original open flamingo model on CPU 62 | python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model_name open_flamingo_original --checkpoint_path luodian/openflamingo-9b-hf --num_gpus 0 63 | ``` 64 | 65 | Wait until the process finishes loading the model and you see "Uvicorn running on ...". 66 | 67 | ### Launch a gradio web server 68 | 69 | ```Shell 70 | # Image demo 71 | python -m pipeline.serve.gradio_web_server --controller http://localhost:10000 --port 7861 72 | # Video demo 73 | python -m pipeline.serve.gradio_web_server_video --controller http://localhost:10000 --port 7862 74 | ``` 75 | 76 | Now, you can open your browser and chat with the model! 
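For convenience, the three launch steps above can also be chained from a single Python script. The sketch below is not part of the repo: it simply reuses the commands, ports, and checkpoint path from the examples above, and the sleep durations are rough placeholders; watch the logs for "Uvicorn running on ..." before connecting.

``` python
import subprocess
import time

# Launch the controller, one 🦦 Otter image worker, and the image web demo,
# using the same commands and ports documented above.
controller = subprocess.Popen(
    "python -m pipeline.serve.controller --host 0.0.0.0 --port 10000", shell=True
)
time.sleep(10)  # give the controller a moment to start

worker = subprocess.Popen(
    "CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker "
    "--controller http://localhost:10000 --port 40000 --worker http://localhost:40000 "
    "--model_name otter --checkpoint_path luodian/otter-9b-hf "
    "--num_gpus 2 --limit_model_concurrency 200",
    shell=True,
)
time.sleep(120)  # wait until the worker prints "Uvicorn running on ..."

web = subprocess.Popen(
    "python -m pipeline.serve.gradio_web_server --controller http://localhost:10000 --port 7861",
    shell=True,
)
# Note: the controller and worker keep running in the background; call
# controller.terminate() and worker.terminate() when you are done.
web.wait()
```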
77 | 78 | ### Examples 79 | If you encounter error stating `FileNotFoundError: [Errno 2] No such file or directory: '/home/luodian/projects/Otter/pipeline/serve/examples/Apple Vision Pro - Reveal Trailer.mp4'` 80 | 81 | That's because we didnt upload the video examples on Github. You could visit the following [folder](https://entuedu-my.sharepoint.com/:f:/g/personal/libo0013_e_ntu_edu_sg/EjjDhJm4G35EgVHo0Pxi7dEBM7rqdN3e0ZcBCskWuIubUQ?e=C58jI3) to download our used examples and put them to the right place. 82 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: otter 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.9 6 | - conda-forge::openjdk 7 | - pip 8 | - pip: 9 | - -r requirements.txt -------------------------------------------------------------------------------- /mimic-it/convert-it/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/convert-it/__init__.py -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/2d.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from abstract_dataset import AbstractDataset 4 | 5 | 6 | class Llava(AbstractDataset): 7 | def __init__( 8 | self, 9 | name: str = "Llava", 10 | short_name="LA", 11 | *, 12 | image_root: str, 13 | image_path: str, 14 | num_threads: int, 15 | ): 16 | """ 17 | Initializes a Llava in-context dataset. 18 | 19 | Args: 20 | name (str): The name of the dataset. Defaults to "Llava". 21 | short_name (str): The short name of the dataset. Defaults to "LA". 22 | image_root (str): The root path to the COCO image train split. 23 | image_path (str): The path to the JSON file containing the dataset images. 24 | The images can be downloaded from: 25 | https://drive.google.com/file/d/1OVb4_3Uec_xbyUk90aWC6LFpKsIOtR7v/view?usp=sharing. 26 | num_threads (int): The number of threads to use for processing the images. 27 | """ 28 | self.image_root = image_root 29 | super().__init__(name, short_name, image_path, num_threads) 30 | 31 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 32 | """ 33 | Loads the images from the dataset. 34 | 35 | Args: 36 | image_path (str): The path to the JSON file containing the dataset images. 37 | num_threads (int): The number of threads to use for processing the images. 38 | 39 | Returns: 40 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are bytes objects representing the images. 
41 | """ 42 | 43 | def read_image(file_name) -> bytes: 44 | with open(file_name, "rb") as f: 45 | return f.read() 46 | 47 | images = {} 48 | with open(image_path) as f: 49 | image_ids = json.load(f).keys() 50 | 51 | for cur_image_id in image_ids: 52 | images[cur_image_id] = read_image(f"{self.image_root}/{cur_image_id}.jpg") 53 | 54 | return images 55 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/3d.py: -------------------------------------------------------------------------------- 1 | from abstract_dataset import AbstractDataset 2 | 3 | 4 | class SceneNavigation(AbstractDataset): 5 | def __init__( 6 | self, 7 | name: str = "SceneNavigation", 8 | short_name="SN", 9 | *, 10 | image_path: str, 11 | num_threads: int, 12 | ): 13 | """ 14 | Initializes a SceneNavigation dataset. 15 | 16 | Args: 17 | name (str): The name of the dataset. Defaults to "SceneNavigation". 18 | short_name (str): The short name of the dataset. Defaults to "SN". 19 | image_path (str): The directory path of the folder named "scannet_frames_25k" obtained by downloading a compressed file from http://www.scan-net.org/ and extracting it. 20 | num_threads (int): The number of threads to use for processing the images. 21 | """ 22 | super().__init__(name, short_name, image_path, num_threads) 23 | 24 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 25 | """ 26 | Loads the images from the dataset. 27 | 28 | Args: 29 | image_path (str): The path to the directory containing the images downloaded from http://www.scan-net.org/. 30 | num_threads (int): The number of threads to use for processing the images. 31 | 32 | Returns: 33 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are byte strings representing the images. 34 | """ 35 | from datasets.utils.scene_navigation_utils import process_data 36 | 37 | return process_data(image_path, num_thread) 38 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/convert-it/datasets/__init__.py -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/fpv.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from glob import glob 4 | 5 | from abstract_dataset import AbstractDataset 6 | from image_utils import frame_video 7 | 8 | from tqdm import tqdm 9 | from concurrent.futures import ThreadPoolExecutor 10 | 11 | 12 | class EGO4D(AbstractDataset): 13 | def __init__( 14 | self, 15 | name: str = "EGO4D", 16 | short_name="E4D", 17 | *, 18 | image_path: str, 19 | num_threads: int, 20 | ): 21 | """ 22 | Initializes an EGO4D dataset. 23 | 24 | Args: 25 | name (str): The name of the dataset. Defaults to "EGO4D". 26 | short_name (str): The short name of the dataset. Defaults to "E4D". 27 | image_path (str): The directory path of the folder downloaded from https://ego4d-data.org/#download. 28 | num_threads (int): The number of threads to use for processing the images. 29 | """ 30 | super().__init__(name, short_name, image_path, num_threads) 31 | 32 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 33 | """ 34 | Loads the images from the dataset. 
35 | 36 | Args: 37 | image_path (str): The path to the directory containing the images downloaded from https://ego4d-data.org/#download. 38 | num_threads (int): The number of threads to use for processing the images. 39 | 40 | Returns: 41 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are image bytes. 42 | 43 | Raises: 44 | FileNotFoundError: If the specified image path does not exist. 45 | """ 46 | video_paths = glob(os.path.join(image_path, "*")) 47 | 48 | def get_image(video_path): 49 | images = frame_video(video_path) 50 | images_dict = {} 51 | video_name = os.path.basename(video_path).split(".")[0] 52 | for index, image in enumerate(images): 53 | images_dict[f"{video_name}_{index:08d}"] = image 54 | return images_dict 55 | 56 | final_images_dict = {} 57 | 58 | with ThreadPoolExecutor(max_workers=num_thread) as executor: 59 | process_bar = tqdm( 60 | total=len(video_paths), 61 | unit="video", 62 | desc="Processing videos into images", 63 | ) 64 | for images_dict in executor.map(get_image, video_paths): 65 | final_images_dict.update(images_dict) 66 | process_bar.update() 67 | process_bar.close() 68 | 69 | return final_images_dict 70 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/utils/scene_navigation_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from glob import glob 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | from image_utils import process_image 8 | 9 | 10 | def process(cur_dir, img_root): 11 | """ 12 | Process images in a directory. 13 | 14 | Args: 15 | cur_dir (str): The name of the current directory. 16 | img_root (str): The root directory of the images. 17 | 18 | Returns: 19 | dict: A dictionary containing processed images. The keys are unique identifiers 20 | for each image, and the values are the processed images. 21 | 22 | """ 23 | root = os.path.join(img_root, cur_dir, "color") 24 | file_list = os.listdir(root) 25 | images = {} 26 | for cur_file in file_list: 27 | file_name = os.path.join(img_root, cur_dir, "color", cur_file) 28 | with open(file_name, "rb") as f: 29 | img = f.read() 30 | image_id = f"{cur_dir}_color_{cur_file[:-4]}" 31 | images[image_id] = process_image(img) 32 | return images 33 | 34 | 35 | def process_data(img_root: str, num_threads: int): 36 | """ 37 | Process images in parallel using multiple threads. 38 | 39 | Args: 40 | img_root (str): The root directory of the images. 41 | num_threads (int): The number of threads to use for parallel processing. 42 | 43 | Returns: 44 | dict: A dictionary containing processed images. The keys are unique identifiers 45 | for each image, and the values are the processed images. 
46 | 47 | """ 48 | keys_dir = glob(os.path.join(img_root, "scene*_00")) 49 | keys = list(map(os.path.basename, keys_dir)) 50 | all_images = {} 51 | process_bar = tqdm(total=len(keys), unit="image", desc="Loading images") 52 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 53 | for images in executor.map(process, keys, [img_root] * len(keys)): 54 | all_images.update(images) 55 | process_bar.update() 56 | process_bar.close() 57 | return all_images 58 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/utils/visual_story_telling_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | from image_utils import resize_image, create_folder 7 | 8 | 9 | def get_url(image: dict[str]): 10 | """ 11 | Retrieve the URL of the image. 12 | 13 | Args: 14 | image: A dictionary containing image information. 15 | 16 | Returns: 17 | The URL of the image. 18 | 19 | """ 20 | if "url_o" in image: 21 | return image["url_o"] 22 | else: 23 | return image["url_m"] 24 | 25 | 26 | def download_single_image(image: dict[str]) -> tuple[str, bytes]: 27 | """ 28 | Download a single image and resize it. 29 | 30 | Args: 31 | image: A dictionary containing image information. 32 | 33 | Returns: 34 | A tuple containing the image ID and the resized image as bytes. 35 | 36 | """ 37 | url = get_url(image) 38 | id = image["id"] 39 | try: 40 | pic = requests.get(url) 41 | return id, resize_image(pic.content) 42 | except: 43 | return id, None 44 | 45 | 46 | def download(images: list[dict[str]], num_threads: int): 47 | """ 48 | Download multiple images concurrently using thread pooling. 49 | 50 | Args: 51 | images: A list of dictionaries, each containing image information. 52 | num_threads: The number of threads to use for concurrent downloading. 53 | 54 | Returns: 55 | A dictionary mapping image IDs to their corresponding resized images as bytes. 
56 | 57 | """ 58 | output = {} 59 | process_bar = tqdm(total=len(images), unit="image", desc="Downloading images") 60 | expired_images = [] 61 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 62 | for id, image in executor.map(download_single_image, images): 63 | if image is not None: 64 | output[id] = image 65 | else: 66 | expired_images.append(id) 67 | process_bar.update(1) 68 | process_bar.close() 69 | create_folder("output") 70 | with open("output/expired_images.json", "w") as f: 71 | json.dump(expired_images, f, indent=4) 72 | return output 73 | -------------------------------------------------------------------------------- /mimic-it/convert-it/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import orjson 3 | 4 | from abstract_dataset import get_dataset_by_path 5 | from image_utils import get_json_data_generator, create_folder 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--name", type=str, required=True, help="Path to the dataset class.") 11 | parser.add_argument("--num_threads", type=int, default=8, help="Number of threads.") 12 | parser.add_argument("--image_path", help="Path to the prompt file.") 13 | parser.add_argument("--image_root", default=None, help="Path to the image root.") 14 | 15 | args = parser.parse_args() 16 | dataset_args = {} 17 | if args.image_path is not None: 18 | dataset_args["image_path"] = args.image_path 19 | if args.num_threads is not None: 20 | dataset_args["num_threads"] = args.num_threads 21 | if args.image_root is not None: 22 | dataset_args["image_root"] = args.image_root 23 | dataset = get_dataset_by_path(args.name, dataset_args) 24 | dataset_short_name = dataset.short_name 25 | dataset = dict(dataset) 26 | create_folder("output") 27 | 28 | # Open the output JSON file in text mode, since we'll be writing strings 29 | with open(f"output/{dataset_short_name}.json", "w") as f: 30 | # Write the opening brace for the JSON object 31 | f.write("{") 32 | 33 | # Use a flag to track whether a comma is needed before the next key-value pair 34 | need_comma = False 35 | 36 | # Iterate over the generator, which yields key-value pairs one at a time 37 | for image_key, base64_data in get_json_data_generator(dataset, dataset_short_name, args.num_threads): 38 | # Write a comma before the next key-value pair if needed 39 | if need_comma: 40 | f.write(", ") 41 | 42 | # Write the key-value pair as a string to the file 43 | f.write(f'"{image_key}": "{base64_data}"') 44 | 45 | # Set the flag to True so that a comma is written before the next key-value pair 46 | need_comma = True 47 | 48 | # Write the closing brace for the JSON object 49 | f.write("}") 50 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/3d.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the implementation of the SceneNavigation and SceneRef and Scene QA datasets. 
3 | """ 4 | 5 | import json 6 | import random 7 | 8 | from abstract_dataset import AbstractDataset 9 | 10 | 11 | class SceneNavigation(AbstractDataset): 12 | def __init__( 13 | self, 14 | name: str = "SceneNavigation", 15 | in_context_path: str = "prompts/scene_navigation.json", 16 | query_inputs_path: str = "annotations/scene_navigation/scan_info.json", 17 | ): 18 | super().__init__(name, in_context_path, query_inputs_path) 19 | 20 | def _load_query_inputs(self, path: str) -> list[str]: 21 | with open(path, "r") as f: 22 | json_data = json.load(f) 23 | results = [] 24 | counter = 0 25 | for scene_id, inner_dict in json_data.items(): 26 | # if counter > 7: 27 | # break 28 | descriptions = inner_dict["description"] 29 | random.shuffle(descriptions) 30 | real_descriptions = [] 31 | for cur_description in descriptions[:50]: 32 | real_descriptions.append(cur_description[1]) 33 | results.append( 34 | { 35 | "id": scene_id, 36 | "sentences": "\n".join(real_descriptions), 37 | } 38 | ) 39 | counter += 1 40 | return results 41 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/syphus/datasets/__init__.py -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/change.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the implementation of the SpotTheDifference and CleverChange datasets. 3 | """ 4 | 5 | import importlib 6 | import json 7 | 8 | from abstract_dataset import AbstractDataset 9 | 10 | 11 | class SpotTheDifference(AbstractDataset): 12 | def __init__( 13 | self, 14 | name: str = "SpotTheDifference", 15 | prompt_path: str = "prompts/spot_the_difference.json", 16 | query_inputs_path: str = "annotations/spot_the_difference/train.json", 17 | ): 18 | super().__init__(name, prompt_path, query_inputs_path) 19 | 20 | def _load_query_inputs(self, path: str) -> list[str]: 21 | with open(path, "r") as f: 22 | json_data = json.load(f) 23 | results = [] 24 | for inner_dict in json_data: 25 | file_id = inner_dict["img_id"] 26 | sentences = inner_dict["sentences"] 27 | results.append( 28 | { 29 | "id": file_id, 30 | "sentences": "\n".join(sentences), 31 | } 32 | ) 33 | return results 34 | 35 | 36 | class CocoSpotTheDifference(AbstractDataset): 37 | def __init__( 38 | self, 39 | name: str = "CocoSpotTheDifference", 40 | prompt_path: str = "prompts.coco_spot_the_difference_prompt", 41 | query_inputs_path: str = "annotations/coco_spot_the_difference/csd_query.json", 42 | ): 43 | super().__init__(name, prompt_path, query_inputs_path) 44 | 45 | def _load_query_inputs(self, path: str) -> list[dict[str, str]]: 46 | with open(path) as f: 47 | json_data = json.load(f) 48 | results = [] 49 | for file_id, inner_dict in json_data.items(): 50 | sentences = inner_dict["sentences"] 51 | results.append( 52 | { 53 | "id": file_id, 54 | "sentences": sentences, 55 | } 56 | ) 57 | return results 58 | 59 | def _load_prompt(self, path: str) -> dict[str, str]: 60 | prompt_file = importlib.import_module(path) 61 | return { 62 | "system_message": prompt_file.system_message, 63 | "in_context": prompt_file.in_context, 64 | } 65 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/translate.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | 4 | from abstract_dataset import AbstractDataset 5 | 6 | 7 | class TranslationDataset(AbstractDataset): 8 | def __init__( 9 | self, 10 | name: str = "Translations", 11 | prompt_path: str = "prompts.translation_prompt", 12 | query_inputs_path: str = None, 13 | ): 14 | super().__init__(name, prompt_path, query_inputs_path) 15 | 16 | def _load_prompt(self, path: str) -> dict[str, str]: 17 | prompt_file = importlib.import_module(path) 18 | return { 19 | "system_message": prompt_file.system_message, 20 | "in_context": prompt_file.in_context, 21 | } 22 | 23 | def _load_query_inputs(self, path: str) -> list[str]: 24 | with open(path, "r") as f: 25 | json_data = json.load(f)["data"] 26 | temp = [] 27 | for file_id, i in json_data.items(): 28 | all_string = "" + i["instruction"] + "" + "" + i["answer"] + "" 29 | temp.extend([{"id": file_id, "sentences": all_string}]) 30 | return temp 31 | -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_dia.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are now my dialogue generation assistant, you have done a lot of video understanding and mastered this ability, I will give you a 'description' of the counter-intuitive video and a 'counter-intuitive reason' of why it is counter-intuitive.\nWhere 'description' is an objective description of a video, including what happens in the video and 'counter-intuitive reason' is an explanation of why the video is counter-intuitive.\nYou're going to use that information to generate a question and answer for me, using the text that I give you, start with descriptive questions like what, when, which, who, where, and then start with why, how, and then you're going to give some open questions,\nThat is to say, the question is gradual, and you should try to use the previous information later. All of the above answers are short sentences, and try not to exceed 30 words" 3 | , 4 | "in_context": [ 5 | { 6 | "role": "user", 7 | "content": "description: The barber gave the woman a haircut. After the woman cut her hair, her hair stood on end. Next to her appeared a parrot with the same hairstyle as the woman\n why counter-intuitive: Women look funny when they get their hair cut and look like parrot hair." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": [ 12 | { 13 | "question": "What is happening in the video?", 14 | "answer": "A girl is getting her hair cut by a hairdresser. After the haircut, her hair stands up and a parrot appears next to her with the same hairstyle." 15 | }, 16 | { 17 | "question": "Why is the video humorous?", 18 | "answer": "The girl's hair standing up and matching the parrot's hairstyle is unexpected and funny." 19 | }, 20 | { 21 | "question": "What this girl's mood might be like?", 22 | "answer": "She may be dissatisfied because generally speaking, a human's hair does not look like a parrot, which is not considered aesthetically pleasing." 23 | }, 24 | { 25 | "question": "Does this parrot feel humor because the girl's hair?", 26 | "answer": "Probably not, it does not feel this, probably because its intelligence can not understand the relationship between the girl's hair and its hair." 
27 | }, 28 | { 29 | "question": "Do you think the parrot intentionally matched the girl's hairstyle?", 30 | "answer": "It's unlikely that the parrot intentionally matched the girl's hairstyle. It's more likely a coincidence or the result of the parrot's natural feathers." 31 | }, 32 | { 33 | "question": "What do you think the girl will do afterwards?", 34 | "answer": "It's unlikely that the parrot intentionally matched the girl's hairstyle. It's more likely a coincidence or the result of the parrot's natural feathers." 35 | } 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_mcqa.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are now a data augmentation assistant. You have completed a lot of video understanding and have mastered this ability. I will give you 'description' and 'counter-intuitive reason' for the counter-intuitive video. You will help me to complete the understanding of this paragraph of text and generate 5 English question and answer pairs from it. The form is multiple choice. Each question has 5 choices, one of which is the correct answer and the other three are interference items.\n You need to mark the correct answer. The question can only be what, how, which or who. The answer is a word, phrase, or sentence of 15 words or less." 3 | , 4 | "in_context": [ 5 | { 6 | "role": "user", 7 | "content": "description: A white cat is lying flat inside the toilet bowl.\n counter-intuitive reason: The cat is so comfortable lying on its stomach in the toilet that it is both helpless and funny." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": [ 12 | { 13 | "Question": "What is the main subject of the video?", 14 | "Options 1": "A black cat", 15 | "Options 2": "A white cat", 16 | "Options 3": "A dog", 17 | "Options 4": "A bird", 18 | "Options 5": "A fish", 19 | "Answer": "2" 20 | }, 21 | { 22 | "Question": "How is the white cat positioned inside the toilet bowl?", 23 | "Options 1": "Standing on its hind legs", 24 | "Options 2": "Sleeping on its side", 25 | "Options 3": "Lying flat on its stomach", 26 | "Options 4": "Sitting upright", 27 | "Options 5": "Hanging upside down", 28 | "Answer": "3" 29 | }, 30 | { 31 | "Question": "Which word best describes the cat's behavior in the video?", 32 | "Options 1": "Annoyed", 33 | "Options 2": "Excited", 34 | "Options 3": "Curious", 35 | "Options 4": "Comfortable", 36 | "Options 5": "Scared", 37 | "Answer": "4" 38 | }, 39 | { 40 | "Question": "What is the cat's posture inside the toilet bowl?", 41 | "Options 1": "Upside down", 42 | "Options 2": "Leaning against the edge", 43 | "Options 3": "Sitting like a human", 44 | "Options 4": "Balanced on one leg", 45 | "Options 5": "Lying flat on its stomach", 46 | "answer": "5" 47 | }, 48 | { 49 | "Question": "Who or what finds the cat's position in the toilet bowl funny?", 50 | "Options 1": "The cat itself", 51 | "Options 2": "A person watching the video", 52 | "Options 3": "Another animal in the room", 53 | "Options 4": "Nobody finds it funny", 54 | "Options 5": "The cat\"s owner", 55 | "Answer": "2" 56 | } 57 | ] 58 | } 59 | ] 60 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "Now I have two tasks for you to complete at once. Task 1. 
'CH2Eng Translation': Please translate the content into English. Task 2. 'Expansion_with_Eng': Please generate five synonyms according to the content just translated into English.", 3 | "in_context": [ 4 | { 5 | "role": "user", 6 | "content": "一只白色的猫咪整个趴在厕所蹲厕里。" 7 | }, 8 | { 9 | "role": "assistant", 10 | "content": [ 11 | { 12 | "CH2Eng Translation": "A white cat is lying flat inside the toilet bowl.", 13 | "Expansion_with_Eng_1": "A white cat resting entirely in a toilet bowl.", 14 | "Expansion_with_Eng_2": "The entire body of a white cat is sprawled across the toilet seat.", 15 | "Expansion_with_Eng_3": "The white cat is The white cat is lying flat within the toilet's squatting position.", 16 | "Expansion_with_Eng_4": "A white-coloured feline is stretched out inside the toilet bowl.", 17 | "Expansion_with_Eng_5": "The complete posture of a white cat can be seen in the toilet, lying down." 18 | } 19 | ] 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/spot_the_difference.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are an AI assistant playing the spot the difference game. You will be presented with several sentences that describe the differences between two images. The two images are frames extracted from the video-surveillance footage.\n\nYour response shall always contain a pair of question and answer. The question you generate should have definite and concrete answer according to the description. When generating answer, you should answer as if you are watching the images. You are not allowed to say \"according to descrption\" or similar words in your outputs. You do not have to provide a specific answer and you should not ask for more details.\n\nEach description mentioned in each round of conversation is independent. You should not assume that the descriptions in a new round of conversation are related to the previous context.\n\nIf there are no differences, you still have to follow the previous format.", 3 | "in_context": [ 4 | { 5 | "role": "user", 6 | "content": "the car in the upper left corner of the picture is gone\nthe two people in the upper part of the picture are gone\nthe person near the handicapped parking place is not in the second picture\nthere appears to be a figure maybe man that was near handicapped parking place straining near a pole in the bottom of the picture\nthe two people who were on th sidewalk are not in the second picture\nthere are two people in the seconde picture that are near the handicapped parking place" 7 | }, 8 | { 9 | "role": "assistant", 10 | "content": [ 11 | { 12 | "Question": "Could you summarize the differences between the two images?", 13 | "Answer": "The left image has five people and the right image has only three. The people are not the same. The car in the upper left corner of the picture is gone. The remaining elements in the two images are the same." 14 | }, 15 | { 16 | "Question": "Does the right image have more people than the left image?", 17 | "Answer": "Yes, the left image has two more people than the right image." 18 | }, 19 | { 20 | "Question": "Does the car in the upper left corner of the picture still exist in the second picture?", 21 | "Answer": "No, the car in the upper left corner of the picture is gone." 
22 | }, 23 | { 24 | "Question": "Does the right image have more cars than the left image?", 25 | "Answer": "Yes, the right image has one car while the left image has no car." 26 | }, 27 | { 28 | "Question": "Does the people in the left image the same as the people in the right image?", 29 | "Answer": "No, the people in the left image are not the same as the people in the right image." 30 | } 31 | ] 32 | }, 33 | { 34 | "role": "user", 35 | "content": "There is no difference between the two images." 36 | }, 37 | { 38 | "role": "assistant", 39 | "content": [ 40 | { 41 | "Question": "Could you summarize the differences between the two images?", 42 | "Answer": "There is no difference between the two images." 43 | } 44 | ] 45 | } 46 | ] 47 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/translation_prompt.py: -------------------------------------------------------------------------------- 1 | system_message = """As an AI assistant, you are an expert in translating English to natural Chinese(zh), Spanish(es), Japanese(ja), German(de), French(fr), Korean(ko), and Arabic(ar). You will provide professional translations without any grammar mistakes. Your translation should be in a tone of native speaker. The input format will be xxxxxx. You should translate the content inside the and tag. Your output should strictly follow the format of this json file:\n 2 | { 3 | "a": { 4 | "zh": "xxx", 5 | "es": "xxx", 6 | "ja": "xxx", 7 | "de": "xxx", 8 | "fr": "xxx", 9 | "ko": "xxx", 10 | "ar": "xxx" 11 | }, 12 | "b": { 13 | "zh": "xxx", 14 | "es": "xxx", 15 | "ja": "xxx", 16 | "de": "xxx", 17 | "fr": "xxx", 18 | "ko": "xxx", 19 | "ar": "xxx" 20 | } 21 | }""" 22 | in_context = [ 23 | { 24 | "role": "user", 25 | "content": "Hey, bro, I feel like having a cup of tea here!Having tea is a great choice! Right now, you have two options in front of you: green tea and oolong tea. Which one would you like to have? Of course, if you want to enhance the tea-drinking experience even more, you can dim the lights here and play some classic Chinese music on your phone!", 26 | }, 27 | { 28 | "role": "assistant", 29 | "content": """{ 30 | "a": { 31 | "zh": "嘿,兄弟,我想在这里喝口茶!", 32 | "es": "¡eh, tío, me apetece tomar una taza de café aquí!", 33 | "ja": "ねえ、ここでお茶を飲みたいな!", 34 | "de": "Hey, ich möchte hier gerne eine Tasse Tee trinken!", 35 | "fr": "Bonjour, je voudrais prendre une tasse de thé ici!", 36 | "ko": "헤이, 브로, 난 여기에 자 한 잔 바시고 싶어!", 37 | "ar": "يا أخي ، أشعر وكأنني أرغب في شرب كوب من الشاي هنا!", 38 | }, 39 | "b":{ 40 | "zh": "嗨!喝茶是个不错的选择!现在你面前有绿茶和乌龙茶,你想喝哪种呢?当然,如果你想有更好的品茶氛围的话,你可以把这里的灯光调暗,再用你的手机来点经典的中国音乐", 41 | "es": "¡Tomar té es una muy buena elección! ¿Cuál te gustaría tomar? Por supuesto, si quieres disfrutar aún más de la experiencia de tomar té, puedes atenuar las luces de aquí y reproducir música clásica china desde tu teléfono móvil.", 42 | "ja": "お茶を飲むのはいい選択だよ!今、目の前には緑茶とウーロン茶がありますが、どちらが飲みたいですか?もちろん、もっと良いお茶の時間を過ごしたいなら、ここでの照明を暗くして、あなたの携帯で中国のクラシック音楽を流すといいよ!", 43 | "de": "Tee zu trinken ist eine gute Wahl! Vor dir hast du jetzt die Wahl zwischen grünem Tee und Oolong-Tee. Welche Sorte möchtest du trinken? Natürlich, wenn du ein noch besseres Teetrinkerlebnis haben möchtest, kannst du hier das Licht abdunkeln und mit deinem Handy klassische chinesische Musik abspielen!", 44 | "fr": "Boire du thé est un excellent choix ! Maintenant, vous avez deux options devant vous, le thé vert et le thé oolong. Lequel aimeriez-vous prendre? 
Bien sûr, si vous souhaitez améliorer votre expérience de dégustation, vous pouvez baisser les lumières ici et écouter de la musique chinoise classique sur votre téléphone!", 45 | "ko": "하이, 좋은 선택이야. 지금 녹차와 우롱차가 있는데 뭘 마시고 싶어? 물론 차를 더욱 풍미있게 즐기고 싶으면 여기 조명을 어둡게 조절하고, 핸드폰으로 중국 클래식 음악을 틀어도 좋아!", 46 | "ar": "شرب الشاي هو اختيار رائع! الآن ، لديك خياران أمامك: الشاي الأخضر والشاي الأوولونغ. أيهما تفضل؟ بالطبع ، إذا أردت تعزيز تجربة شرب الشاي أكثر ، يمكنك خفت الأنوار هنا وتشغيل بعض الموسيقى الصينية الكلاسيكية على هاتفك!" 47 | } 48 | } 49 | """, 50 | }, 51 | ] 52 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: false 4 | machine_rank: 0 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: 1 8 | num_processes: 2 9 | rdzv_backend: static 10 | same_network: false 11 | tpu_use_cluster: false 12 | tpu_use_sudo: false 13 | use_cpu: false 14 | main_process_port: 20685 15 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: no 3 | downcast_bf16: true 4 | machine_rank: 0 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: 1 8 | num_processes: 1 9 | rdzv_backend: static 10 | same_network: true 11 | tpu_use_cluster: false 12 | tpu_use_sudo: false 13 | use_cpu: false 14 | main_process_port: 20687 15 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 4 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero2_slurm.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | 
offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false 19 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: 20333 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 8 19 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3_offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: 20333 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 8 19 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3_slurm.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_accumulation_steps: 2 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: true 9 | zero3_save_16bit_model: true 10 | zero_stage: 3 11 | distributed_type: DEEPSPEED 12 | fsdp_config: {} 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 2 16 | num_processes: 16 17 | use_cpu: false 18 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/ds_zero3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } 
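The YAML files above are standard Hugging Face Accelerate launcher configs (DDP, ZeRO-1/2/3, CPU offload, single-node and Slurm variants), and ds_zero3_config.json is the matching raw DeepSpeed config whose "auto" fields are resolved at launch time. As a rough orientation only, here is a minimal sketch, not part of the repository, of how such files are typically consumed through `accelerate launch --config_file ...`; the training-script path is a placeholder and the helper name is invented for illustration.

```
# Hypothetical launcher sketch (not part of this repo): pass one of the configs in
# pipeline/accelerate_configs/ to `accelerate launch`, which spawns `num_processes`
# workers and configures DeepSpeed from the zero_stage / offload_* fields in the YAML.
import subprocess
from pathlib import Path

CONFIG_DIR = Path("pipeline/accelerate_configs")


def launch_training(config_name: str, train_script: str, *script_args: str) -> None:
    """Run `accelerate launch` with one of the bundled Accelerate config files."""
    cmd = [
        "accelerate",
        "launch",
        "--config_file",
        str(CONFIG_DIR / config_name),
        train_script,
        *script_args,
    ]
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    # Single-node ZeRO-3 with bf16, as declared in accelerate_config_zero3.yaml.
    # "pipeline/train/your_train_script.py" is a placeholder, not a file in the repo.
    launch_training("accelerate_config_zero3.yaml", "pipeline/train/your_train_script.py")
```

The `*_slurm` variants mainly change `num_machines`/`num_processes` and the offload settings, so the same invocation pattern applies when the job is scheduled on a cluster.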
-------------------------------------------------------------------------------- /pipeline/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | config.yaml -------------------------------------------------------------------------------- /pipeline/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/datasets/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/base_eval_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from PIL import Image 3 | from typing import Dict, List, Any 4 | 5 | import importlib 6 | 7 | AVAILABLE_EVAL_DATASETS: Dict[str, str] = { 8 | "mmbench": "MMBenchDataset", 9 | "mme": "MMEDataset", 10 | "mathvista": "MathVistaDataset", 11 | "mmvet": "MMVetDataset", 12 | "seedbench": "SEEDBenchDataset", 13 | "pope": "PopeDataset", 14 | "scienceqa": "ScienceQADataset", 15 | "magnifierbench": "MagnifierBenchDataset", 16 | } 17 | 18 | 19 | class BaseEvalDataset(ABC): 20 | def __init__(self, name: str, dataset_path: str, *, max_batch_size: int = 1): 21 | self.name = name 22 | self.dataset_path = dataset_path 23 | self.max_batch_size = max_batch_size 24 | 25 | def evaluate(self, model, **kwargs): 26 | return self._evaluate(model, **kwargs) 27 | # batch = min(model.max_batch_size, self.max_batch_size) 28 | # if batch == 1: 29 | # return self._evaluate(model, **kwargs) 30 | # else: 31 | # kwargs["batch"] = batch 32 | # return self._evaluate(model, **kwargs) 33 | 34 | @abstractmethod 35 | def _evaluate(self, model: str): 36 | pass 37 | 38 | 39 | def load_dataset(dataset_name: str, dataset_args: Dict[str, str] = {}) -> BaseEvalDataset: 40 | assert dataset_name in AVAILABLE_EVAL_DATASETS, f"{dataset_name} is not an available eval dataset." 41 | module_path = "pipeline.benchmarks.datasets." 
+ dataset_name 42 | dataset_formal_name = AVAILABLE_EVAL_DATASETS[dataset_name] 43 | imported_module = importlib.import_module(module_path) 44 | dataset_class = getattr(imported_module, dataset_formal_name) 45 | print(f"Imported class: {dataset_class}") 46 | # import pdb;pdb.set_trace() 47 | # get dataset args without "name" 48 | init_args = dataset_args.copy() 49 | init_args.pop("name") 50 | return dataset_class(**init_args) 51 | -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/seedbench.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | from .base_eval_dataset import BaseEvalDataset 4 | from datasets import load_dataset 5 | import json 6 | import os 7 | import datetime 8 | 9 | 10 | class SEEDBenchDataset(BaseEvalDataset): 11 | def __init__(self, data_path: str = "Otter-AI/SEEDBench", split="test", default_output_path="./logs", cache_dir=None): 12 | super().__init__("SEEDBenchDataset", data_path) 13 | print("Loading dataset from", data_path) 14 | self.data = load_dataset(data_path, split=split, cache_dir=cache_dir) 15 | self.default_output_path = default_output_path 16 | if not os.path.exists(default_output_path): 17 | os.makedirs(default_output_path) 18 | 19 | def _evaluate(self, model): 20 | count = 0 21 | num_correct = 0 22 | cur_datetime = datetime.datetime.now().strftime("%Y%m%d-%H%M") 23 | output_path = os.path.join(self.default_output_path, f"seedbench_{model.name}_test_submit_{cur_datetime}.json") 24 | output_f = open(output_path, "a") 25 | with tqdm(total=len(self.data), desc="Evaluating") as pbar: 26 | for data_dict in self.data: 27 | image = data_dict["image"] 28 | question = data_dict["question"] + " There are several options:" 29 | option_index = ["A", "B", "C", "D"] 30 | for cur_idx in range(4): 31 | question += f" {option_index[cur_idx]}. {data_dict[f'choice_{option_index[cur_idx].lower()}']}" 32 | 33 | answer = data_dict["answer"] 34 | options = [ 35 | data_dict["choice_a"], 36 | data_dict["choice_b"], 37 | data_dict["choice_c"], 38 | data_dict["choice_d"], 39 | ] 40 | 41 | option_losses = [] 42 | for idx, option in enumerate(options): 43 | option = option_index[idx] + ". 
" + option 44 | loss = model.eval_forward(question, option, image) 45 | option_losses.append(loss.item()) 46 | 47 | prediction_idx = np.argmin(option_losses) 48 | prediction = ["A", "B", "C", "D"][prediction_idx] 49 | if prediction == answer: 50 | num_correct += 1 51 | count += 1 52 | 53 | answer_record = {"question_id": data_dict["question_id"], "prediction": prediction} 54 | output_f.write(json.dumps(answer_record) + "\n") 55 | 56 | answer_record = {"question_id": data_dict["question_id"], "prediction": prediction} 57 | output_f.write(json.dumps(answer_record) + "\n") 58 | 59 | accuracy = num_correct / count * 100 60 | pbar.set_postfix(accuracy=f"{accuracy:.2f}") 61 | pbar.update(1) 62 | 63 | accuracy = num_correct / count * 100 64 | print(f"Accuracy: {accuracy:.2f}%") 65 | return accuracy 66 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/models/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/models/base_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from PIL import Image 3 | from typing import Dict 4 | 5 | import importlib 6 | 7 | AVAILABLE_MODELS: Dict[str, str] = { 8 | "video_chat": "VideoChat", 9 | "otter_video": "OtterVideo", 10 | "llama_adapter": "LlamaAdapter", 11 | "mplug_owl": "mPlug_owl", 12 | "video_chatgpt": "Video_ChatGPT", 13 | "otter_image": "OtterImage", 14 | "frozen_bilm": "FrozenBilm", 15 | "idefics": "Idefics", 16 | "fuyu": "Fuyu", 17 | "otterhd": "OtterHD", 18 | "instructblip": "InstructBLIP", 19 | "qwen_vl": "QwenVL", 20 | "llava_model": "LLaVA_Model", 21 | "instructblip": "InstructBLIP", 22 | "gpt4v": "OpenAIGPT4Vision", 23 | } 24 | 25 | 26 | class BaseModel(ABC): 27 | def __init__(self, model_name: str, model_path: str, *, max_batch_size: int = 1): 28 | self.name = model_name 29 | self.model_path = model_path 30 | self.max_batch_size = max_batch_size 31 | 32 | @abstractmethod 33 | def generate(self, **kwargs): 34 | pass 35 | 36 | @abstractmethod 37 | def eval_forward(self, **kwargs): 38 | pass 39 | 40 | 41 | def load_model(model_name: str, model_args: Dict[str, str]) -> BaseModel: 42 | assert model_name in AVAILABLE_MODELS, f"{model_name} is not an available model." 43 | module_path = "pipeline.benchmarks.models." 
+ model_name 44 | model_formal_name = AVAILABLE_MODELS[model_name] 45 | imported_module = importlib.import_module(module_path) 46 | model_class = getattr(imported_module, model_formal_name) 47 | print(f"Imported class: {model_class}") 48 | model_args.pop("name") 49 | return model_class(**model_args) 50 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/frozen_bilm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/models/frozen_bilm.py -------------------------------------------------------------------------------- /pipeline/benchmarks/models/fuyu.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from transformers import AutoTokenizer, FuyuImageProcessor 3 | from transformers import FuyuForCausalLM 4 | from src.otter_ai.models.fuyu.processing_fuyu import FuyuProcessor 5 | from PIL import Image 6 | from .base_model import BaseModel 7 | import torch 8 | import numpy as np 9 | import warnings 10 | import io 11 | import base64 12 | import math 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | 17 | def get_pil_image(raw_image_data) -> Image.Image: 18 | if isinstance(raw_image_data, Image.Image): 19 | return raw_image_data 20 | 21 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 22 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 23 | 24 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 25 | image_bytes = base64.b64decode(raw_image_data) 26 | return Image.open(io.BytesIO(image_bytes)) 27 | 28 | else: 29 | raise ValueError("Unsupported image data format") 30 | 31 | 32 | class Fuyu(BaseModel): 33 | def __init__(self, model_path: str = "adept/fuyu-8b", cuda_id: int = 0, resolution: int = -1, max_new_tokens=256): 34 | super().__init__("fuyu", model_path) 35 | self.resolution = resolution 36 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 37 | self.model = FuyuForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(self.device) 38 | self.tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") 39 | self.image_processor = FuyuImageProcessor() 40 | self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer) 41 | self.max_new_tokens = max_new_tokens 42 | self.bad_words_list = ["User:", "Assistant:"] 43 | self.bad_words_ids = self.tokenizer(self.bad_words_list, add_special_tokens=False).input_ids 44 | 45 | def generate(self, text_prompt: str, raw_image_data: str): 46 | raw_image_data = get_pil_image(raw_image_data) 47 | raw_image_data = raw_image_data.convert("RGB") 48 | # make sure the image is in RGB format and resize to match the width 49 | if self.resolution != -1: 50 | width, height = raw_image_data.size 51 | short_edge = min(width, height) 52 | scaling_factor = self.resolution / short_edge 53 | new_width = math.ceil(width * scaling_factor) 54 | new_height = math.ceil(height * scaling_factor) 55 | raw_image_data = raw_image_data.resize((new_width, new_height), Image.ANTIALIAS) 56 | # formated_prompt = f"User: {text_prompt} Assistant:" 57 | model_inputs = self.processor(text=text_prompt, images=[raw_image_data], device=self.device) 58 | for k, v in model_inputs.items(): 59 | model_inputs[k] = v.to(self.device) 60 | 61 | model_inputs["image_patches"] = 
model_inputs["image_patches"].to(dtype=next(self.model.parameters()).dtype) 62 | generation_output = self.model.generate(**model_inputs, max_new_tokens=self.max_new_tokens, pad_token_id=self.tokenizer.eos_token_id, bad_words_ids=self.bad_words_ids) 63 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 64 | return generation_text[0].split("\x04")[1].strip(" ").strip("\n") 65 | 66 | def eval_forward(self, **kwargs): 67 | return super().eval_forward(**kwargs) 68 | 69 | 70 | if __name__ == "__main__": 71 | model = Fuyu() 72 | print(model.generate("Generate a coco-style caption.\n", Image.open("/home/luodian/projects/Otter/archived/test_images/rabbit.png").convert("RGB"))) 73 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/gpt4v.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from .base_model import BaseModel 4 | from PIL import Image 5 | import io 6 | import time 7 | 8 | 9 | def get_pil_image(raw_image_data) -> Image.Image: 10 | if isinstance(raw_image_data, Image.Image): 11 | return raw_image_data 12 | 13 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 14 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 15 | 16 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 17 | image_bytes = base64.b64decode(raw_image_data) 18 | return Image.open(io.BytesIO(image_bytes)) 19 | 20 | else: 21 | raise ValueError("Unsupported image data format") 22 | 23 | 24 | class OpenAIGPT4Vision(BaseModel): 25 | def __init__(self, api_key: str, max_new_tokens: int = 256): 26 | super().__init__("openai-gpt4", "gpt-4-vision-preview") 27 | self.api_key = api_key 28 | self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} 29 | self.max_new_tokens = max_new_tokens 30 | 31 | @staticmethod 32 | def encode_image_to_base64(raw_image_data) -> str: 33 | if isinstance(raw_image_data, Image.Image): 34 | buffered = io.BytesIO() 35 | raw_image_data.save(buffered, format="JPEG") 36 | return base64.b64encode(buffered.getvalue()).decode("utf-8") 37 | raise ValueError("The input image data must be a PIL.Image.Image") 38 | 39 | def generate(self, text_prompt: str, raw_image_data): 40 | raw_image_data = get_pil_image(raw_image_data).convert("RGB") 41 | base64_image = self.encode_image_to_base64(raw_image_data) 42 | 43 | payload = { 44 | "model": "gpt-4-vision-preview", 45 | "messages": [ 46 | { 47 | "role": "user", 48 | "content": [ 49 | {"type": "text", "text": text_prompt}, 50 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}, 51 | ], 52 | } 53 | ], 54 | "max_tokens": self.max_new_tokens, 55 | } 56 | 57 | retry = True 58 | retry_times = 0 59 | while retry and retry_times < 5: 60 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) 61 | if response.status_code == 200: 62 | response_data = response.json() 63 | return response_data["choices"][0]["message"]["content"] 64 | else: 65 | print(f"Failed to connect to OpenAI API: {response.status_code} - {response.text}. 
Retrying...") 66 | time.sleep(10) 67 | retry_times += 1 68 | return "Failed to connect to OpenAI GPT4V API" 69 | 70 | def eval_forward(self, **kwargs): 71 | return super().eval_forward(**kwargs) 72 | 73 | 74 | if __name__ == "__main__": 75 | # Use your own API key here 76 | api_key = "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" 77 | model = OpenAIGPT4Vision(api_key) 78 | image = Image.open("/home/luodian/projects/Otter/archived/data/G4_IMG_00001.png").convert("RGB") 79 | print(model.generate("What’s in this image?", image)) 80 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/instructblip.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration 2 | from PIL import Image 3 | from .base_model import BaseModel 4 | import torch 5 | import numpy as np 6 | import warnings 7 | import io 8 | import base64 9 | 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | def get_pil_image(raw_image_data) -> Image.Image: 14 | if isinstance(raw_image_data, Image.Image): 15 | return raw_image_data 16 | 17 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 18 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 19 | 20 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 21 | image_bytes = base64.b64decode(raw_image_data) 22 | return Image.open(io.BytesIO(image_bytes)) 23 | 24 | else: 25 | raise ValueError("Unsupported image data format") 26 | 27 | 28 | class InstructBLIP(BaseModel): 29 | def __init__(self, model_path: str = "Salesforce/instructblip-vicuna-7b", cuda_id: int = 0, max_new_tokens=32): 30 | super().__init__("instructblip", model_path) 31 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 32 | self.model = InstructBlipForConditionalGeneration.from_pretrained(model_path).to(self.device) 33 | self.processor = InstructBlipProcessor.from_pretrained(model_path) 34 | self.max_new_tokens = max_new_tokens 35 | 36 | def generate(self, text_prompt: str, raw_image_data: str): 37 | raw_image_data = get_pil_image(raw_image_data) 38 | raw_image_data = raw_image_data.convert("RGB") 39 | formatted_prompt = f"{text_prompt}\nAnswer:" 40 | # Accordling to https://huggingface.co/Salesforce/instructblip-vicuna-7b . 
Seems that is is no special prompt format for instruct blip 41 | model_inputs = self.processor(images=raw_image_data, text=formatted_prompt, return_tensors="pt").to(self.device) 42 | # We follow the recommended parameter here:https://huggingface.co/Salesforce/instructblip-vicuna-7b 43 | generation_output = self.model.generate(**model_inputs, do_sample=False, max_new_tokens=self.max_new_tokens, min_length=1) 44 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 45 | return generation_text[0] 46 | 47 | def eval_forward(self, question, answer, image): 48 | raise NotImplementedError 49 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/llama_adapter.py: -------------------------------------------------------------------------------- 1 | from .LLaMA_Adapter.imagebind_LLM.ImageBind import data as data_utils 2 | from .LLaMA_Adapter.imagebind_LLM import llama 3 | 4 | from .base_model import BaseModel 5 | 6 | import os 7 | 8 | 9 | llama_dir = "/mnt/petrelfs/share_data/zhangyuanhan/llama_adapter_v2_multimodal" 10 | 11 | 12 | class LlamaAdapter(BaseModel): 13 | # checkpoint will be automatically downloaded 14 | def __init__(self, model_path: str): 15 | super().__init__("llama_adapter", model_path) 16 | self.model = llama.load(model_path, llama_dir) 17 | self.model.eval() 18 | 19 | def generate(self, input_data): 20 | inputs = {} 21 | video_dir = input_data.get("video_root", "") 22 | image = data_utils.load_and_transform_video_data([input_data["video_path"]], device="cuda") 23 | inputs["Image"] = [image, 1] 24 | 25 | object_description = input_data["object_description"] 26 | if object_description != "None": 27 | context = f"Given context:{object_description}. " 28 | else: 29 | context = "" 30 | prompts_input = context + input_data["question"] 31 | 32 | results = self.model.generate(inputs, [llama.format_prompt(prompts_input)], max_gen_len=256) 33 | result = results[0].strip() 34 | return result 35 | 36 | 37 | if __name__ == "__main__": 38 | model = LlamaAdapter("", "") 39 | data = { 40 | "video_idx": "03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4", 41 | "question": "What is in this image?", 42 | } 43 | print(model.generate(data)) 44 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/llava_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torchvision.transforms as T 4 | from torchvision.io import read_video 5 | 6 | from .base_model import BaseModel 7 | from .llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 8 | from .llava.conversation import conv_templates, SeparatorStyle 9 | from .llava.model.builder import load_pretrained_model 10 | from .llava.utils import disable_torch_init 11 | from .llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 12 | 13 | default_model_path = "liuhaotian/llava-v1.5-7b" 14 | 15 | 16 | class LLaVA_Model(BaseModel): 17 | def __init__( 18 | self, 19 | model_path: str = default_model_path, 20 | model_base: str = None, 21 | model_name: str = "llava-v1.5", 22 | conv_mode: str = "llava_v1", 23 | ): 24 | super().__init__(model_name, model_path) 25 | init_model_name = get_model_name_from_path(model_path) 26 | self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(model_path, model_base, init_model_name) 
27 | self.conv_mode = conv_mode 28 | 29 | def generate(self, text_prompt: str, raw_image_data: str): 30 | if self.model.config.mm_use_im_start_end: 31 | prompts_input = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + text_prompt 32 | else: 33 | prompts_input = DEFAULT_IMAGE_TOKEN + "\n" + text_prompt 34 | 35 | input_data = self.image_processor.preprocess(raw_image_data, return_tensors="pt")["pixel_values"][0] 36 | 37 | conv = conv_templates[self.conv_mode].copy() 38 | conv.append_message(conv.roles[0], prompts_input) 39 | conv.append_message(conv.roles[1], None) 40 | prompt = conv.get_prompt() 41 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() 42 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 43 | keywords = [stop_str] 44 | stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) 45 | 46 | with torch.inference_mode(): 47 | output_ids = self.model.generate( 48 | input_ids, 49 | images=input_data.unsqueeze(0).half().cuda(), 50 | do_sample=True, 51 | temperature=0.2, 52 | top_p=None, 53 | num_beams=1, 54 | # no_repeat_ngram_size=3, 55 | max_new_tokens=512, 56 | use_cache=True, 57 | ) 58 | 59 | input_token_len = input_ids.shape[1] 60 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 61 | if n_diff_input_output > 0: 62 | print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids") 63 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 64 | outputs = outputs.strip() 65 | if outputs.endswith(stop_str): 66 | outputs = outputs[: -len(stop_str)] 67 | outputs = outputs.strip() 68 | 69 | return outputs 70 | 71 | def eval_forward(self, text_prompt: str, raw_image_data: str): 72 | pass 73 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/mplug_owl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from transformers import AutoTokenizer 5 | from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration 6 | from mplug_owl_video.processing_mplug_owl import ( 7 | MplugOwlImageProcessor, 8 | MplugOwlProcessor, 9 | ) 10 | 11 | from .base_model import BaseModel 12 | 13 | pretrained_ckpt = "MAGAer13/mplug-owl-llama-7b-video" 14 | 15 | 16 | class mPlug_owl(BaseModel): 17 | def __init__(self, model_path: str): 18 | super().__init__("mplug_owl", model_path) 19 | self.model = MplugOwlForConditionalGeneration.from_pretrained( 20 | pretrained_ckpt, 21 | torch_dtype=torch.bfloat16, 22 | ) 23 | self.image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt) 24 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt) 25 | self.processor = MplugOwlProcessor(self.image_processor, self.tokenizer) 26 | self.model.eval() 27 | 28 | def format_prompt(self, question): 29 | prompts = [f" <|video|> Question : {question} Answer : "] 30 | return prompts 31 | 32 | def generate(self, input_data: dict): 33 | questions = input_data["question"] 34 | video_dir = input_data.get("video_root", "") 35 | video_list = input_data["video_path"] 36 | generate_kwargs = {"do_sample": True, "top_k": 5, "max_length": 512} 37 | 38 | object_description = input_data["object_description"] 39 | if object_description != "None": 40 | context = f"Given context:{object_description}. 
" 41 | else: 42 | context = "" 43 | prompts_input = context + input_data["question"] 44 | 45 | prompt = self.format_prompt(prompts_input) 46 | inputs = self.processor(text=prompt, videos=video_list, num_frames=4, return_tensors="pt") 47 | inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()} 48 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 49 | with torch.no_grad(): 50 | res = self.model.generate(**inputs, **generate_kwargs) 51 | sentence = self.tokenizer.decode(res.tolist()[0], skip_special_tokens=True) 52 | return sentence 53 | 54 | 55 | if __name__ == "__main__": 56 | model = mPlug_owl("") 57 | device = torch.device("cuda") 58 | model.model = model.model.to(device) 59 | data = { 60 | "video_idx": ["./data_source/multi_hop_reasoning/03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4"], 61 | "question": "What is in this image?", 62 | } 63 | print(model.generate(data)) 64 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/otterhd.py: -------------------------------------------------------------------------------- 1 | from transformers import FuyuForCausalLM, AutoTokenizer, FuyuImageProcessor, FuyuProcessor 2 | from PIL import Image 3 | from .base_model import BaseModel 4 | import torch 5 | import numpy as np 6 | import warnings 7 | import io 8 | import base64 9 | 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | def get_pil_image(raw_image_data) -> Image.Image: 14 | if isinstance(raw_image_data, Image.Image): 15 | return raw_image_data 16 | 17 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 18 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 19 | 20 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 21 | image_bytes = base64.b64decode(raw_image_data) 22 | return Image.open(io.BytesIO(image_bytes)) 23 | 24 | else: 25 | raise ValueError("Unsupported image data format") 26 | 27 | 28 | import math 29 | 30 | 31 | class OtterHD(BaseModel): 32 | def __init__(self, model_path: str = "Otter-AI/OtterHD-8B", cuda_id: int = 0, resolution: int = -1, max_new_tokens=256): 33 | super().__init__("otterhd", model_path) 34 | self.resolution = resolution 35 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 36 | self.model = FuyuForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=self.device) 37 | self.model.eval() 38 | self.tokenizer = AutoTokenizer.from_pretrained(model_path) 39 | self.image_processor = FuyuImageProcessor() 40 | self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer) 41 | self.max_new_tokens = max_new_tokens 42 | 43 | def generate(self, text_prompt: str, raw_image_data: str): 44 | raw_image_data = get_pil_image(raw_image_data) 45 | # make sure the image is in RGB format and resize to match the width 46 | raw_image_data = raw_image_data.convert("RGB") 47 | if self.resolution != -1: 48 | width, height = raw_image_data.size 49 | short_edge = min(width, height) 50 | scaling_factor = self.resolution / short_edge 51 | new_width = math.ceil(width * scaling_factor) 52 | new_height = math.ceil(height * scaling_factor) 53 | raw_image_data = raw_image_data.resize((new_width, new_height), Image.ANTIALIAS) 54 | 55 | formated_prompt = f"User: {text_prompt} Assistant:" 56 | model_inputs = self.processor(text=formated_prompt, images=[raw_image_data], device=self.device) 57 | for k, v in model_inputs.items(): 58 | model_inputs[k] = v.to(self.device, 
non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v] 59 | 60 | model_inputs["image_patches"][0] = model_inputs["image_patches"][0].to(dtype=next(self.model.parameters()).dtype) 61 | generation_output = self.model.generate(**model_inputs, max_new_tokens=self.max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) 62 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 63 | response = generation_text[0].split("\x04")[1].strip(" ").strip("\n") 64 | return response 65 | 66 | def eval_forward(self, text_prompt: str, image_path: str): 67 | # Similar to the Idefics' eval_forward but adapted for Fuyu 68 | pass 69 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from transformers.generation import GenerationConfig 6 | 7 | from .base_model import BaseModel 8 | 9 | default_path = "Qwen/Qwen-VL-Chat" 10 | 11 | 12 | class QwenVL(BaseModel): 13 | def __init__(self, model_name: str = "qwen_vl", model_path: str = default_path): 14 | super().__init__(model_name, model_path) 15 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 17 | self.model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 18 | self.temp_dir = ".log/temp" 19 | if not os.path.exists(self.temp_dir): 20 | os.makedirs(self.temp_dir) 21 | 22 | def generate(self, text_prompt: str, raw_image_data: str): 23 | image_path = os.path.join(self.temp_dir, "temp.jpg") 24 | raw_image_data.save(image_path) 25 | query = [] 26 | query.append({"image": image_path}) 27 | query.append({"text": text_prompt}) 28 | query = self.tokenizer.from_list_format(query) 29 | response, history = self.model.chat(self.tokenizer, query=query, history=None) 30 | return response 31 | 32 | def eval_forward(self, text_prompt: str, image_path: str): 33 | # Similar to the Idefics' eval_forward but adapted for QwenVL 34 | pass 35 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/video_chatgpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .video_chatgpt.eval.model_utils import load_video, initialize_model 3 | from .video_chatgpt.inference import video_chatgpt_infer 4 | 5 | from .base_model import BaseModel 6 | 7 | model_name = "/mnt/lustre/yhzhang/kaichen/video_ChatGPT/LLaVA-Lightening-7B-v1-1." 8 | projection_path = "/mnt/lustre/yhzhang/kaichen/video_ChatGPT/video_chatgpt-7B.bin" 9 | 10 | 11 | class Video_ChatGPT(BaseModel): 12 | def __init__(self, model_path: str): 13 | super().__init__("video_chatgpt", model_path) 14 | ( 15 | self.model, 16 | self.vision_tower, 17 | self.tokenizer, 18 | self.image_processor, 19 | self.video_token_len, 20 | ) = initialize_model(model_name, projection_path) 21 | 22 | def generate(self, input_data: dict): 23 | video_dir = input_data.get("video_root", "") 24 | video_frames = load_video(input_data["video_path"]) 25 | 26 | object_description = input_data["object_description"] 27 | if object_description != "None": 28 | context = f"Given context:{object_description}. 
" 29 | else: 30 | context = "" 31 | prompts_input = context + input_data["question"] 32 | 33 | output = video_chatgpt_infer( 34 | video_frames, prompts_input, conv_mode="video-chatgpt_v1", model=self.model, vision_tower=self.vision_tower, tokenizer=self.tokenizer, image_processor=self.image_processor, video_token_len=self.video_token_len 35 | ) 36 | return output 37 | 38 | 39 | if __name__ == "__main__": 40 | model = Video_ChatGPT("") 41 | device = torch.device("cuda") 42 | model.model = model.model.to(device) 43 | model.vision_tower = model.vision_tower.to(device) 44 | data = { 45 | "video_idx": "./data_source/multi_hop_reasoning/03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4", 46 | "question": "What is in this image?", 47 | } 48 | print(model.generate(data)) 49 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/README.md: -------------------------------------------------------------------------------- 1 | # OpenFlamingo Evaluation Suite 2 | 3 | This is the evaluation module of OpenFlamingo. It contains a set of utilities for evaluating multimodal models on various benchmarking datasets. 4 | 5 | *This module is a work in progress! We will be updating this README as it develops. In the meantime, if you notice an issue, please file a Bug Report or Feature Request [here](https://github.com/mlfoundations/open_flamingo/issues/new/choose).* 6 | 7 | ## Supported datasets 8 | 9 | |Dataset|Task|Metric|Evaluation method| 10 | |-------|----|------|-----------------| 11 | |[COCO](https://arxiv.org/abs/1405.0312)|Captioning|CIDEr|Generation| 12 | |[Flickr-30K](https://aclanthology.org/Q14-1006/)|Captioning|CIDEr|Generation| 13 | |[VQAv2](https://arxiv.org/abs/1612.00837v3)|VQA|VQA accuracy|Generation| 14 | |[OK-VQA](https://arxiv.org/abs/1906.00067)|VQA|VQA accuracy|Generation| 15 | |[TextVQA](https://arxiv.org/abs/1904.08920)|VQA|VQA accuracy|Generation| 16 | |[VizWiz](https://arxiv.org/abs/1802.08218)|VQA|VQA accuracy|Generation| 17 | |[Hateful Memes](https://arxiv.org/abs/2005.04790)|Classification|ROC AUC|Logprobs| 18 | |[ImageNet](https://arxiv.org/abs/1409.0575)|Classification|Top-1 accuracy|Logprobs| 19 | 20 | When evaluating a model using `num_shots` shots, we sample the exemplars from the training split. Performance is evaluated on a disjoint test split, subsampled to `--num_samples` examples (or using the full test split if `--num_samples=-1`). 21 | 22 | Warning: we have found that classification evaluation does not work as well as expected for MPT models (e.g. OpenFlamingo-3B, OpenFlamingo-9B) and we are not sure why yet. We will update this README when we have more information. 23 | 24 | ## Sample scripts 25 | Our codebase uses DistributedDataParallel to parallelize evaluation by default, so please make sure to set the `MASTER_ADDR` and `MASTER_PORT` environment variables or use `torchrun`. We provide a sample Slurm evaluation script in `open_flamingo/open_flamingo/scripts/run_eval.sh`. 26 | 27 | We also support evaluating at a lower precision using the `--precision` flag. We find minimal difference between evaluating at full precision vs. amp_bf16. 
28 | 29 | To evaluate one of our pretrained checkpoints, we suggest first downloading a local copy of the weights, as follows: 30 | 31 | ``` 32 | # grab model checkpoint from huggingface hub 33 | from huggingface_hub import hf_hub_download 34 | HF_TOKEN="" 35 | 36 | checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt") 37 | checkpoint_path= hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", 38 | "checkpoint.pt", 39 | local_dir="openflamingo/OpenFlamingo-3B-vitl-mpt1b", 40 | cache_dir="openflamingo/OpenFlamingo-3B-vitl-mpt1b", 41 | local_dir_use_symlinks=False, 42 | token=HF_TOKEN) 43 | print(checkpoint_path) 44 | ## openflamingo/OpenFlamingo-3B-vitl-mpt1b/checkpoint.pt 45 | ``` 46 | 47 | This should place the OpenFlamingo model at the expected location in the evaluation script. 48 | 49 | For TextVQA and VizWiz we expect annotations to be formatted differently than the original datasets. We provide the custom annotations in `open_flamingo/open_flamingo/eval/data/`. -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/public_datasets_suite/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/coco_metric.py: -------------------------------------------------------------------------------- 1 | from pycocoevalcap.eval import COCOEvalCap 2 | from pycocotools.coco import COCO 3 | 4 | 5 | def compute_cider( 6 | result_path, 7 | annotations_path, 8 | ): 9 | # create coco object and coco_result object 10 | coco = COCO(annotations_path) 11 | coco_result = coco.loadRes(result_path) 12 | 13 | # create coco_eval object by taking coco and coco_result 14 | coco_eval = COCOEvalCap(coco, coco_result) 15 | coco_eval.params["image_id"] = coco_result.getImgIds() 16 | coco_eval.evaluate() 17 | 18 | return coco_eval.eval 19 | 20 | 21 | def postprocess_captioning_generation(predictions): 22 | return predictions.split("Output", 1)[0] 23 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: otter 3 | path: /data/bli/checkpoints/OTTER-Image-MPT7B 4 | checkpoint: /data/bli/checkpoints/OTTER-Image-MPT7B/final_weights.pt 5 | device_map: auto 6 | precision: fp32 7 | batch_size: 8 8 | 9 | dataset: 10 | coco: 11 | test: true 12 | train_image: /path/to/mscoco_karpathy/train2014 13 | val_image: /path/to/mscoco_karpathy/val2014 14 | karpathy: /path/to/mscoco_karpathy/dataset_coco.json 15 | annotations: /path/to/mscoco_karpathy/annotations/captions_val2014.json 16 | vqav2: 17 | test: false 18 | train_image: /path/to/vqav2/train2014 19 | train_annotations: /path/to/vqav2/v2_mscoco_train2014_annotations.json 20 | train_questions: /path/to/vqav2/v2_OpenEnded_mscoco_train2014_questions.json 21 | test_image: /path/to/vqav2/val2014 22 | test_annotations: /path/to/vqav2/v2_mscoco_val2014_annotations.json 23 | test_questions: /path/to/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 24 | flickr: 25 | test: false 26 | image: /path/to/flickr30k/flickr30k-images 27 | karpathy: /path/to/flickr30k/dataset_flickr30k.json 28 | annotations: 
/path/to/flickr30k/dataset_flickr30k_coco_style.json 29 | ok_vqa: 30 | test: false 31 | train_image: /path/to/okvqa/train2014 32 | train_annotations: /path/to/okvqa/mscoco_train2014_annotations.json 33 | train_questions: /path/to/okvqa/OpenEnded_mscoco_train2014_questions.json 34 | test_image: /path/to/okvqa/val2014 35 | test_annotations: /path/to/okvqa/mscoco_val2014_annotations.json 36 | test_questions: /path/to/okvqa/OpenEnded_mscoco_val2014_questions.json 37 | textvqa: 38 | test: false 39 | train_image: /path/to/textvqa/train_images 40 | train_annotations: /path/to/textvqa/train_annotations_vqa_format.json 41 | train_questions: /path/to/textvqa/train_questions_vqa_format.json 42 | test_image: /path/to/textvqa/val_images 43 | test_annotations: /path/to/textvqa/val_annotations_vqa_format.json 44 | test_questions: /path/to/textvqa/val_questions_vqa_format.json 45 | vizwiz: 46 | test: false 47 | train_image: /path/to/vizwiz/train 48 | train_annotations: /path/to/vizwiz/train_annotations_vqa_format.json 49 | train_questions: /path/to/vizwiz/train_questions_vqa_format.json 50 | test_image: /path/to/vizwiz/val 51 | test_annotations: /path/to/vizwiz/val_annotations_vqa_format.json 52 | test_questions: /path/to/vizwiz/val_questions_vqa_format.json 53 | hateful_memes: 54 | test: false 55 | image: /path/to/hateful_memes/img 56 | train_annotations: /path/to/hateful_memes/train.json 57 | test_annotations: /path/to/hateful_memes/dev.json 58 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/eval_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import argparse 3 | from typing import List 4 | from torch.nn.parallel import DistributedDataParallel as DDP 5 | from PIL import Image 6 | 7 | 8 | class BaseEvalModel(abc.ABC): 9 | """Base class encapsulating functionality needed to evaluate a model.""" 10 | 11 | def __init__(self, args: List[str]): 12 | """Initialize model. 13 | 14 | Args: 15 | args: arguments to model. These should be parsed, or if the model 16 | has no applicable arguments, an error should be thrown if `args` 17 | is non-empty. 18 | """ 19 | 20 | def init_distributed(self): 21 | """Wrap model as DDP.""" 22 | self.model = DDP(self.model, device_ids=[self.device]) 23 | 24 | def set_device(self, device): 25 | """Set device for model.""" 26 | self.device = device 27 | self.model = self.model.to(device) 28 | 29 | def get_outputs( 30 | self, 31 | batch_text: List[str], 32 | batch_images: List[List[Image.Image]], 33 | min_generation_length: int, 34 | max_generation_length: int, 35 | num_beams: int, 36 | length_penalty: float, 37 | ) -> List[str]: 38 | """Get outputs for a batch of images and text. 39 | 40 | Args: 41 | batch_text: list of text strings, with the text "" in place 42 | of any images to be included. 43 | batch_images: images to provide to model. Should be a list of lists, 44 | where each list contains the images for a single example. 45 | max_generation_length: maximum length of the generated caption. 46 | Defaults to 10. 47 | num_beams: number of beams to use for beam search. Defaults to 3. 48 | length_penalty: length penalty for beam search. Defaults to -2.0. 49 | 50 | Returns: 51 | List of decoded output strings. 52 | """ 53 | 54 | def vqa_prompt(self, question, answer=None) -> str: 55 | """Get the prompt to use for VQA evaluation. If the answer is not provided, it should be left blank to be generated by the model. 
56 | 57 | Returns: 58 | The prompt to use for VQA. 59 | """ 60 | 61 | def caption_prompt(self, caption=None) -> str: 62 | """Get the prompt to use for caption evaluation. If the caption is not provided, it should be left blank to be generated by the model. 63 | 64 | Returns: 65 | The prompt to use for captioning. 66 | """ 67 | 68 | def classification_prompt(self, class_str=None) -> str: 69 | """Get the prompt to use for classification evaluation. If the class_str is not provided, it should be left blank to be generated by the model. 70 | 71 | Returns: 72 | The prompt to use for classification. 73 | """ 74 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/get_args.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "file_name = \"./run_eval_coco.sh\"\n", 19 | "args = []\n", 20 | "with open(file_name, \"r\") as f:\n", 21 | " lines = f.readlines()\n", 22 | " for line in lines:\n", 23 | " line = line.strip()\n", 24 | " if line.endswith(\"\\\\\"):\n", 25 | " line = line[:-1].strip()\n", 26 | " if line.startswith(\"--\"):\n", 27 | " args.append(line)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "print(json.dumps(args, indent=4))" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.10.12" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/public_datasets_suite/models/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/blip.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from PIL import Image 4 | import torch 5 | 6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration 7 | from pipeline.benchmarks.public_datasets_suite.eval_model import BaseEvalModel 8 | from pipeline.benchmarks.public_datasets_suite.models.utils import unwrap_model 9 | 10 | 11 | class EvalModel(BaseEvalModel): 12 | """BLIP-2 model evaluation. 13 | 14 | Attributes: 15 | model (nn.Module): Underlying Torch model. 16 | tokenizer (transformers.PreTrainedTokenizer): Tokenizer for model. 
17 | device: Index of GPU to use, or the string "cpu" 18 | """ 19 | 20 | def __init__(self, model_args): 21 | assert "processor_path" in model_args and "lm_path" in model_args and "device" in model_args, "BLIP-2 requires processor_path, lm_path, and device arguments to be specified" 22 | 23 | self.device = int(model_args["device"]) if ("device" in model_args and model_args["device"] >= 0) else "cpu" 24 | self.processor = Blip2Processor.from_pretrained(model_args["processor_path"]) 25 | self.model = Blip2ForConditionalGeneration.from_pretrained(model_args["lm_path"]) 26 | self.model.to(self.device) 27 | self.model.eval() 28 | self.processor.tokenizer.padding_side = "left" 29 | 30 | def _prepare_images(self, batch: List[List[torch.Tensor]]) -> torch.Tensor: 31 | """Preprocess images and stack them. 32 | 33 | Args: 34 | batch: A list of lists of images. 35 | 36 | Returns: 37 | A Tensor of shape 38 | (batch_size, channels, height, width). 39 | """ 40 | batch_images = None 41 | assert all(len(example) == 1 for example in batch), "BLIP-2 only supports one image per example" 42 | 43 | for example in batch: 44 | assert len(example) == 1, "BLIP-2 only supports one image per example" 45 | batch_images = torch.cat( 46 | [ 47 | batch_images, 48 | self.processor.image_processor(example, return_tensors="pt")["pixel_values"], 49 | ] 50 | if batch_images is not None 51 | else [self.processor.image_processor(example, return_tensors="pt")["pixel_values"]], 52 | dim=0, 53 | ) 54 | return batch_images 55 | 56 | def get_outputs( 57 | self, 58 | batch_text: List[str], 59 | batch_images: List[List[Image.Image]], 60 | max_generation_length: int, 61 | num_beams: int, 62 | length_penalty: float, 63 | ) -> List[str]: 64 | encodings = self.processor.tokenizer( 65 | batch_text, 66 | padding="longest", 67 | truncation=True, 68 | return_tensors="pt", 69 | max_length=2000, 70 | ) 71 | input_ids = encodings["input_ids"] 72 | attention_mask = encodings["attention_mask"] 73 | 74 | with torch.inference_mode(): 75 | outputs = unwrap_model(self.model).generate( 76 | self._prepare_images(batch_images).to(self.device), 77 | input_ids.to(self.device), 78 | attention_mask=attention_mask.to(self.device), 79 | max_new_tokens=max_generation_length, 80 | min_new_tokens=8, 81 | num_beams=num_beams, 82 | length_penalty=length_penalty, 83 | ) 84 | 85 | return self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True) 86 | 87 | def get_vqa_prompt(self, question, answer=None) -> str: 88 | return f"Question:{question} Short answer:{answer if answer is not None else ''}" 89 | 90 | def get_caption_prompt(self, caption=None) -> str: 91 | return f"A photo of {caption if caption is not None else ''}" 92 | 93 | def get_classification_prompt(self, class_str=None) -> str: 94 | raise NotImplementedError 95 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def unwrap_model(model): 5 | """ 6 | Unwrap a model from a DataParallel or DistributedDataParallel wrapper. 
7 | """ 8 | if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): 9 | return model.module 10 | else: 11 | return model 12 | -------------------------------------------------------------------------------- /pipeline/demos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/demos/__init__.py -------------------------------------------------------------------------------- /pipeline/demos/demo_utils.py: -------------------------------------------------------------------------------- 1 | import mimetypes 2 | import sys 3 | from typing import Union 4 | 5 | import requests 6 | from PIL import Image 7 | 8 | requests.packages.urllib3.disable_warnings() 9 | 10 | 11 | # --- Utility Functions --- 12 | def print_colored(text, color_code): 13 | end_code = "\033[0m" # Reset to default color 14 | print(f"{color_code}{text}{end_code}") 15 | 16 | 17 | def get_content_type(file_path): 18 | content_type, _ = mimetypes.guess_type(file_path) 19 | return content_type 20 | 21 | 22 | def get_image(url: str) -> Union[Image.Image, list]: 23 | if not url.strip(): # Blank input, return a blank Image 24 | return Image.new("RGB", (224, 224)) # Assuming 224x224 is the default size for the model. Adjust if needed. 25 | elif "://" not in url: # Local file 26 | content_type = get_content_type(url) 27 | else: # Remote URL 28 | content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type") 29 | 30 | if "image" in content_type: 31 | if "://" not in url: # Local file 32 | return Image.open(url) 33 | else: # Remote URL 34 | return Image.open(requests.get(url, stream=True, verify=False).raw) 35 | else: 36 | raise ValueError("Invalid content type. Expected image.") 37 | -------------------------------------------------------------------------------- /pipeline/demos/inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import sys 5 | 6 | import requests 7 | import yaml 8 | 9 | from .demo_models import TestIdefics, TestOtter, TestOtterHD 10 | from .demo_utils import get_image, print_colored 11 | 12 | requests.packages.urllib3.disable_warnings() 13 | 14 | import pytz 15 | 16 | # Initialize the time zone 17 | utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc. 
18 | # Get the current time in UTC 19 | utc_now = pytz.utc.localize(datetime.datetime.utcnow()) 20 | # Convert to UTC+8 21 | utc_plus_8_time = utc_now.astimezone(utc_plus_8) 22 | 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--model_name", type=str, default="otter", required=True, help="The model name.") 27 | parser.add_argument("--checkpoint", type=str, help="The path to the checkpoint.") 28 | parser.add_argument("--output_dir", type=str, help="The dir path to the output file.", default="./logs") 29 | parser.add_argument("--yaml_file", type=str, help="The dir path to the eval yaml, contains question, answer pairs.", default="") 30 | args = parser.parse_args() 31 | return args 32 | 33 | 34 | def eval_yaml(args, yaml_file, model): 35 | with open(yaml_file, "r") as file: 36 | test_data_list = yaml.safe_load(file) 37 | 38 | cur_date = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S") 39 | log_json_path = f"{args.output_dir}/inference_log_{cur_date}.json" 40 | log_json = { 41 | "model_name": args.model_name, 42 | "checkpoint": args.checkpoint, 43 | "results": {}, 44 | } 45 | for test_id, test_data in enumerate(test_data_list): 46 | image_path = test_data.get("image_path", "") 47 | question = test_data.get("question", "") 48 | 49 | image = get_image(image_path) 50 | no_image_flag = not bool(image_path) 51 | 52 | response = model.generate(prompt=question, image=image, no_image_flag=no_image_flag) 53 | 54 | # Print results to console 55 | print(f"image_path: {image_path}") 56 | print_colored(f"question: {question}", color_code="\033[92m") 57 | print_colored(f"answer: {response}", color_code="\033[94m") 58 | print("-" * 150) 59 | 60 | log_json["results"].update( 61 | { 62 | str(test_id).zfill(3): { 63 | "image_path": image_path, 64 | "question": question, 65 | "answer": response, 66 | } 67 | } 68 | ) 69 | 70 | with open(log_json_path, "w") as file: 71 | json.dump(log_json, file, indent=4, sort_keys=False) 72 | 73 | 74 | def main(): 75 | args = parse_args() 76 | if args.model_name == "otter": 77 | model = TestOtter(checkpoint=args.checkpoint) 78 | elif args.model_name == "otterhd": 79 | model = TestOtterHD(checkpoint=args.checkpoint) 80 | elif args.model_name == "idefics": 81 | model = TestIdefics(checkpoint=args.checkpoint) 82 | else: 83 | raise NotImplementedError(f"model_name: {args.model_name} is not implemented.") 84 | 85 | if args.yaml_file: 86 | eval_yaml(args, args.yaml_file, model) 87 | else: 88 | while True: 89 | yaml_file = input("Enter the path to the yaml file: (or 'q' to quit): ") 90 | if yaml_file == "q": 91 | break 92 | eval_yaml(args, yaml_file, model) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /pipeline/serve/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/serve/deploy/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import os 4 | import sys 5 | 6 | handler = None 7 | 8 | 9 | def build_logger(logger_name, logger_dir): 10 | global handler 11 | 12 | formatter = logging.Formatter( 13 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 14 | datefmt="%Y-%m-%d %H:%M:%S", 15 | ) 16 | 17 | # Set the format of root handlers 18 | if not logging.getLogger().handlers: 19 | 
logging.basicConfig(level=logging.INFO) 20 | logging.getLogger().handlers[0].setFormatter(formatter) 21 | 22 | # Redirect stdout and stderr to loggers 23 | stdout_logger = logging.getLogger("stdout") 24 | stdout_logger.setLevel(logging.INFO) 25 | sl = StreamToLogger(stdout_logger, logging.INFO) 26 | sys.stdout = sl 27 | 28 | stderr_logger = logging.getLogger("stderr") 29 | stderr_logger.setLevel(logging.ERROR) 30 | sl = StreamToLogger(stderr_logger, logging.ERROR) 31 | sys.stderr = sl 32 | 33 | # Get logger 34 | logger = logging.getLogger(logger_name) 35 | logger.setLevel(logging.INFO) 36 | 37 | # Add a file handler for all loggers 38 | if handler is None: 39 | os.makedirs(logger_dir, exist_ok=True) 40 | filename = os.path.join(logger_dir, logger_name + ".log") 41 | handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True) 42 | handler.setFormatter(formatter) 43 | 44 | for name, item in logging.root.manager.loggerDict.items(): 45 | if isinstance(item, logging.Logger): 46 | item.addHandler(handler) 47 | 48 | return logger 49 | 50 | 51 | class StreamToLogger(object): 52 | """ 53 | Fake file-like stream object that redirects writes to a logger instance. 54 | """ 55 | 56 | def __init__(self, logger, log_level=logging.INFO): 57 | self.terminal = sys.stdout 58 | self.logger = logger 59 | self.log_level = log_level 60 | self.linebuf = "" 61 | 62 | def __getattr__(self, attr): 63 | return getattr(self.terminal, attr) 64 | 65 | def write(self, buf): 66 | temp_linebuf = self.linebuf + buf 67 | self.linebuf = "" 68 | for line in temp_linebuf.splitlines(True): 69 | # From the io.TextIOWrapper docs: 70 | # On output, if newline is None, any '\n' characters written 71 | # are translated to the system default line separator. 72 | # By default sys.stdout.write() expects '\n' newlines and then 73 | # translates them so this is still cross platform. 
74 | if line[-1] == "\n": 75 | self.logger.log(self.log_level, line.rstrip()) 76 | else: 77 | self.linebuf += line 78 | 79 | def flush(self): 80 | if self.linebuf != "": 81 | self.logger.log(self.log_level, self.linebuf.rstrip()) 82 | self.linebuf = "" 83 | -------------------------------------------------------------------------------- /pipeline/serve/gradio_css.py: -------------------------------------------------------------------------------- 1 | code_highlight_css = """ 2 | #chatbot .hll { background-color: #ffffcc } 3 | #chatbot .c { color: #408080; font-style: italic } 4 | #chatbot .err { border: 1px solid #FF0000 } 5 | #chatbot .k { color: #008000; font-weight: bold } 6 | #chatbot .o { color: #666666 } 7 | #chatbot .ch { color: #408080; font-style: italic } 8 | #chatbot .cm { color: #408080; font-style: italic } 9 | #chatbot .cp { color: #BC7A00 } 10 | #chatbot .cpf { color: #408080; font-style: italic } 11 | #chatbot .c1 { color: #408080; font-style: italic } 12 | #chatbot .cs { color: #408080; font-style: italic } 13 | #chatbot .gd { color: #A00000 } 14 | #chatbot .ge { font-style: italic } 15 | #chatbot .gr { color: #FF0000 } 16 | #chatbot .gh { color: #000080; font-weight: bold } 17 | #chatbot .gi { color: #00A000 } 18 | #chatbot .go { color: #888888 } 19 | #chatbot .gp { color: #000080; font-weight: bold } 20 | #chatbot .gs { font-weight: bold } 21 | #chatbot .gu { color: #800080; font-weight: bold } 22 | #chatbot .gt { color: #0044DD } 23 | #chatbot .kc { color: #008000; font-weight: bold } 24 | #chatbot .kd { color: #008000; font-weight: bold } 25 | #chatbot .kn { color: #008000; font-weight: bold } 26 | #chatbot .kp { color: #008000 } 27 | #chatbot .kr { color: #008000; font-weight: bold } 28 | #chatbot .kt { color: #B00040 } 29 | #chatbot .m { color: #666666 } 30 | #chatbot .s { color: #BA2121 } 31 | #chatbot .na { color: #7D9029 } 32 | #chatbot .nb { color: #008000 } 33 | #chatbot .nc { color: #0000FF; font-weight: bold } 34 | #chatbot .no { color: #880000 } 35 | #chatbot .nd { color: #AA22FF } 36 | #chatbot .ni { color: #999999; font-weight: bold } 37 | #chatbot .ne { color: #D2413A; font-weight: bold } 38 | #chatbot .nf { color: #0000FF } 39 | #chatbot .nl { color: #A0A000 } 40 | #chatbot .nn { color: #0000FF; font-weight: bold } 41 | #chatbot .nt { color: #008000; font-weight: bold } 42 | #chatbot .nv { color: #19177C } 43 | #chatbot .ow { color: #AA22FF; font-weight: bold } 44 | #chatbot .w { color: #bbbbbb } 45 | #chatbot .mb { color: #666666 } 46 | #chatbot .mf { color: #666666 } 47 | #chatbot .mh { color: #666666 } 48 | #chatbot .mi { color: #666666 } 49 | #chatbot .mo { color: #666666 } 50 | #chatbot .sa { color: #BA2121 } 51 | #chatbot .sb { color: #BA2121 } 52 | #chatbot .sc { color: #BA2121 } 53 | #chatbot .dl { color: #BA2121 } 54 | #chatbot .sd { color: #BA2121; font-style: italic } 55 | #chatbot .s2 { color: #BA2121 } 56 | #chatbot .se { color: #BB6622; font-weight: bold } 57 | #chatbot .sh { color: #BA2121 } 58 | #chatbot .si { color: #BB6688; font-weight: bold } 59 | #chatbot .sx { color: #008000 } 60 | #chatbot .sr { color: #BB6688 } 61 | #chatbot .s1 { color: #BA2121 } 62 | #chatbot .ss { color: #19177C } 63 | #chatbot .bp { color: #008000 } 64 | #chatbot .fm { color: #0000FF } 65 | #chatbot .vc { color: #19177C } 66 | #chatbot .vg { color: #19177C } 67 | #chatbot .vi { color: #19177C } 68 | #chatbot .vm { color: #19177C } 69 | #chatbot .il { color: #666666 } 70 | """ 71 | # .highlight { background: #f8f8f8; } 72 | 
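The code_highlight_css string above is intended to be injected into the Gradio app's stylesheet so that Pygments-style token classes rendered inside the #chatbot element get colored. A minimal usage sketch (illustrative only, not a file in the repository; assumes a standard gradio Blocks app):

import gradio as gr
from pipeline.serve.gradio_css import code_highlight_css

# The elem_id must match the "#chatbot" selectors defined in the CSS above.
with gr.Blocks(css=code_highlight_css) as demo:
    chatbot = gr.Chatbot(elem_id="chatbot")

demo.launch()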
-------------------------------------------------------------------------------- /pipeline/serve/multiplex_script/otter_image_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | # Commands to run 4 | commands = ["command1", "command2", "command3"] 5 | 6 | # Iterate over commands and execute them 7 | for command in commands: 8 | # Launch tmux with each command; this will keep the command running 9 | subprocess.Popen(["tmux", "new-session", "-d", command]) 10 | -------------------------------------------------------------------------------- /pipeline/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /pipeline/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from pipeline.serve.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name}) 21 | worker_addr = ret.json()["address"] 22 | print(f"worker_addr: {worker_addr}") 23 | 24 | if worker_addr == "": 25 | return 26 | 27 | conv = default_conversation.copy() 28 | conv.append_message(conv.roles[0], args.message) 29 | prompt = conv.get_prompt() 30 | 31 | headers = {"User-Agent": "Otter Client"} 32 | pload = { 33 | "model": args.model_name, 34 | "prompt": prompt, 35 | "max_new_tokens": args.max_new_tokens, 36 | "temperature": 0.7, 37 | "stop": conv.sep, 38 | } 39 | response = requests.post( 40 | worker_addr + "/worker_generate_stream", 41 | headers=headers, 42 | json=pload, 43 | stream=True, 44 | ) 45 | 46 | print(prompt.replace(conv.sep, "\n"), end="") 47 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 48 | if chunk: 49 | data = json.loads(chunk.decode("utf-8")) 50 | output = data["text"].split(conv.sep)[-1] 51 | print(output, end="\r") 52 | print("") 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--controller_address", type=str, default="http://localhost:21001") 58 | parser.add_argument("--worker_address", type=str) 59 | parser.add_argument("--model_name", type=str, default="facebook/opt-350m") 60 | 
parser.add_argument("--max_new_tokens", type=int, default=32) 61 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.") 62 | args = parser.parse_args() 63 | 64 | main() 65 | -------------------------------------------------------------------------------- /pipeline/train/.gitignore: -------------------------------------------------------------------------------- 1 | config.yaml -------------------------------------------------------------------------------- /pipeline/train/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/train/config.yaml: -------------------------------------------------------------------------------- 1 | mimicit_vt_path: 2 | - /data/pufanyi/training_data/SD/SD_instructions.json 3 | - /data/pufanyi/training_data/CGD/CGD_instructions.json 4 | 5 | images_vt_path: 6 | - /data/pufanyi/training_data/SD/SD.json 7 | - /data/pufanyi/training_data/CGD/CGD.json 8 | -------------------------------------------------------------------------------- /pipeline/train/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def is_global_master(args): 6 | return args.rank == 0 7 | 8 | 9 | def is_local_master(args): 10 | return args.local_rank == 0 11 | 12 | 13 | def is_master(args, local=False): 14 | return is_local_master(args) if local else is_global_master(args) 15 | 16 | 17 | def is_using_distributed(): 18 | if "WORLD_SIZE" in os.environ: 19 | return int(os.environ["WORLD_SIZE"]) > 1 20 | if "SLURM_NTASKS" in os.environ: 21 | return int(os.environ["SLURM_NTASKS"]) > 1 22 | return False 23 | 24 | 25 | def world_info_from_env(): 26 | local_rank = 0 27 | for v in ( 28 | "LOCAL_RANK", 29 | "MPI_LOCALRANKID", 30 | "SLURM_LOCALID", 31 | "OMPI_COMM_WORLD_LOCAL_RANK", 32 | ): 33 | if v in os.environ: 34 | local_rank = int(os.environ[v]) 35 | break 36 | global_rank = 0 37 | for v in ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"): 38 | if v in os.environ: 39 | global_rank = int(os.environ[v]) 40 | break 41 | world_size = 1 42 | for v in ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"): 43 | if v in os.environ: 44 | world_size = int(os.environ[v]) 45 | break 46 | return local_rank, global_rank, world_size 47 | 48 | 49 | def init_distributed_device(args): 50 | # Distributed training = training on more than one GPU. 51 | # Works in both single and multi-node scenarios. 
52 | args.distributed = False 53 | args.world_size = 1 54 | args.rank = 0 # global rank 55 | args.local_rank = 0 56 | if is_using_distributed(): 57 | if "SLURM_PROCID" in os.environ: 58 | # DDP via SLURM 59 | args.local_rank, args.rank, args.world_size = world_info_from_env() 60 | # SLURM var -> torch.distributed vars in case needed 61 | os.environ["LOCAL_RANK"] = str(args.local_rank) 62 | os.environ["RANK"] = str(args.rank) 63 | os.environ["WORLD_SIZE"] = str(args.world_size) 64 | torch.distributed.init_process_group( 65 | backend=args.dist_backend, 66 | init_method=args.dist_url, 67 | world_size=args.world_size, 68 | rank=args.rank, 69 | ) 70 | else: 71 | # DDP via torchrun, torch.distributed.launch 72 | args.local_rank, _, _ = world_info_from_env() 73 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url) 74 | args.world_size = torch.distributed.get_world_size() 75 | args.rank = torch.distributed.get_rank() 76 | args.distributed = True 77 | else: 78 | # needed to run on single gpu 79 | torch.distributed.init_process_group( 80 | backend=args.dist_backend, 81 | init_method=args.dist_url, 82 | world_size=1, 83 | rank=0, 84 | ) 85 | 86 | if torch.cuda.is_available(): 87 | if args.distributed and not args.no_set_device_rank: 88 | device = "cuda:%d" % args.local_rank 89 | else: 90 | device = "cuda:0" 91 | torch.cuda.set_device(device) 92 | else: 93 | device = "cpu" 94 | args.device = device 95 | device = torch.device(device) 96 | return device 97 | -------------------------------------------------------------------------------- /pipeline/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/utils/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | 11 | 12 | def apply_delta(base_model_path, target_model_path, delta_path): 13 | print("Loading base model") 14 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 15 | 16 | print("Loading delta") 17 | delta = AutoModelForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 19 | 20 | print("Applying delta") 21 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 22 | if name not in base.state_dict(): 23 | assert name in [ 24 | "model.mm_projector.weight", 25 | "model.mm_projector.bias", 26 | ], f"{name} not in base model" 27 | continue 28 | if param.data.shape == base.state_dict()[name].shape: 29 | param.data += base.state_dict()[name] 30 | else: 31 | assert name in [ 32 | "model.embed_tokens.weight", 33 | "lm_head.weight", 34 | ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 35 | bparam = base.state_dict()[name] 36 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 37 | 38 | print("Saving target model") 39 | delta.save_pretrained(target_model_path) 40 | delta_tokenizer.save_pretrained(target_model_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | 
parser.add_argument("--base-model-path", type=str, required=True) 46 | parser.add_argument("--target-model-path", type=str, required=True) 47 | parser.add_argument("--delta-path", type=str, required=True) 48 | 49 | args = parser.parse_args() 50 | 51 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 52 | -------------------------------------------------------------------------------- /pipeline/utils/convert_mmc4_to_wds.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | import json 4 | import os 5 | import tarfile 6 | import uuid 7 | import sys 8 | import braceexpand 9 | import webdataset as wds 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument("--output_dir", type=str) 13 | arg_parser.add_argument( 14 | "--image_shards", 15 | type=str, 16 | help="Pass in a list of shards in the format path_to_shard/shard_{0..23098}_images_v2.tar", 17 | ) 18 | arg_parser.add_argument( 19 | "--doc_shards", 20 | type=str, 21 | help="Pass in a list of shards in the format path_to_shard/docs_shard_{0..23098}_v2.jsonl", 22 | ) 23 | args = arg_parser.parse_args() 24 | 25 | from tqdm import tqdm 26 | 27 | 28 | def main(args, start_number=0): 29 | os.makedirs(args.output_dir, exist_ok=True) 30 | 31 | doc_shards = list(braceexpand.braceexpand(args.doc_shards)) 32 | image_shards = list(braceexpand.braceexpand(args.image_shards)) 33 | 34 | assert len(doc_shards) == len(image_shards), "Each doc shard must have a corresponding image shard" 35 | with wds.ShardWriter(args.output_dir + f"/%09d.tar", maxcount=30000, maxsize=1e10) as sink: 36 | for idx in tqdm(range(start_number, len(doc_shards)), desc="Converting shards"): 37 | try: 38 | image_tar = tarfile.open(image_shards[idx]) 39 | except Exception as e: 40 | print(e) 41 | continue 42 | 43 | # Read the JSONL file 44 | try: 45 | with open(doc_shards[idx], "r") as json_file: 46 | for sample_data in json_file: 47 | # get image names from json 48 | sample_data = json.loads(sample_data) 49 | image_info = sample_data["image_info"] 50 | image_names = [image["image_name"] for image in image_info] 51 | 52 | # Add each image to the tar file 53 | for img_idx, image_name in enumerate(image_names): 54 | image = image_tar.extractfile(f"{image_tar.getnames()[0]}/{image_name}") 55 | 56 | # convert to base64 57 | image_bytes = image.read() 58 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") 59 | sample_data["image_info"][img_idx]["image_base64"] = image_base64 60 | 61 | key_str = uuid.uuid4().hex 62 | sink.write({"__key__": key_str, "json": sample_data}) 63 | except Exception as e: 64 | print(e) 65 | image_tar.close() 66 | continue 67 | 68 | image_tar.close() 69 | 70 | 71 | if __name__ == "__main__": 72 | main(args=args) 73 | -------------------------------------------------------------------------------- /pipeline/utils/general.py: -------------------------------------------------------------------------------- 1 | class DualOutput: 2 | def __init__(self, file, stdout): 3 | self.file = file 4 | self.stdout = stdout 5 | 6 | def write(self, data): 7 | self.file.write(data) 8 | self.stdout.write(data) 9 | 10 | def flush(self): 11 | self.file.flush() 12 | self.stdout.flush() 13 | -------------------------------------------------------------------------------- /pipeline/utils/make_a_train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import orjson 4 | import argparse 5 | 
from tqdm import tqdm 6 | 7 | 8 | def main(input_file, output_file): 9 | # Load the JSON file 10 | with open(input_file, "rb") as file: 11 | data = orjson.loads(file.read()) 12 | 13 | # Create a set to store seen keys 14 | seen_keys = set() 15 | 16 | # Create a new dictionary with the keys from the original JSON and rel_ins_ids as values 17 | new_dict = {} 18 | for key, value in tqdm(data["data"].items()): 19 | if key not in seen_keys: 20 | try: 21 | # Check if rel_ins_ids are in the original JSON 22 | if args.remove_rel_ins_ids: 23 | valid_rel_ins_ids = [] 24 | else: 25 | valid_rel_ins_ids = [rel_ins_id for rel_ins_id in value["rel_ins_ids"] if rel_ins_id in data["data"]] 26 | 27 | # Add the valid rel_ins_ids to the new_dict 28 | new_dict[key] = valid_rel_ins_ids 29 | seen_keys.update(valid_rel_ins_ids) 30 | except Exception as e: 31 | print("Error with key %s and value %s" % (key, value)) 32 | 33 | # Write the new dictionary to a new JSON file 34 | with open(output_file, "wb") as file: 35 | file.write(orjson.dumps(new_dict)) 36 | 37 | 38 | if __name__ == "__main__": 39 | # Parse command-line arguments 40 | parser = argparse.ArgumentParser(description="Process a JSON file.") 41 | parser.add_argument("--input_file", type=str, help="Path to the input JSON file") 42 | parser.add_argument("--output_file", type=str, help="Path to the output JSON file") 43 | parser.add_argument( 44 | "--remove_rel_ins_ids", 45 | action="store_true", 46 | help="Remove rel_ins_ids from the output JSON file", 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | # Run the main function with the provided arguments 52 | main(args.input_file, args.output_file) 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 240 3 | 4 | [build-system] 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | prerun: mark a test as a prerun check. 
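The prerun marker registered in pytest.ini above is meant to tag lightweight environment checks that can be selected with pytest -m prerun (or excluded with -m "not prerun"). An illustrative test using the marker (hypothetical example, not a file in the repository):

import pytest

@pytest.mark.prerun
def test_core_dependency_importable():
    # Illustrative assumption: transformers is pinned in requirements.txt,
    # so importing it is a cheap pre-run sanity check.
    import transformers
    assert transformers.__version__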
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.19.0 2 | braceexpand>=0.1.7 3 | einops>=0.6.1 4 | einops_exts>=0.0.4 5 | fastapi>=0.95.2 6 | gradio>=3.33.1 7 | huggingface_hub>=0.13.3 8 | importlib_metadata>=6.6.0 9 | inflection>=0.5.1 10 | markdown2>=2.4.8 11 | more_itertools>=9.1.0 12 | nltk>=3.8.1 13 | numpy>=1.23.5 14 | open_clip_torch>=2.16.0 15 | openai>=1.1.1 16 | opencv_python_headless>=4.5.5.64 17 | Pillow>=9.5.0 18 | pycocoevalcap>=1 19 | pycocotools>=2.0.6 20 | Requests>=2.31.0 21 | scipy>=1.10.1 22 | timm>=0.9.2 23 | tqdm>=4.65.0 24 | transformers==4.35.1 25 | uvicorn>=0.22.0 26 | webdataset>=0.2.48 27 | natsort>=8.4.0 28 | peft>=0.4.0 29 | ijson>=3.2.3 30 | yajl>=0.3.5 31 | deepspeed>=0.10.0 32 | wandb>=0.15.8 33 | trl>=0.5.0 34 | cffi>=1.15.1 35 | pyyaml>=6.0.1 36 | pytest>=7.4.2 37 | prettytable>=3.9.0 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import json 2 | from setuptools import setup, find_packages 3 | 4 | 5 | with open("requirements.txt") as f: 6 | requirements = f.read().splitlines() 7 | 8 | setup( 9 | name="otter-ai", 10 | version="0.0.0-alpha-7", 11 | packages=find_packages(where="src"), 12 | package_dir={"": "src"}, 13 | install_requires=requirements, 14 | # package_data={ 15 | # "otter": ["resources/**/*"], 16 | # }, 17 | # include_package_data=True, 18 | author="Otter Team", 19 | author_email="drluodian@gmail.com", 20 | description="Otter: A Multi-Modal Model with In-Context Instruction Tuning", 21 | long_description=open("README.md").read(), 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/Luodian/Otter", 24 | classifiers=[ 25 | "Development Status :: 3 - Alpha", 26 | "Intended Audience :: Developers", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.9", 29 | "Programming Language :: Python :: 3.10", 30 | "Programming Language :: Python :: 3.11", 31 | ], 32 | # entry_points={ 33 | # "console_scripts": [ 34 | # # "syphus = syphus.cli.syphus_cli:main", 35 | # ], 36 | # }, 37 | ) 38 | -------------------------------------------------------------------------------- /shared_scripts/Demo_Data.yaml: -------------------------------------------------------------------------------- 1 | IMAGE_TEXT: # Group name should be in [IMAGE_TEXT, TEXT_ONLY, IMAGE_TEXT_IN_CONTEXT] 2 | LADD: # LLaVA Detailed Description, dataset name can be assigned at any name you want 3 | mimicit_path: azure_storage/json/LA/LADD_instructions.json # Path of the instruction json file 4 | images_path: azure_storage/Parquets/LA.parquet # Path of the image parquet file 5 | num_samples: -1 # Number of samples you want to use, -1 means use all samples, if not set, default is -1. 
6 | M3IT_CAPTIONING: 7 | mimicit_path: azure_storage/json/M3IT/captioning/coco/coco_instructions.json 8 | images_path: azure_storage/Parquets/coco.parquet 9 | num_samples: 20000 10 | LACR_T2T: 11 | mimicit_path: azure_storage/json/LA/LACR_T2T_instructions.json 12 | images_path: azure_storage/Parquets/LA.parquet 13 | num_samples: -1 14 | # M3IT_VQA: 15 | # mimicit_path: azure_storage/json/M3IT/vqa/vqav2/vqav2_instructions.json 16 | # images_path: azure_storage/json/M3IT/vqa/vqav2/vqav2.json 17 | # num_samples: 20000 18 | M3IT_COCOGOI: 19 | mimicit_path: azure_storage/json/M3IT/classification/coco-goi/coco-goi_instructions.json 20 | images_path: azure_storage/Parquets/coco-goi.parquet 21 | num_samples: 20000 22 | M3IT_COCOITM: 23 | mimicit_path: azure_storage/json/M3IT/classification/coco-itm/coco-itm_instructions.json 24 | images_path: azure_storage/Parquets/coco-itm.parquet 25 | num_samples: 20000 26 | M3IT_IMAGENET: 27 | mimicit_path: azure_storage/json/M3IT/classification/imagenet/imagenet_instructions.json 28 | images_path: azure_storage/Parquets/imagenet.parquet 29 | num_samples: 20000 30 | # # M3IT_IQA: 31 | # # mimicit_path: azure_storage/json/M3IT/classification/iqa/iqa_instructions.json 32 | # # images_path: azure_storage/json/M3IT/classification/iqa/iqa.json 33 | # # num_samples: 20000 34 | M3IT_REFCOCO: 35 | mimicit_path: azure_storage/json/M3IT/classification/refcoco/refcoco_instructions.json 36 | images_path: azure_storage/Parquets/refcoco.parquet 37 | num_samples: 20000 38 | # M3IT_VSR: 39 | # mimicit_path: azure_storage/json/M3IT/classification/vsr/vsr_instructions.json 40 | # images_path: azure_storage/json/M3IT/classification/vsr/vsr.json 41 | # num_samples: 20000 42 | M3IT_TEXT_VQA: 43 | mimicit_path: azure_storage/json/M3IT/vqa/text-vqa/text-vqa_instructions.json 44 | images_path: azure_storage/Parquets/text-vqa.parquet 45 | num_samples: 20000 46 | M3IT_OKVQA: 47 | mimicit_path: azure_storage/json/M3IT/vqa/okvqa/okvqa_instructions.json 48 | images_path: azure_storage/Parquets/okvqa.parquet 49 | num_samples: 20000 50 | M3IT_A_OKVQA: 51 | mimicit_path: azure_storage/json/M3IT/vqa/a-okvqa/a-okvqa_instructions.json 52 | images_path: azure_storage/Parquets/a-okvqa.parquet 53 | num_samples: 20000 54 | M3IT_SIENCEQA: 55 | mimicit_path: azure_storage/json/M3IT/reasoning/scienceqa/scienceqa_instructions.json 56 | images_path: azure_storage/Parquets/scienceqa.parquet 57 | num_samples: 20000 58 | # SVIT: 59 | # mimicit_path: azure_storage/json/SVIT/SVIT_instructions.json 60 | # images_path: azure_storage/json/SVIT/SVIT.json 61 | # num_samples: 20000 62 | # PF: 63 | # mimicit_path: azure_storage/json/PF/PF_instructions.json 64 | # images_path: azure_storage/json/PF/PF.json 65 | # num_samples: 20000 66 | 67 | # TEXT_ONLY: 68 | # LIMA: 69 | # mimicit_path: azure_storage/json/LANG_Only/LIMA/LIMA_instructions_max_1K_tokens.json 70 | # num_samples: 20000 71 | # SHAREGPT: 72 | # mimicit_path: azure_storage/json/LANG_Only/SHAREGPT/SHAREGPT_instructions_max_1K_tokens.json 73 | # num_samples: 10000 74 | # AL: 75 | # mimicit_path: azure_storage/json/LANG_Only/AL/AL_instructions_max_1K_tokens.json 76 | # num_samples: 20000 77 | 78 | 79 | -------------------------------------------------------------------------------- /shared_scripts/Demo_OtterHD.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/of/Otter 3 | 4 | export PYTHONPATH=. 
5 | 6 | # sent to sub script 7 | export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 8 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 9 | export MASTER_PORT=12955 10 | export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) 11 | export NCCL_NET=IB 12 | 13 | echo HOSTNAMES = $HOSTNAMES 14 | echo hostname = $(hostname) 15 | echo MASTER_ADDR= $MASTER_ADDR 16 | echo MASTER_PORT= $MASTER_PORT 17 | 18 | GPU=$((${COUNT_NODE} * 8)) 19 | WORKERS=$((${COUNT_NODE} * 8)) 20 | 21 | if [ $WORKERS -gt 112 ]; then 22 | WORKERS=112 23 | fi 24 | 25 | RUN_NAME="RunNamePlaceHolder" 26 | 27 | echo GPU=${GPU} 28 | echo COUNT_NODE=$COUNT_NODE 29 | echo WORKERS=8 30 | echo "Running ${RUN_NAME}" 31 | 32 | H=$(hostname) 33 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]") 34 | export THEID=$THEID 35 | echo $THEID 36 | 37 | pkill python 38 | 39 | 40 | accelerate launch --config_file=./pipeline/accelerate_configs/accelerate_config_zero2.yaml \ 41 | --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 42 | --num_machines=${COUNT_NODE} --num_processes=${GPU} \ 43 | pipeline/train/instruction_following.py \ 44 | --pretrained_model_name_or_path=adept/fuyu-8b \ 45 | --training_data_yaml=./Demo_Data.yaml \ 46 | --model_name=fuyu \ 47 | --instruction_format=fuyu \ 48 | --batch_size=8 \ 49 | --gradient_accumulation_steps=2 \ 50 | --num_epochs=3 \ 51 | --report_to_wandb \ 52 | --wandb_entity=libo0013 \ 53 | --external_save_dir=./checkpoints \ 54 | --run_name=${RUN_NAME} \ 55 | --wandb_project=Fuyu \ 56 | --workers=${WORKERS} \ 57 | --lr_scheduler=cosine \ 58 | --learning_rate=1e-5 \ 59 | --warmup_steps_ratio=0.03 \ 60 | --save_hf_model \ 61 | --max_seq_len=1024 \ 62 | --logging_steps=1000 \ 63 | --keep_symbols \ 64 | --save_ckpt_each_epoch \ 65 | --dynamic_resolution \ 66 | --with_task_description 67 | -------------------------------------------------------------------------------- /shared_scripts/Demo_OtterMPT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/of/Otter 3 | 4 | export PYTHONPATH=. 
5 | 6 | # sent to sub script 7 | export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 8 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 9 | export MASTER_PORT=12955 10 | export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) 11 | export NCCL_NET=IB 12 | 13 | echo HOSTNAMES = $HOSTNAMES 14 | echo hostname = $(hostname) 15 | echo MASTER_ADDR= $MASTER_ADDR 16 | echo MASTER_PORT= $MASTER_PORT 17 | 18 | GPU=$((${COUNT_NODE} * 8)) 19 | WORKERS=$((${COUNT_NODE} * 8)) 20 | 21 | if [ $WORKERS -gt 112 ]; then 22 | WORKERS=112 23 | fi 24 | 25 | RUN_NAME="RunNamePlaceHolder" 26 | 27 | echo GPU=${GPU} 28 | echo COUNT_NODE=$COUNT_NODE 29 | echo WORKERS=8 30 | echo "Running ${RUN_NAME}" 31 | 32 | H=$(hostname) 33 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]") 34 | export THEID=$THEID 35 | echo $THEID 36 | 37 | pkill python 38 | 39 | 40 | # --customized_config=./shared_scripts/Otter_MPT7B_Train_Decoder.json 41 | accelerate launch --config_file=./pipeline/accelerate_configs/accelerate_config_zero2.yaml \ 42 | --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 43 | --num_machines=${COUNT_NODE} --num_processes=${GPU} \ 44 | pipeline/train/instruction_following.py \ 45 | --pretrained_model_name_or_path=adept/fuyu-8b \ 46 | --training_data_yaml=./Demo_Data.yaml \ 47 | --model_name=otter \ 48 | --instruction_format=simple \ 49 | --batch_size=8 \ 50 | --gradient_accumulation_steps=2 \ 51 | --num_epochs=3 \ 52 | --report_to_wandb \ 53 | --wandb_entity=libo0013 \ 54 | --external_save_dir=./checkpoints \ 55 | --run_name=${RUN_NAME} \ 56 | --wandb_project=Fuyu \ 57 | --workers=${WORKERS} \ 58 | --lr_scheduler=cosine \ 59 | --learning_rate=1e-5 \ 60 | --warmup_steps_ratio=0.03 \ 61 | --save_hf_model \ 62 | --max_seq_len=2048 \ 63 | --logging_steps=1000 \ 64 | --keep_symbols \ 65 | --save_ckpt_each_epoch \ 66 | --with_task_description 67 | -------------------------------------------------------------------------------- /src/otter_ai/__init__.py: -------------------------------------------------------------------------------- 1 | from . import models 2 | from .models.otter.modeling_otter import OtterForConditionalGeneration 3 | from .models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 4 | -------------------------------------------------------------------------------- /src/otter_ai/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .falcon.modelling_RW import RWForCausalLM 2 | from .mpt.modeling_mpt import MPTForCausalLM 3 | from .mpt_redpajama.mosaic_gpt import MosaicGPT 4 | -------------------------------------------------------------------------------- /src/otter_ai/models/falcon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/falcon/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/falcon/configuration_RW.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Bloom configuration""" 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | class RWConfig(PretrainedConfig): 24 | model_type = "RefinedWebModel" 25 | keys_to_ignore_at_inference = ["past_key_values"] 26 | attribute_map = { 27 | "num_hidden_layers": "n_layer", 28 | "num_attention_heads": "n_head", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | apply_residual_connection_post_layernorm=False, 43 | hidden_dropout=0.0, 44 | attention_dropout=0.0, 45 | multi_query=False, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | **kwargs, 50 | ): 51 | self.vocab_size = vocab_size 52 | # Backward compatibility with n_embed kwarg 53 | n_embed = kwargs.pop("n_embed", None) 54 | self.hidden_size = hidden_size if n_embed is None else n_embed 55 | self.n_layer = n_layer 56 | self.n_head = n_head 57 | self.layer_norm_epsilon = layer_norm_epsilon 58 | self.initializer_range = initializer_range 59 | self.use_cache = use_cache 60 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.alibi = alibi 68 | self.bias = bias 69 | self.parallel_attn = parallel_attn 70 | 71 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 72 | 73 | @property 74 | def head_dim(self): 75 | return self.hidden_size // self.n_head 76 | 77 | @property 78 | def rotary(self): 79 | return not self.alibi 80 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from transformers.utils import ( 4 | OptionalDependencyNotAvailable, 5 | _LazyModule, 6 | is_torch_available, 7 | ) 8 | 9 | 10 | _import_structure = { 11 | "configuration_flamingo": [ 12 | "FlamingoConfig", 13 | ], 14 | } 15 | 16 | try: 17 | if not is_torch_available(): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | pass 21 | else: 22 | _import_structure["modeling_flamingo"] = [ 23 | "FlamingoModel", 24 | "FlamingoPreTrainedModel", 25 | "FlamingoForConditionalGeneration", 26 | ] 27 | 28 | if TYPE_CHECKING: 29 | from .configuration_flamingo import FlamingoConfig 30 | 31 | # from .processing_flamingo import FlamingoProcessor 32 | 33 | try: 34 | if not is_torch_available(): 35 | raise OptionalDependencyNotAvailable() 36 | except 
OptionalDependencyNotAvailable: 37 | pass 38 | else: 39 | from .modeling_flamingo import ( 40 | FlamingoForConditionalGeneration, 41 | FlamingoModel, 42 | FlamingoPreTrainedModel, 43 | ) 44 | 45 | else: 46 | import sys 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 49 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "flamingo", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_bf16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from .configuration_flamingo import FlamingoConfig 7 | from .modeling_flamingo import FlamingoForConditionalGeneration 8 | 9 | parser = argparse.ArgumentParser(description="Load model with precision") 10 | parser.add_argument("--load_bit", type=str, choices=["fp16", "bf16"], required=True, help="Choose either 'fp16' or 'bf16'") 11 | parser.add_argument("--pretrained_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True) 12 | parser.add_argument("--saved_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True) 13 | args = parser.parse_args() 14 | 15 | load_bit = args.load_bit 16 | pretrained_model_path = args.pretrained_model_path 17 | 18 | if load_bit == "fp16": 19 | precision = {"torch_dtype": torch.float16} 20 | elif load_bit == "bf16": 21 | precision = {"torch_dtype": torch.bfloat16} 22 | 23 | root_dir = os.environ["AZP"] 24 | print(root_dir) 25 | device_id = "cpu" 26 | model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision) 27 | 28 | # save model to same folder 29 | checkpoint_path = pretrained_model_path + f"-{load_bit}" 30 | model.save_pretrained(checkpoint_path, max_shard_size="10GB") 31 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import re 5 | import argparse 6 | import os 7 | 8 | import torch 9 | import torch.nn as nn 10 | from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer 11 | 12 | import sys 13 | from modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | from configuration_flamingo import FlamingoConfig 16 | 17 | 18 | @torch.no_grad() 19 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 20 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 21 | if old_ckpt.get("model_state_dict", None) is not None: 22 | old_ckpt = old_ckpt["model_state_dict"] 23 | new_ckpt = old_ckpt 24 | folder_path = os.path.dirname(old_ckpt_path) 25 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json" 26 | model = FlamingoForConditionalGeneration.from_pretrained( 27 | args.pretrained_model_path, 28 | device_map="auto", 29 | ) 30 | _ = model.load_state_dict(new_ckpt, strict=False) 31 | print(f"Saving HF model to {new_folder_path}") 32 | model.save_pretrained(new_folder_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument( 38 | "--old_ckpt_path", 39 | "-old", 40 | type=str, 41 | required=True, 42 | help="Path to the pt checkpoint", 43 | ) 44 | parser.add_argument( 45 | "--new_hf_path", 46 | "-new", 47 | type=str, 48 | required=True, 49 | help="Path to the hf folder", 50 | ) 51 | parser.add_argument( 52 | "--pretrained_model_path", 53 | "-pretrained", 54 | type=str, 55 | required=True, 56 | help="Path to the pretrained model folder", 57 | ) 58 | args = parser.parse_args() 59 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 60 | os.makedirs(os.path.dirname(args.new_hf_path)) 61 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 62 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_flamingo import FlamingoForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | load_bit = "bf16" 37 | if load_bit == "fp16": 38 | precision = {"torch_dtype": torch.float16} 39 | elif load_bit == "bf16": 40 | precision = {"torch_dtype": torch.bfloat16} 41 | 42 | # Load the model 43 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 44 | 45 | # adding lora 46 | standard_modules = ["q_proj", "v_proj"] 47 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 48 | 
model_to_lora_modules = { 49 | "llama": standard_modules, 50 | "opt": standard_modules, 51 | "gptj": standard_modules, 52 | "gpt_neox": ["query_key_value"], 53 | "mpt": ["Wqkv"], 54 | } 55 | lora_config = LoraConfig( 56 | r=16, 57 | lora_alpha=32, 58 | lora_dropout=0.05, 59 | task_type=TaskType.CAUSAL_LM, 60 | target_modules=model_to_lora_modules[lang_encoder_short_name], 61 | ) 62 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 63 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 64 | model.lang_encoder.print_trainable_parameters() 65 | 66 | # Save the model 67 | checkpoint_path = args.save_path 68 | FlamingoForConditionalGeneration.save_pretrained(model, checkpoint_path) 69 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/flamingo-falcon-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "FlamingoModel" 5 | ], 6 | "cross_attn_every_n_layers": 4, 7 | "model_type": "flamingo", 8 | "text_config": { 9 | "architectures": [ 10 | "RWForCausalLM" 11 | ], 12 | "apply_residual_connection_post_layernorm": false, 13 | "attention_dropout": 0.0, 14 | "bias": false, 15 | "bos_token_id": 11, 16 | "eos_token_id": 11, 17 | "hidden_dropout": 0.0, 18 | "hidden_size": 4544, 19 | "initializer_range": 0.02, 20 | "layer_norm_epsilon": 1e-05, 21 | "model_type": "RefinedWebModel", 22 | "multi_query": true, 23 | "n_head": 71, 24 | "n_layer": 32, 25 | "parallel_attn": true, 26 | "torch_dtype": "bfloat16", 27 | "transformers_version": "4.27.4", 28 | "use_cache": true, 29 | "vocab_size": 65024 30 | }, 31 | "tie_word_embeddings": false, 32 | "torch_dtype": "float32", 33 | "transformers_version": null, 34 | "use_media_placement_augmentation": true, 35 | "vision_config": { 36 | "_name_or_path": "openai/clip-vit-large-patch14", 37 | "add_cross_attention": false, 38 | "architectures": null, 39 | "attention_dropout": 0.0, 40 | "bad_words_ids": null, 41 | "begin_suppress_tokens": null, 42 | "bos_token_id": null, 43 | "chunk_size_feed_forward": 0, 44 | "cross_attention_hidden_size": null, 45 | "decoder_start_token_id": null, 46 | "diversity_penalty": 0.0, 47 | "do_sample": false, 48 | "early_stopping": false, 49 | "encoder_no_repeat_ngram_size": 0, 50 | "eos_token_id": null, 51 | "exponential_decay_length_penalty": null, 52 | "finetuning_task": null, 53 | "forced_bos_token_id": null, 54 | "forced_eos_token_id": null, 55 | "hidden_act": "quick_gelu", 56 | "hidden_size": 1024, 57 | "id2label": { 58 | "0": "LABEL_0", 59 | "1": "LABEL_1" 60 | }, 61 | "image_size": 224, 62 | "initializer_factor": 1.0, 63 | "initializer_range": 0.02, 64 | "intermediate_size": 4096, 65 | "is_decoder": false, 66 | "is_encoder_decoder": false, 67 | "label2id": { 68 | "LABEL_0": 0, 69 | "LABEL_1": 1 70 | }, 71 | "layer_norm_eps": 1e-05, 72 | "length_penalty": 1.0, 73 | "max_length": 20, 74 | "min_length": 0, 75 | "model_type": "clip_vision_model", 76 | "no_repeat_ngram_size": 0, 77 | "num_attention_heads": 16, 78 | "num_beam_groups": 1, 79 | "num_beams": 1, 80 | "num_channels": 3, 81 | "num_hidden_layers": 24, 82 | "num_return_sequences": 1, 83 | "output_attentions": false, 84 | "output_hidden_states": false, 85 | "output_scores": false, 86 | "pad_token_id": null, 87 | "patch_size": 14, 88 | "prefix": null, 89 | "problem_type": null, 90 | "projection_dim": 512, 91 | "pruned_heads": {}, 92 | "remove_invalid_values": false, 93 | 
"repetition_penalty": 1.0, 94 | "return_dict": true, 95 | "return_dict_in_generate": false, 96 | "sep_token_id": null, 97 | "suppress_tokens": null, 98 | "task_specific_params": null, 99 | "temperature": 1.0, 100 | "tf_legacy_loss": false, 101 | "tie_encoder_decoder": false, 102 | "tie_word_embeddings": true, 103 | "tokenizer_class": null, 104 | "top_k": 50, 105 | "top_p": 1.0, 106 | "torch_dtype": null, 107 | "torchscript": false, 108 | "transformers_version": "4.28.1", 109 | "typical_p": 1.0, 110 | "use_bfloat16": false 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_falcon_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from .configuration_flamingo import FlamingoConfig 4 | from .modeling_flamingo import FlamingoForConditionalGeneration 5 | 6 | root_dir = os.environ["AZP"] 7 | print(root_dir) 8 | 9 | 10 | config = FlamingoConfig.from_json_file(".flamingo-falcon-7B.json") 11 | model = FlamingoForConditionalGeneration(config=config) 12 | 13 | 14 | state_dict_files = [ 15 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin", 16 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin", 17 | ] 18 | 19 | state_dict = {} 20 | for file in state_dict_files: 21 | state_dict_part = torch.load(file, map_location="cpu") 22 | state_dict.update(state_dict_part) 23 | 24 | 25 | state_dict_3 = torch.load("{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu") 26 | for cur_key in list(state_dict_3.keys()): 27 | if "vision_encoder" not in cur_key: 28 | del state_dict_3[cur_key] 29 | 30 | _ = model.load_state_dict( 31 | state_dict_3, 32 | False, 33 | ) 34 | print(_[1]) 35 | 36 | save_state_dict_1 = {} 37 | for key in state_dict: 38 | if ".h." 
in key: 39 | _, _, layer_num, *remain_names = key.split(".") 40 | target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 41 | else: 42 | target_key = key 43 | save_state_dict_1[f"{target_key}"] = state_dict[key] 44 | _ = model.lang_encoder.load_state_dict( 45 | save_state_dict_1, 46 | False, 47 | ) 48 | print(_[1]) 49 | model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/") 50 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_llama2_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from .configuration_flamingo import FlamingoConfig 10 | from .modeling_flamingo import FlamingoForConditionalGeneration 11 | 12 | # from .configuration_flamingo import FlamingoConfig 13 | # from .modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | parser = argparse.ArgumentParser(description="Convert Vicuna model") 16 | parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'") 17 | parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints") 18 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 19 | args = parser.parse_args() 20 | 21 | # os.environ["TOKENIZERS_PARALLELISM"] = "false" 22 | 23 | root_dir = args.llama2_root_dir 24 | model_choice = args.model_choice 25 | save_root_dir = args.save_root_dir 26 | 27 | # prepare vicuna model at first 28 | # you can visit https://huggingface.co/lmsys/Llama-2-33b-chat-hf to download 7B and 30B instruct checkpoints. 29 | if model_choice == "7B": 30 | config_file = "./flamingo/flamingo-llama2-chat-7B.json" 31 | state_dict_files = [ 32 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin", 33 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin", 34 | ] 35 | save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init" 36 | elif model_choice == "13B": 37 | config_file = "./flamingo/flamingo-llama2-chat-13B.json" 38 | state_dict_files = [ 39 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin", 40 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin", 41 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin", 42 | ] 43 | save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init" 44 | else: 45 | raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.") 46 | 47 | config = FlamingoConfig.from_json_file(config_file) 48 | model = FlamingoForConditionalGeneration(config=config) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 
52 | # AZP = "os.environ["AZP"]" 53 | AZP = os.environ["AZP"] 54 | state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu") 55 | for cur_key in list(state_dict_3.keys()): 56 | if "vision_encoder" not in cur_key: 57 | del state_dict_3[cur_key] 58 | 59 | load_msg = model.load_state_dict( 60 | state_dict_3, 61 | False, 62 | ) 63 | # print incompatible keys 64 | print(load_msg[1]) 65 | 66 | # Loading vicuna weights 67 | state_dict = {} 68 | for file in tqdm(state_dict_files, desc="Loading state dict"): 69 | state_dict_part = torch.load(file, map_location="cpu") 70 | state_dict.update(state_dict_part) 71 | 72 | save_state_dict_1 = {} 73 | for key in state_dict: 74 | if ".layers." in key: 75 | _, _, layer_num, *remain_names = key.split(".") 76 | target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 77 | else: 78 | target_key = key 79 | save_state_dict_1[f"{target_key}"] = state_dict[key] 80 | 81 | # Reshape the token embedding to 50280 for compatible 82 | model.lang_encoder.resize_token_embeddings(32000) 83 | 84 | load_msg = model.lang_encoder.load_state_dict( 85 | save_state_dict_1, 86 | False, 87 | ) 88 | # Reshape the token embedding to 32002 for compatible 89 | model.lang_encoder.resize_token_embeddings(32002) 90 | # print incompatible keys 91 | print(load_msg[1]) 92 | 93 | 94 | print(f"Saving model to {save_path}...") 95 | model.save_pretrained(save_path, max_shard_size="10GB") 96 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from configuration_flamingo import FlamingoConfig 10 | from modeling_flamingo import FlamingoForConditionalGeneration 11 | from utils import rename_flamingo_checkpoint 12 | 13 | 14 | parser = argparse.ArgumentParser(description="Convert MPT model") 15 | parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints") 16 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 17 | parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected") 18 | args = parser.parse_args() 19 | 20 | 21 | root_dir = args.mpt_root_dir 22 | save_root_dir = args.save_root_dir 23 | 24 | # prepare mpt model at first 25 | # you can visit https://huggingface.co/mosaicml to download 7B and 30B instruct checkpoints. 26 | config_file = "./flamingo/flamingo-mpt-1B-redpajama.json" 27 | state_dict_file = f"{root_dir}/pytorch_model.bin" 28 | save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly" 29 | 30 | config = FlamingoConfig.from_json_file(config_file) 31 | 32 | model = FlamingoForConditionalGeneration(config=config) 33 | 34 | # Loading mpt weights 35 | state_dict = torch.load(state_dict_file, map_location="cpu") 36 | save_state_dict_1 = {} 37 | for key in state_dict: 38 | if ".blocks." 
in key: 39 | _, _, layer_num, *remain_names = key.split(".") 40 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 41 | else: 42 | target_key = key 43 | save_state_dict_1[f"{target_key}"] = state_dict[key] 44 | 45 | load_msg = model.lang_encoder.load_state_dict( 46 | save_state_dict_1, 47 | False, 48 | ) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 52 | AZP = os.environ["AZP"] 53 | state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu") 54 | for cur_key in list(state_dict_3.keys()): 55 | if "vision_encoder" not in cur_key: 56 | del state_dict_3[cur_key] 57 | 58 | load_msg = model.load_state_dict( 59 | state_dict_3, 60 | False, 61 | ) 62 | # print incompatible keys 63 | print(load_msg[1]) 64 | 65 | save_state_dict_1 = {} 66 | for key in state_dict: 67 | if ".blocks." in key: 68 | _, _, layer_num, *remain_names = key.split(".") 69 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 70 | else: 71 | target_key = key 72 | save_state_dict_1[f"{target_key}"] = state_dict[key] 73 | 74 | load_msg = model.lang_encoder.load_state_dict( 75 | save_state_dict_1, 76 | False, 77 | ) 78 | # print incompatible keys 79 | print(load_msg[1]) 80 | if args.flamingo_dir is not None: 81 | state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu") 82 | save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2) 83 | real_vocab_size = config.text_config.vocab_size 84 | # Reshape the token embedding to 50280 for compatible 85 | model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0]) 86 | 87 | load_msg = model.load_state_dict( 88 | save_state_dict_2, 89 | False, 90 | ) 91 | # print incompatible keys 92 | print(load_msg[1]) 93 | # Reshape the token embedding to 50432 94 | model.lang_encoder.resize_token_embeddings(real_vocab_size) 95 | 96 | print(f"Saving model to {save_path}...") 97 | model.save_pretrained(save_path, max_shard_size="10GB") 98 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | 4 | 5 | def rename_flamingo_checkpoint(old_ckpt: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: 6 | """Rename some keys in the public flamingo checkpoint""" 7 | perceiver_pattern1 = re.compile(r"perceiver\.layers\.[0-9]\.0") 8 | perceiver_pattern2 = re.compile(r"perceiver\.layers\.[0-9]\.1") 9 | new_ckpt = old_ckpt.copy() 10 | for key, value in old_ckpt.items(): 11 | if re.match(perceiver_pattern1, key): 12 | new_key = re.sub(r"([0-9])\.0", r"\1", key) 13 | new_ckpt.pop(key) 14 | new_ckpt[new_key] = value 15 | elif re.match(perceiver_pattern2, key): 16 | new_key = re.sub(r"([0-9])\.1", r"\1.feed_forward", key) 17 | new_ckpt.pop(key) 18 | new_ckpt[new_key] = value 19 | elif key.startswith("lang_encoder.gated_cross_attn_layers."): 20 | new_ckpt.pop(key) 21 | elif key.startswith("lang_encoder.") and "ff_gate" not in key: 22 | new_key = key.replace("ff", "feed_forward") 23 | new_ckpt.pop(key) 24 | new_ckpt[new_key] = value 25 | return new_ckpt 26 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_bf16.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from ..configuration_flamingo import FlamingoConfig 7 | from ..modeling_flamingo import FlamingoForConditionalGeneration 8 | 9 | parser = argparse.ArgumentParser(description="Load model with precision") 10 | parser.add_argument( 11 | "--load_bit", 12 | type=str, 13 | choices=["fp16", "bf16"], 14 | required=True, 15 | help="Choose either 'fp16' or 'bf16'", 16 | ) 17 | parser.add_argument( 18 | "--pretrained_model_path", 19 | type=str, 20 | default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", 21 | required=True, 22 | ) 23 | parser.add_argument( 24 | "--saved_model_path", 25 | type=str, 26 | default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", 27 | required=True, 28 | ) 29 | args = parser.parse_args() 30 | 31 | load_bit = args.load_bit 32 | pretrained_model_path = args.pretrained_model_path 33 | 34 | if load_bit == "fp16": 35 | precision = {"torch_dtype": torch.float16} 36 | elif load_bit == "bf16": 37 | precision = {"torch_dtype": torch.bfloat16} 38 | 39 | root_dir = os.environ["AZP"] 40 | print(root_dir) 41 | device_id = "cpu" 42 | model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision) 43 | 44 | # save model to same folder 45 | checkpoint_path = pretrained_model_path + f"-{load_bit}" 46 | model.save_pretrained(checkpoint_path, max_shard_size="10GB") 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import re 5 | import argparse 6 | import os 7 | 8 | import torch 9 | import torch.nn as nn 10 | from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer 11 | 12 | import sys 13 | from ..configuration_flamingo import FlamingoConfig 14 | from ..modeling_flamingo import FlamingoForConditionalGeneration 15 | 16 | 17 | @torch.no_grad() 18 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 19 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 20 | if old_ckpt.get("model_state_dict", None) is not None: 21 | old_ckpt = old_ckpt["model_state_dict"] 22 | new_ckpt = old_ckpt 23 | folder_path = os.path.dirname(old_ckpt_path) 24 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json" 25 | model = FlamingoForConditionalGeneration.from_pretrained( 26 | args.pretrained_model_path, 27 | device_map="auto", 28 | ) 29 | _ = model.load_state_dict(new_ckpt, strict=False) 30 | print(f"Saving HF model to {new_folder_path}") 31 | model.save_pretrained(new_folder_path) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "--old_ckpt_path", 38 | "-old", 39 | type=str, 40 | required=True, 41 | help="Path to the pt checkpoint", 42 | ) 43 | parser.add_argument( 44 | "--new_hf_path", 45 | "-new", 46 | type=str, 47 | required=True, 48 | help="Path to the hf folder", 49 | ) 50 | parser.add_argument( 51 | "--pretrained_model_path", 52 | "-pretrained", 53 | type=str, 54 | required=True, 55 | help="Path to the pretrained model folder", 56 | ) 57 | args = parser.parse_args() 58 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 59 | os.makedirs(os.path.dirname(args.new_hf_path)) 60 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 61 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from ..configuration_flamingo import FlamingoConfig 6 | from ..modeling_flamingo import FlamingoForConditionalGeneration 7 | from peft import get_peft_model, LoraConfig, TaskType 8 | 9 | MODEL_CLASSES = { 10 | "LlamaForCausalLM": "llama", 11 | "OPTForCausalLM": "opt", 12 | "GPTJForCausalLM": "gptj", 13 | "GPTNeoXForCausalLM": "gpt_neox", 14 | "MPTForCausalLM": "mpt", 15 | } 16 | 17 | # Define argument parser 18 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 19 | 20 | # Add arguments 21 | parser.add_argument( 22 | "--checkpoint_path", 23 | type=str, 24 | help="Path to the pre-trained model checkpoint.", 25 | default="", 26 | ) 27 | parser.add_argument( 28 | "--save_path", 29 | type=str, 30 | default="", 31 | help="Path to the converted model checkpoint.", 32 | ) 33 | 34 | # Parse the input arguments 35 | args = parser.parse_args() 36 | 37 | load_bit = "bf16" 38 | if load_bit == "fp16": 39 | precision = {"torch_dtype": torch.float16} 40 | elif load_bit == "bf16": 41 | precision = {"torch_dtype": torch.bfloat16} 42 | 43 | # Load the model 44 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 45 | 46 | # adding lora 47 | standard_modules = ["q_proj", "v_proj"] 48 | lang_encoder_short_name = 
MODEL_CLASSES[model.config.text_config.architectures[0]] 49 | model_to_lora_modules = { 50 | "llama": standard_modules, 51 | "opt": standard_modules, 52 | "gptj": standard_modules, 53 | "gpt_neox": ["query_key_value"], 54 | "mpt": ["Wqkv"], 55 | } 56 | lora_config = LoraConfig( 57 | r=16, 58 | lora_alpha=32, 59 | lora_dropout=0.05, 60 | task_type=TaskType.CAUSAL_LM, 61 | target_modules=model_to_lora_modules[lang_encoder_short_name], 62 | ) 63 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 64 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 65 | model.lang_encoder.print_trainable_parameters() 66 | 67 | # Save the model 68 | checkpoint_path = args.save_path 69 | FlamingoForConditionalGeneration.save_pretrained(model, checkpoint_path) 70 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/flamingo-falcon-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "FlamingoModel" 5 | ], 6 | "cross_attn_every_n_layers": 4, 7 | "model_type": "flamingo", 8 | "text_config": { 9 | "architectures": [ 10 | "RWForCausalLM" 11 | ], 12 | "apply_residual_connection_post_layernorm": false, 13 | "attention_dropout": 0.0, 14 | "bias": false, 15 | "bos_token_id": 11, 16 | "eos_token_id": 11, 17 | "hidden_dropout": 0.0, 18 | "hidden_size": 4544, 19 | "initializer_range": 0.02, 20 | "layer_norm_epsilon": 1e-05, 21 | "model_type": "RefinedWebModel", 22 | "multi_query": true, 23 | "n_head": 71, 24 | "n_layer": 32, 25 | "parallel_attn": true, 26 | "torch_dtype": "bfloat16", 27 | "transformers_version": "4.27.4", 28 | "use_cache": true, 29 | "vocab_size": 65024 30 | }, 31 | "tie_word_embeddings": false, 32 | "torch_dtype": "float32", 33 | "transformers_version": null, 34 | "use_media_placement_augmentation": true, 35 | "vision_config": { 36 | "_name_or_path": "openai/clip-vit-large-patch14", 37 | "add_cross_attention": false, 38 | "architectures": null, 39 | "attention_dropout": 0.0, 40 | "bad_words_ids": null, 41 | "begin_suppress_tokens": null, 42 | "bos_token_id": null, 43 | "chunk_size_feed_forward": 0, 44 | "cross_attention_hidden_size": null, 45 | "decoder_start_token_id": null, 46 | "diversity_penalty": 0.0, 47 | "do_sample": false, 48 | "early_stopping": false, 49 | "encoder_no_repeat_ngram_size": 0, 50 | "eos_token_id": null, 51 | "exponential_decay_length_penalty": null, 52 | "finetuning_task": null, 53 | "forced_bos_token_id": null, 54 | "forced_eos_token_id": null, 55 | "hidden_act": "quick_gelu", 56 | "hidden_size": 1024, 57 | "id2label": { 58 | "0": "LABEL_0", 59 | "1": "LABEL_1" 60 | }, 61 | "image_size": 224, 62 | "initializer_factor": 1.0, 63 | "initializer_range": 0.02, 64 | "intermediate_size": 4096, 65 | "is_decoder": false, 66 | "is_encoder_decoder": false, 67 | "label2id": { 68 | "LABEL_0": 0, 69 | "LABEL_1": 1 70 | }, 71 | "layer_norm_eps": 1e-05, 72 | "length_penalty": 1.0, 73 | "max_length": 20, 74 | "min_length": 0, 75 | "model_type": "clip_vision_model", 76 | "no_repeat_ngram_size": 0, 77 | "num_attention_heads": 16, 78 | "num_beam_groups": 1, 79 | "num_beams": 1, 80 | "num_channels": 3, 81 | "num_hidden_layers": 24, 82 | "num_return_sequences": 1, 83 | "output_attentions": false, 84 | "output_hidden_states": false, 85 | "output_scores": false, 86 | "pad_token_id": null, 87 | "patch_size": 14, 88 | "prefix": null, 89 | "problem_type": null, 90 | "projection_dim": 512, 91 
| "pruned_heads": {}, 92 | "remove_invalid_values": false, 93 | "repetition_penalty": 1.0, 94 | "return_dict": true, 95 | "return_dict_in_generate": false, 96 | "sep_token_id": null, 97 | "suppress_tokens": null, 98 | "task_specific_params": null, 99 | "temperature": 1.0, 100 | "tf_legacy_loss": false, 101 | "tie_encoder_decoder": false, 102 | "tie_word_embeddings": true, 103 | "tokenizer_class": null, 104 | "top_k": 50, 105 | "top_p": 1.0, 106 | "torch_dtype": null, 107 | "torchscript": false, 108 | "transformers_version": "4.28.1", 109 | "typical_p": 1.0, 110 | "use_bfloat16": false 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_falcon_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from ..configuration_flamingo import FlamingoConfig 4 | from ..modeling_flamingo import FlamingoForConditionalGeneration 5 | 6 | root_dir = os.environ["AZP"] 7 | print(root_dir) 8 | 9 | 10 | config = FlamingoConfig.from_json_file(".flamingo-falcon-7B.json") 11 | model = FlamingoForConditionalGeneration(config=config) 12 | 13 | 14 | state_dict_files = [ 15 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin", 16 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin", 17 | ] 18 | 19 | state_dict = {} 20 | for file in state_dict_files: 21 | state_dict_part = torch.load(file, map_location="cpu") 22 | state_dict.update(state_dict_part) 23 | 24 | 25 | state_dict_3 = torch.load( 26 | "{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", 27 | map_location="cpu", 28 | ) 29 | for cur_key in list(state_dict_3.keys()): 30 | if "vision_encoder" not in cur_key: 31 | del state_dict_3[cur_key] 32 | 33 | _ = model.load_state_dict( 34 | state_dict_3, 35 | False, 36 | ) 37 | print(_[1]) 38 | 39 | save_state_dict_1 = {} 40 | for key in state_dict: 41 | if ".h." 
in key: 42 | _, _, layer_num, *remain_names = key.split(".") 43 | target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 44 | else: 45 | target_key = key 46 | save_state_dict_1[f"{target_key}"] = state_dict[key] 47 | _ = model.lang_encoder.load_state_dict( 48 | save_state_dict_1, 49 | False, 50 | ) 51 | print(_[1]) 52 | model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/") 53 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_llama2_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from ..configuration_flamingo import FlamingoConfig 10 | from ..modeling_flamingo import FlamingoForConditionalGeneration 11 | 12 | # from .configuration_flamingo import FlamingoConfig 13 | # from .modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | parser = argparse.ArgumentParser(description="Convert Vicuna model") 16 | parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'") 17 | parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints") 18 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 19 | args = parser.parse_args() 20 | 21 | # os.environ["TOKENIZERS_PARALLELISM"] = "false" 22 | 23 | root_dir = args.llama2_root_dir 24 | model_choice = args.model_choice 25 | save_root_dir = args.save_root_dir 26 | 27 | # prepare vicuna model at first 28 | # you can visit https://huggingface.co/lmsys/Llama-2-33b-chat-hf to download 7B and 30B instruct checkpoints. 29 | if model_choice == "7B": 30 | config_file = "./flamingo/flamingo-llama2-chat-7B.json" 31 | state_dict_files = [ 32 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin", 33 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin", 34 | ] 35 | save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init" 36 | elif model_choice == "13B": 37 | config_file = "./flamingo/flamingo-llama2-chat-13B.json" 38 | state_dict_files = [ 39 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin", 40 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin", 41 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin", 42 | ] 43 | save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init" 44 | else: 45 | raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.") 46 | 47 | config = FlamingoConfig.from_json_file(config_file) 48 | model = FlamingoForConditionalGeneration(config=config) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 
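# A minimal sanity-check sketch for the shard referenced in the comment above: it fetches
# pytorch_model-00004-of-00004.bin from luodian/openflamingo-9b-hf (repo id and filename taken
# from this script; the variable names and the default cache location are illustrative, not
# part of the original script) and lists which keys would survive the "vision_encoder" filter
# applied just below.
from huggingface_hub import hf_hub_download
import torch

shard_path = hf_hub_download(repo_id="luodian/openflamingo-9b-hf", filename="pytorch_model-00004-of-00004.bin")
shard = torch.load(shard_path, map_location="cpu")
vision_keys = [k for k in shard if "vision_encoder" in k]
print(f"{len(vision_keys)} vision_encoder keys, e.g. {vision_keys[:3]}")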
52 | # AZP = "os.environ["AZP"]" 53 | AZP = os.environ["AZP"] 54 | state_dict_3 = torch.load( 55 | f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", 56 | map_location="cpu", 57 | ) 58 | for cur_key in list(state_dict_3.keys()): 59 | if "vision_encoder" not in cur_key: 60 | del state_dict_3[cur_key] 61 | 62 | load_msg = model.load_state_dict( 63 | state_dict_3, 64 | False, 65 | ) 66 | # print incompatible keys 67 | print(load_msg[1]) 68 | 69 | # Loading vicuna weights 70 | state_dict = {} 71 | for file in tqdm(state_dict_files, desc="Loading state dict"): 72 | state_dict_part = torch.load(file, map_location="cpu") 73 | state_dict.update(state_dict_part) 74 | 75 | save_state_dict_1 = {} 76 | for key in state_dict: 77 | if ".layers." in key: 78 | _, _, layer_num, *remain_names = key.split(".") 79 | target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 80 | else: 81 | target_key = key 82 | save_state_dict_1[f"{target_key}"] = state_dict[key] 83 | 84 | # Reshape the token embedding to 50280 for compatible 85 | model.lang_encoder.resize_token_embeddings(32000) 86 | 87 | load_msg = model.lang_encoder.load_state_dict( 88 | save_state_dict_1, 89 | False, 90 | ) 91 | # Reshape the token embedding to 32002 for compatible 92 | model.lang_encoder.resize_token_embeddings(32002) 93 | # print incompatible keys 94 | print(load_msg[1]) 95 | 96 | 97 | print(f"Saving model to {save_path}...") 98 | model.save_pretrained(save_path, max_shard_size="10GB") 99 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_mpt-1B-redpajama_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from ..configuration_flamingo import FlamingoConfig 10 | from ..modeling_flamingo import FlamingoForConditionalGeneration 11 | from utils import rename_flamingo_checkpoint 12 | 13 | 14 | parser = argparse.ArgumentParser(description="Convert MPT model") 15 | parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints") 16 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 17 | parser.add_argument( 18 | "--flamingo_dir", 19 | type=str, 20 | default=None, 21 | help="If the pretrained flamingo weights also need to be injected", 22 | ) 23 | args = parser.parse_args() 24 | 25 | 26 | root_dir = args.mpt_root_dir 27 | save_root_dir = args.save_root_dir 28 | 29 | # prepare mpt model at first 30 | # you can visit https://huggingface.co/mosaicml to download 7B and 30B instruct checkpoints. 31 | config_file = "./flamingo/flamingo-mpt-1B-redpajama.json" 32 | state_dict_file = f"{root_dir}/pytorch_model.bin" 33 | save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly" 34 | 35 | config = FlamingoConfig.from_json_file(config_file) 36 | 37 | model = FlamingoForConditionalGeneration(config=config) 38 | 39 | # Loading mpt weights 40 | state_dict = torch.load(state_dict_file, map_location="cpu") 41 | save_state_dict_1 = {} 42 | for key in state_dict: 43 | if ".blocks." 
in key: 44 | _, _, layer_num, *remain_names = key.split(".") 45 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 46 | else: 47 | target_key = key 48 | save_state_dict_1[f"{target_key}"] = state_dict[key] 49 | 50 | load_msg = model.lang_encoder.load_state_dict( 51 | save_state_dict_1, 52 | False, 53 | ) 54 | 55 | # load flamingo's vision encoder from last checkpoint. 56 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 57 | AZP = os.environ["AZP"] 58 | state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu") 59 | for cur_key in list(state_dict_3.keys()): 60 | if "vision_encoder" not in cur_key: 61 | del state_dict_3[cur_key] 62 | 63 | load_msg = model.load_state_dict( 64 | state_dict_3, 65 | False, 66 | ) 67 | # print incompatible keys 68 | print(load_msg[1]) 69 | 70 | save_state_dict_1 = {} 71 | for key in state_dict: 72 | if ".blocks." in key: 73 | _, _, layer_num, *remain_names = key.split(".") 74 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 75 | else: 76 | target_key = key 77 | save_state_dict_1[f"{target_key}"] = state_dict[key] 78 | 79 | load_msg = model.lang_encoder.load_state_dict( 80 | save_state_dict_1, 81 | False, 82 | ) 83 | # print incompatible keys 84 | print(load_msg[1]) 85 | if args.flamingo_dir is not None: 86 | state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu") 87 | save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2) 88 | real_vocab_size = config.text_config.vocab_size 89 | # Reshape the token embedding to 50280 for compatible 90 | model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0]) 91 | 92 | load_msg = model.load_state_dict( 93 | save_state_dict_2, 94 | False, 95 | ) 96 | # print incompatible keys 97 | print(load_msg[1]) 98 | # Reshape the token embedding to 50432 99 | model.lang_encoder.resize_token_embeddings(real_vocab_size) 100 | 101 | print(f"Saving model to {save_path}...") 102 | model.save_pretrained(save_path, max_shard_size="10GB") 103 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/mpt/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast 3 | 4 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 5 | NUM_SENTINEL_TOKENS: int = 100 6 | 7 | 8 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 9 | """Adds sentinel tokens and padding token (if missing). 10 | 11 | Expands the tokenizer vocabulary to include sentinel tokens 12 | used in mixture-of-denoiser tasks as well as a padding token. 13 | 14 | All added tokens are added as special tokens. No tokens are 15 | added if sentinel tokens and padding token already exist. 
16 | """ 17 | sentinels_to_add = [f"" for i in range(NUM_SENTINEL_TOKENS)] 18 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 19 | if tokenizer.pad_token is None: 20 | tokenizer.add_tokens("", special_tokens=True) 21 | tokenizer.pad_token = "" 22 | assert tokenizer.pad_token_id is not None 23 | sentinels = "".join([f"" for i in range(NUM_SENTINEL_TOKENS)]) 24 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 25 | tokenizer.sentinel_token_ids = _sentinel_token_ids 26 | 27 | 28 | class AutoTokenizerForMOD(AutoTokenizer): 29 | """AutoTokenizer + Adaptation for MOD. 30 | 31 | A simple wrapper around AutoTokenizer to make instantiating 32 | an MOD-adapted tokenizer a bit easier. 33 | 34 | MOD-adapted tokenizers have sentinel tokens (e.g., ), 35 | a padding token, and a property to get the token ids of the 36 | sentinel tokens. 37 | """ 38 | 39 | @classmethod 40 | def from_pretrained(cls, *args, **kwargs): 41 | """See `AutoTokenizer.from_pretrained` docstring.""" 42 | tokenizer = super().from_pretrained(*args, **kwargs) 43 | adapt_tokenizer_for_denoising(tokenizer) 44 | return tokenizer 45 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | import torch 4 | import torch.nn as nn 5 | from .attention import ATTN_CLASS_REGISTRY 6 | from .norm import NORM_CLASS_REGISTRY 7 | 8 | 9 | class MPTMLP(nn.Module): 10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str] = None): 11 | super().__init__() 12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 13 | ## yh: hard code 14 | # self.act = nn.GELU(approximate='none') 15 | self.act = nn.GELU() 16 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 17 | self.down_proj._is_residual = True 18 | 19 | def forward(self, x): 20 | return self.down_proj(self.act(self.up_proj(x))) 21 | 22 | 23 | class MPTBlock(nn.Module): 24 | def __init__( 25 | self, 26 | d_model: int, 27 | n_heads: int, 28 | expansion_ratio: int, 29 | attn_config: Dict = { 30 | "attn_type": "multihead_attention", 31 | "attn_pdrop": 0.0, 32 | "attn_impl": "triton", 33 | "qk_ln": False, 34 | "clip_qkv": None, 35 | "softmax_scale": None, 36 | "prefix_lm": False, 37 | "attn_uses_sequence_id": False, 38 | "alibi": False, 39 | "alibi_bias_max": 8, 40 | }, 41 | resid_pdrop: float = 0.0, 42 | norm_type: str = "low_precision_layernorm", 43 | verbose: int = 0, 44 | device: Optional[str] = None, 45 | **kwargs 46 | ): 47 | del kwargs 48 | super().__init__() 49 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 50 | attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]] 51 | self.norm_1 = norm_class(d_model, device=device) 52 | self.attn = attn_class( 53 | attn_impl=attn_config["attn_impl"], 54 | clip_qkv=attn_config["clip_qkv"], 55 | qk_ln=attn_config["qk_ln"], 56 | softmax_scale=attn_config["softmax_scale"], 57 | attn_pdrop=attn_config["attn_pdrop"], 58 | d_model=d_model, 59 | n_heads=n_heads, 60 | verbose=verbose, 61 | device=device, 62 | ) 63 | self.norm_2 = norm_class(d_model, device=device) 64 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) 65 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 66 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 67 | 68 | def forward( 69 | self, 70 | x: 
torch.Tensor, 71 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 72 | attn_bias: Optional[torch.Tensor] = None, 73 | attention_mask: Optional[torch.ByteTensor] = None, 74 | is_causal: bool = True, 75 | ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 76 | a = self.norm_1(x) 77 | (b, attn_weights, past_key_value) = self.attn( 78 | a, 79 | past_key_value=past_key_value, 80 | attn_bias=attn_bias, 81 | attention_mask=attention_mask, 82 | is_causal=is_causal, 83 | ) 84 | x = x + self.resid_attn_dropout(b) 85 | m = self.norm_2(x) 86 | n = self.ffn(m) 87 | x = x + self.resid_ffn_dropout(n) 88 | return (x, attn_weights, past_key_value) 89 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | 7 | class SharedEmbedding(nn.Embedding): 8 | def forward(self, input: Tensor, unembed: bool = False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) 12 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _cast_if_autocast_enabled(tensor): 5 | if torch.is_autocast_enabled(): 6 | if tensor.device.type == "cuda": 7 | dtype = torch.get_autocast_gpu_dtype() 8 | elif tensor.device.type == "cpu": 9 | dtype = torch.get_autocast_cpu_dtype() 10 | else: 11 | raise NotImplementedError() 12 | return tensor.to(dtype=dtype) 13 | return tensor 14 | 15 | 16 | class LPLayerNorm(torch.nn.LayerNorm): 17 | def __init__( 18 | self, 19 | normalized_shape, 20 | eps=1e-05, 21 | elementwise_affine=True, 22 | device=None, 23 | dtype=None, 24 | ): 25 | super().__init__( 26 | normalized_shape=normalized_shape, 27 | eps=eps, 28 | elementwise_affine=elementwise_affine, 29 | device=device, 30 | dtype=dtype, 31 | ) 32 | 33 | def forward(self, x): 34 | module_device = x.device 35 | downcast_x = _cast_if_autocast_enabled(x) 36 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 37 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 38 | with torch.autocast(enabled=False, device_type=module_device.type): 39 | return torch.nn.functional.layer_norm( 40 | downcast_x, 41 | self.normalized_shape, 42 | downcast_weight, 43 | downcast_bias, 44 | self.eps, 45 | ) 46 | 47 | 48 | def rms_norm(x, weight=None, eps=1e-05): 49 | output = x / torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 50 | if weight is not None: 51 | return output * weight 52 | return output 53 | 54 | 55 | class RMSNorm(torch.nn.Module): 56 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 57 | super().__init__() 58 | self.eps = eps 59 | if weight: 60 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) 61 | else: 62 | self.register_parameter("weight", None) 63 | 64 | def forward(self, x): 65 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 66 | 67 | 68 | class LPRMSNorm(RMSNorm): 69 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 70 | super().__init__( 71 | normalized_shape=normalized_shape, 72 | eps=eps, 73 | weight=weight, 74 | dtype=dtype, 
75 | device=device, 76 | ) 77 | 78 | def forward(self, x): 79 | downcast_x = _cast_if_autocast_enabled(x) 80 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 81 | with torch.autocast(enabled=False, device_type=x.device.type): 82 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 83 | 84 | 85 | NORM_CLASS_REGISTRY = { 86 | "layernorm": torch.nn.LayerNorm, 87 | "low_precision_layernorm": LPLayerNorm, 88 | "rmsnorm": RMSNorm, 89 | "low_precision_rmsnorm": LPRMSNorm, 90 | } 91 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/mpt_redpajama/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/gpt_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 MosaicML Examples authors 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """GPT Blocks used for the GPT Model.""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | from .attention import MultiheadAttention 12 | from .low_precision_layernorm import LPLayerNorm 13 | 14 | 15 | class GPTMLP(nn.Module): 16 | def __init__(self, d_model: int, mlp_ratio: int, device: Optional[str] = None): 17 | super().__init__() 18 | self.mlp_up = nn.Linear(d_model, mlp_ratio * d_model, device=device) 19 | self.mlp_act = nn.GELU() 20 | self.mlp_down = nn.Linear(mlp_ratio * d_model, d_model, device=device) 21 | self.mlp_down._is_residual = True # type: ignore 22 | 23 | def forward(self, x): 24 | return self.mlp_down(self.mlp_act(self.mlp_up(x))) 25 | 26 | 27 | class GPTBlock(nn.Module): 28 | def __init__( 29 | self, 30 | attn_impl: str, 31 | d_model: int, 32 | n_heads: int, 33 | mlp_ratio: int, 34 | attn_clip_qkv: Optional[float] = None, 35 | attn_qk_ln: bool = False, 36 | softmax_scale: Optional[float] = None, 37 | attn_pdrop: float = 0.0, 38 | alibi: bool = False, 39 | resid_pdrop: float = 0.0, 40 | low_precision_layernorm: bool = False, 41 | device: Optional[str] = None, 42 | **kwargs 43 | ): 44 | del kwargs # unused, just to capture any extra args from the config 45 | super().__init__() 46 | 47 | layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm 48 | 49 | self.ln_1 = layernorm_class(d_model, device=device) 50 | self.attn = MultiheadAttention( 51 | attn_impl=attn_impl, 52 | attn_clip_qkv=attn_clip_qkv, 53 | attn_qk_ln=attn_qk_ln, 54 | softmax_scale=softmax_scale, 55 | attn_pdrop=attn_pdrop, 56 | d_model=d_model, 57 | n_heads=n_heads, 58 | device=device, 59 | ) 60 | self.ln_2 = layernorm_class(d_model, device=device) 61 | self.mlp = GPTMLP( 62 | d_model=d_model, 63 | mlp_ratio=mlp_ratio, 64 | device=device, 65 | ) 66 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 67 | self.resid_mlp_dropout = nn.Dropout(resid_pdrop) 68 | 69 | def forward( 70 | self, 71 | x: torch.Tensor, 72 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 73 | attn_bias: Optional[torch.Tensor] = None, 74 | attention_mask: Optional[torch.ByteTensor] = None, 75 | is_causal: bool = True, 76 | ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 77 | a = self.ln_1(x) 78 | b, _, past_key_value = self.attn( 79 | a, 80 | 
past_key_value=past_key_value, 81 | attn_bias=attn_bias, 82 | attention_mask=attention_mask, 83 | is_causal=is_causal, 84 | ) 85 | x = x + self.resid_attn_dropout(b) 86 | m = self.ln_2(x) 87 | n = self.mlp(m) 88 | x = x + self.resid_mlp_dropout(n) 89 | return x, past_key_value 90 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/low_precision_layernorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LPLayerNorm(torch.nn.LayerNorm): 6 | def __init__( 7 | self, 8 | normalized_shape, 9 | eps=1e-05, 10 | elementwise_affine=True, 11 | device=None, 12 | dtype=None, 13 | ): 14 | super().__init__( 15 | normalized_shape=normalized_shape, 16 | eps=eps, 17 | elementwise_affine=elementwise_affine, 18 | device=device, 19 | dtype=dtype, 20 | ) 21 | 22 | def forward(self, x): 23 | module_device = x.device 24 | downcast_x = _cast_if_autocast_enabled(x) 25 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 26 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 27 | with torch.autocast(enabled=False, device_type=module_device.type): 28 | return F.layer_norm( 29 | downcast_x, 30 | self.normalized_shape, 31 | downcast_weight, 32 | downcast_bias, 33 | self.eps, 34 | ) 35 | 36 | 37 | def _cast_if_autocast_enabled(tensor): 38 | if torch.is_autocast_enabled(): 39 | if tensor.device.type == "cuda": 40 | dtype = torch.get_autocast_gpu_dtype() 41 | elif tensor.device.type == "cpu": 42 | dtype = torch.get_autocast_cpu_dtype() 43 | else: 44 | raise NotImplementedError() 45 | return tensor.to(dtype=dtype) 46 | return tensor 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/Otter-MPT7B-config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/otter/Otter-MPT7B-config.json -------------------------------------------------------------------------------- /src/otter_ai/models/otter/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from transformers.utils import ( 4 | OptionalDependencyNotAvailable, 5 | _LazyModule, 6 | is_torch_available, 7 | ) 8 | 9 | 10 | _import_structure = { 11 | "configuration_otter": [ 12 | "OtterConfig", 13 | ], 14 | } 15 | 16 | try: 17 | if not is_torch_available(): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | pass 21 | else: 22 | _import_structure["modeling_otter"] = [ 23 | "OtterModel", 24 | "OtterPreTrainedModel", 25 | "OtterForConditionalGeneration", 26 | ] 27 | 28 | if TYPE_CHECKING: 29 | from .configuration_otter import OtterConfig 30 | 31 | # from .processing_otter import OtterProcessor 32 | 33 | try: 34 | if not is_torch_available(): 35 | raise OptionalDependencyNotAvailable() 36 | except OptionalDependencyNotAvailable: 37 | pass 38 | else: 39 | from .modeling_otter import ( 40 | OtterForConditionalGeneration, 41 | OtterModel, 42 | OtterPreTrainedModel, 43 | ) 44 | 45 | else: 46 | import sys 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 49 | 
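# A minimal usage sketch for the lazy registry above: `_LazyModule` defers importing
# `configuration_otter` and `modeling_otter` until one of the exported names is first
# accessed, so importing the package itself stays lightweight. The attribute accesses
# below are illustrative only; no checkpoint is loaded.
import otter_ai.models.otter as otter

config_cls = otter.OtterConfig  # first access triggers the import of configuration_otter
model_cls = otter.OtterForConditionalGeneration  # first access triggers the import of modeling_otter
print(config_cls.__name__, model_cls.__name__)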
-------------------------------------------------------------------------------- /src/otter_ai/models/otter/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_flamingo_to_otter.py: -------------------------------------------------------------------------------- 1 | # This script is used to convert the huggingface format Open-Flamingo model to the Otter model. 2 | # You can use it in parent folder by running: python -m models.otter.converting_flamingo_to_otter --checkpoint_path --save_path 3 | import argparse 4 | import torch 5 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 6 | from otter_ai.models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 7 | 8 | # Define argument parser 9 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 10 | 11 | # Add arguments 12 | parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained Open-Flamingo model checkpoint.") 13 | parser.add_argument("--save_path", type=str, default=None, help="Path to the converted Otter model checkpoint.") 14 | 15 | # Parse the input arguments 16 | args = parser.parse_args() 17 | 18 | # Load the model 19 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 20 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>", "<answer>"]}) 21 | if model.lang_encoder.__class__.__name__ == "LlamaForCausalLM": 22 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 23 | 24 | # Save the model 25 | checkpoint_path = args.save_path 26 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 27 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_fp32_to_fp16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 4 | 5 | # Define argument parser 6 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 7 | 8 | # Add arguments 9 | parser.add_argument( 10 | "--load_bit", 11 | type=str, 12 | choices=["fp16", "bf16"], 13 | default="fp16", 14 | help="Precision of the loaded model. Either 'fp16' or 'bf16'. 
Default is 'fp16'.", 15 | ) 16 | parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained model checkpoint.") 17 | parser.add_argument("--save_path", type=str, default=None, help="Path to the converted model checkpoint.") 18 | 19 | # Parse the input arguments 20 | args = parser.parse_args() 21 | 22 | # Set precision based on load_bit argument 23 | if args.load_bit == "fp16": 24 | precision = {"torch_dtype": torch.float16} 25 | elif args.load_bit == "bf16": 26 | precision = {"torch_dtype": torch.bfloat16} 27 | 28 | # Load the model 29 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 30 | 31 | # Save the model 32 | if args.save_path is None: 33 | checkpoint_path = args.checkpoint_path + f"-{args.load_bit}" 34 | else: 35 | checkpoint_path = args.save_path 36 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 37 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_pt_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 2 | """ 3 | 4 | import argparse 5 | import os 6 | 7 | import torch 8 | 9 | from modeling_otter import OtterForConditionalGeneration 10 | 11 | 12 | # The function is to inject newly trained otter perceiver parameters into the pretrained otter init model. 13 | @torch.no_grad() 14 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 15 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 16 | if old_ckpt.get("model_state_dict", None) is not None: 17 | old_ckpt = old_ckpt["model_state_dict"] 18 | new_ckpt = old_ckpt 19 | # folder_path = os.path.dirname(old_ckpt_path) 20 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "otter/config.json" 21 | model = OtterForConditionalGeneration.from_pretrained( 22 | args.pretrained_model_path, 23 | device_map="auto", 24 | ) 25 | 26 | if "flamingo" in args.pretrained_model_path: 27 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": [""]}) 28 | if "LlamaForCausalLM" in model.lang_encoder.__class__.__name__: 29 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 30 | 31 | _ = model.load_state_dict(new_ckpt, strict=False) 32 | print(f"Saving HF model to {new_folder_path}") 33 | model.save_pretrained(new_folder_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--old_ckpt_path", 40 | "-old", 41 | type=str, 42 | required=True, 43 | help="Path to the pt checkpoint", 44 | ) 45 | parser.add_argument( 46 | "--new_hf_path", 47 | "-new", 48 | type=str, 49 | required=True, 50 | help="Path to the hf folder", 51 | ) 52 | parser.add_argument( 53 | "--pretrained_model_path", 54 | "-pretrained", 55 | type=str, 56 | default="luodian/OTTER-MPT7B-Init", 57 | required=True, 58 | help="Path to the pretrained model folder.", 59 | ) 60 | args = parser.parse_args() 61 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 62 | os.makedirs(os.path.dirname(args.new_hf_path)) 63 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 64 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_to_lora.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_otter import OtterForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705-LoRA", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | # Load the model 37 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 38 | 39 | # adding lora 40 | standard_modules = ["q_proj", "v_proj"] 41 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 42 | model_to_lora_modules = { 43 | "llama": standard_modules, 44 | "opt": standard_modules, 45 | "gptj": standard_modules, 46 | "gpt_neox": ["query_key_value"], 47 | "mpt": ["Wqkv"], 48 | } 49 | lora_config = LoraConfig( 50 | r=16, 51 | lora_alpha=32, 52 | lora_dropout=0.05, 53 | task_type=TaskType.CAUSAL_LM, 54 | target_modules=model_to_lora_modules[lang_encoder_short_name], 55 | ) 56 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 57 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 58 | 59 | # Save the model 60 | checkpoint_path = args.save_path 61 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 62 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_flamingo_to_otter.py: -------------------------------------------------------------------------------- 1 | # This script is used to convert the huggingface format Open-Flamingo model to the Otter model. 
2 | # You can use it in parent folder by running: python -m models.otter.converting_flamingo_to_otter --checkpoint_path --save_path 3 | import argparse 4 | import torch 5 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 6 | from otter_ai.models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 7 | 8 | # Define argument parser 9 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 10 | 11 | # Add arguments 12 | parser.add_argument( 13 | "--checkpoint_path", 14 | type=str, 15 | required=True, 16 | help="Path to the pre-trained Open-Flamingo model checkpoint.", 17 | ) 18 | parser.add_argument( 19 | "--save_path", 20 | type=str, 21 | default=None, 22 | help="Path to the converted Otter model checkpoint.", 23 | ) 24 | 25 | # Parse the input arguments 26 | args = parser.parse_args() 27 | 28 | # Load the model 29 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 30 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "", ""]}) 31 | if model.lang_encoder.__class__.__name__ == "LlamaForCausalLM": 32 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 33 | 34 | # Save the model 35 | checkpoint_path = args.save_path 36 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 37 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_fp32_to_fp16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 4 | 5 | # Define argument parser 6 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 7 | 8 | # Add arguments 9 | parser.add_argument( 10 | "--load_bit", 11 | type=str, 12 | choices=["fp16", "bf16"], 13 | default="fp16", 14 | help="Precision of the loaded model. Either 'fp16' or 'bf16'. Default is 'fp16'.", 15 | ) 16 | parser.add_argument( 17 | "--checkpoint_path", 18 | type=str, 19 | required=True, 20 | help="Path to the pre-trained model checkpoint.", 21 | ) 22 | parser.add_argument( 23 | "--save_path", 24 | type=str, 25 | default=None, 26 | help="Path to the converted model checkpoint.", 27 | ) 28 | 29 | # Parse the input arguments 30 | args = parser.parse_args() 31 | 32 | # Set precision based on load_bit argument 33 | if args.load_bit == "fp16": 34 | precision = {"torch_dtype": torch.float16} 35 | elif args.load_bit == "bf16": 36 | precision = {"torch_dtype": torch.bfloat16} 37 | 38 | # Load the model 39 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 40 | 41 | # Save the model 42 | if args.save_path is None: 43 | checkpoint_path = args.checkpoint_path + f"-{args.load_bit}" 44 | else: 45 | checkpoint_path = args.save_path 46 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_pt_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import argparse 5 | import os 6 | 7 | import torch 8 | 9 | from modeling_otter import OtterForConditionalGeneration 10 | 11 | 12 | # The function is to inject newly trained otter perceiver parameters into the pretrained otter init model. 13 | @torch.no_grad() 14 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 15 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 16 | if old_ckpt.get("model_state_dict", None) is not None: 17 | old_ckpt = old_ckpt["model_state_dict"] 18 | new_ckpt = old_ckpt 19 | # folder_path = os.path.dirname(old_ckpt_path) 20 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "otter/config.json" 21 | model = OtterForConditionalGeneration.from_pretrained( 22 | args.pretrained_model_path, 23 | device_map="auto", 24 | ) 25 | 26 | if "flamingo" in args.pretrained_model_path: 27 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": [""]}) 28 | if "LlamaForCausalLM" in model.lang_encoder.__class__.__name__: 29 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 30 | 31 | _ = model.load_state_dict(new_ckpt, strict=False) 32 | print(f"Saving HF model to {new_folder_path}") 33 | model.save_pretrained(new_folder_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--old_ckpt_path", 40 | "-old", 41 | type=str, 42 | required=True, 43 | help="Path to the pt checkpoint", 44 | ) 45 | parser.add_argument( 46 | "--new_hf_path", 47 | "-new", 48 | type=str, 49 | required=True, 50 | help="Path to the hf folder", 51 | ) 52 | parser.add_argument( 53 | "--pretrained_model_path", 54 | "-pretrained", 55 | type=str, 56 | default="luodian/OTTER-MPT7B-Init", 57 | required=True, 58 | help="Path to the pretrained model folder.", 59 | ) 60 | args = parser.parse_args() 61 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 62 | os.makedirs(os.path.dirname(args.new_hf_path)) 63 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 64 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_otter import OtterForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705-LoRA", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | # Load the model 37 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 38 | 39 | # adding lora 40 | standard_modules = 
["q_proj", "v_proj"] 41 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 42 | model_to_lora_modules = { 43 | "llama": standard_modules, 44 | "opt": standard_modules, 45 | "gptj": standard_modules, 46 | "gpt_neox": ["query_key_value"], 47 | "mpt": ["Wqkv"], 48 | } 49 | lora_config = LoraConfig( 50 | r=16, 51 | lora_alpha=32, 52 | lora_dropout=0.05, 53 | task_type=TaskType.CAUSAL_LM, 54 | target_modules=model_to_lora_modules[lang_encoder_short_name], 55 | ) 56 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 57 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 58 | 59 | # Save the model 60 | checkpoint_path = args.save_path 61 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 62 | -------------------------------------------------------------------------------- /unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/unit_tests/__init__.py -------------------------------------------------------------------------------- /unit_tests/test_mmc4_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock 3 | from pipeline.mimicit_utils.data import get_mmc4_dataset 4 | 5 | 6 | class TestGetMMC4Dataset(unittest.TestCase): 7 | def test_get_mmc4_dataset(self): 8 | # Mock the required inputs 9 | args = Mock( 10 | mmc4_shards="/home/luodian/projects/Otter/archived/000000000.tar", 11 | train_num_samples_mmc4=1000, 12 | mmc4_textsim_threshold=0.32, 13 | batch_size_mmc4=10, 14 | seed=0, 15 | workers=2, 16 | world_size=1, 17 | ) 18 | image_processor = Mock() 19 | tokenizer = Mock() 20 | 21 | # Call the function to test 22 | data_info = get_mmc4_dataset(args, image_processor, tokenizer) 23 | 24 | # Check if the dataloader's attributes are as expected 25 | self.assertEqual(data_info.dataloader.num_batches, 100) 26 | self.assertEqual(data_info.dataloader.num_samples, 1000) 27 | -------------------------------------------------------------------------------- /unit_tests/test_prerun.py: -------------------------------------------------------------------------------- 1 | # Inside tests/unit_tests/test_prerun.py 2 | import os 3 | import yaml 4 | import pytest 5 | import orjson 6 | import pandas as pd 7 | 8 | 9 | # Define the pytest fixture 10 | @pytest.fixture 11 | def yaml_data(request): 12 | yaml_path = request.config.getoption("--yaml-path") 13 | if not yaml_path or not os.path.exists(yaml_path): 14 | pytest.fail(f"YAML file path '{yaml_path}' does not exist.") 15 | with open(yaml_path, "r") as f: 16 | data = yaml.safe_load(f) 17 | return data 18 | 19 | 20 | # Your test function 21 | @pytest.mark.prerun 22 | def test_yaml_structure(yaml_data): 23 | required_categories = [ 24 | "IMAGE_TEXT", 25 | "TEXT_ONLY", 26 | "VIDEO_TEXT", 27 | "IMAGE_TEXT_IN_CONTEXT", 28 | ] 29 | 30 | for category, datasets in yaml_data.items(): 31 | assert category in required_categories, f"Unexpected category '{category}' in YAML. Expected categories are {required_categories}." 32 | 33 | for dataset_name, data in datasets.items(): 34 | for path_key, path_value in data.items(): 35 | if path_key.endswith("_path"): 36 | assert os.path.exists(path_value), f"Dataset path {path_value} specified under {category} -> {dataset_name} does not exist." 
37 | elif path_key == "num_samples": 38 | assert isinstance(path_value, int), f"'num_samples' should be an integer but got {type(path_value)} under {category} -> {dataset_name}." 39 | 40 | # checking mimicit path aligns with corresponding format. 41 | if path_key == "mimicit_path": 42 | print(f"Checking -> {path_value} in MIMICIT format.") 43 | with open(path_value, "rb") as f: 44 | data = orjson.loads(f.read()) 45 | 46 | assert "data" in data 47 | 48 | if path_key == "images_path": 49 | print(f"Checking -> {path_value} in images format.") 50 | assert os.path.exists(path_value), f"Dataset path {path_value} specified under {category} -> {dataset_name} does not exist." 51 | # # Read the parquet file using pandas 52 | # df = pd.read_parquet(path_value) 53 | 54 | # # Check for the 'base64' column 55 | # assert "base64" in df.columns, f"The 'base64' column was not found in the dataset {path_value}." 56 | -------------------------------------------------------------------------------- /xformers_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import CLIPVisionModel 2 | from .llama import LlamaForCausalLM 3 | --------------------------------------------------------------------------------
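# A sketch of the training-data YAML layout expected by unit_tests/test_prerun.py above,
# written out with PyYAML. The category name comes from the test's required list; the
# dataset entry, file paths, and sample count are hypothetical, and the referenced files
# must exist on disk (with the MIMIC-IT json holding a top-level "data" key) for the
# prerun check to pass.
import yaml

example_config = {
    "IMAGE_TEXT": {
        "LADD": {  # hypothetical dataset entry
            "mimicit_path": "/data/mimicit/LADD_instructions.json",  # checked to contain {"data": ...}
            "images_path": "/data/mimicit/LA.json",  # existence is checked
            "num_samples": 1000,  # must be an int
        }
    }
}

with open("training_data.yaml", "w") as f:
    yaml.safe_dump(example_config, f)

# The prerun check can then be invoked with something like:
#   pytest -m prerun unit_tests/test_prerun.py --yaml-path training_data.yaml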