├── .gitattributes ├── .github ├── issue_template.md ├── pull_request_template.md └── workflows │ └── black.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── conftest.py ├── docs ├── OtterHD.md ├── benchmark_eval.md ├── credits.md ├── huggingface_compatible.md ├── mimicit_format.md └── server_host.md ├── environment.yml ├── mimic-it ├── README.md ├── convert-it │ ├── README.md │ ├── __init__.py │ ├── abstract_dataset.py │ ├── datasets │ │ ├── 2d.py │ │ ├── 3d.py │ │ ├── __init__.py │ │ ├── change.py │ │ ├── fpv.py │ │ ├── utils │ │ │ ├── scene_navigation_utils.py │ │ │ └── visual_story_telling_utils.py │ │ └── video.py │ ├── image_utils.py │ └── main.py └── syphus │ ├── abstract_dataset.py │ ├── datasets │ ├── 3d.py │ ├── __init__.py │ ├── change.py │ ├── fpv.py │ ├── funqa.py │ ├── translate.py │ └── video.py │ ├── file_utils.py │ ├── main.py │ └── prompts │ ├── coco_spot_the_difference_prompt.py │ ├── dense_captions.json │ ├── ego4d.json │ ├── funqa_dia.json │ ├── funqa_mcqa.json │ ├── funqa_translation.json │ ├── scene_navigation.json │ ├── spot_the_difference.json │ ├── translation_prompt.py │ ├── tv_captions.json │ └── visual_story_telling.json ├── pipeline ├── accelerate_configs │ ├── accelerate_config_ddp.yaml │ ├── accelerate_config_fsdp.yaml │ ├── accelerate_config_zero1.yaml │ ├── accelerate_config_zero2.yaml │ ├── accelerate_config_zero2_slurm.yaml │ ├── accelerate_config_zero3.yaml │ ├── accelerate_config_zero3_offload.yaml │ ├── accelerate_config_zero3_slurm.yaml │ └── ds_zero3_config.json ├── benchmarks │ ├── .gitignore │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── base_eval_dataset.py │ │ ├── magnifierbench.py │ │ ├── mathvista.py │ │ ├── mmbench.py │ │ ├── mme.py │ │ ├── mmvet.py │ │ ├── pope.py │ │ ├── scienceqa.py │ │ └── seedbench.py │ ├── evaluate.py │ ├── models │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── frozen_bilm.py │ │ ├── fuyu.py │ │ ├── gpt4v.py │ │ ├── idefics.py │ │ ├── instructblip.py │ │ ├── llama_adapter.py │ │ ├── llava_model.py │ │ ├── mplug_owl.py │ │ ├── otter_image.py │ │ ├── otter_video.py │ │ ├── otterhd.py │ │ ├── qwen_vl.py │ │ ├── video_chat.py │ │ └── video_chatgpt.py │ └── public_datasets_suite │ │ ├── README.md │ │ ├── __init__.py │ │ ├── classification_utils.py │ │ ├── coco_metric.py │ │ ├── config.yaml │ │ ├── eval_datasets.py │ │ ├── eval_model.py │ │ ├── evaluate.py │ │ ├── get_args.ipynb │ │ ├── models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── idefics.py │ │ ├── open_flamingo.py │ │ ├── otter.py │ │ └── utils.py │ │ ├── ok_vqa_utils.py │ │ ├── run_eval_coco_idefics.sh │ │ ├── run_eval_coco_otter.sh │ │ ├── run_eval_otter.sh │ │ ├── run_eval_otter_slurm.sh │ │ └── vqa_metric.py ├── demos │ ├── __init__.py │ ├── demo_models.py │ ├── demo_utils.py │ ├── inference.py │ └── interactive │ │ ├── otter_image.ipynb │ │ ├── otter_image.py │ │ ├── otter_image_incontext.py │ │ ├── otter_video.ipynb │ │ └── otter_video.py ├── mimicit_utils │ ├── data.py │ ├── mimicit_dataset.py │ └── transforms.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── conversation.py │ ├── deploy │ │ ├── conversation.py │ │ ├── deploy.py │ │ ├── otterhd_endpoint.py │ │ └── utils.py │ ├── gradio_css.py │ ├── gradio_patch.py │ ├── gradio_web_server.py │ ├── gradio_web_server_video.py │ ├── model_worker.py │ ├── multiplex_script │ │ └── otter_image_server.py │ ├── register_worker.py │ ├── serving_utils.py │ └── test_message.py ├── train │ ├── .gitignore │ ├── __init__.py │ ├── config.yaml │ ├── distributed.py │ ├── 
instruction_following.py │ ├── pretraining.py │ ├── pretraining_cc3m.py │ ├── train_args.py │ └── train_utils.py └── utils │ ├── __init__.py │ ├── apply_delta.py │ ├── convert_laion400m-tsv_to_laion400m-tar_mp_shard.py │ ├── convert_mmc4_to_wds.py │ ├── convert_to_parquet.py │ ├── general.py │ ├── make_a_train.py │ └── modeling_value_head.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.py ├── shared_scripts ├── Demo_Data.yaml ├── Demo_OtterHD.sh ├── Demo_OtterMPT.sh └── Otter_MPT7B_Train_Decoder.json ├── src └── otter_ai │ ├── __init__.py │ └── models │ ├── __init__.py │ ├── falcon │ ├── __init__.py │ ├── configuration_RW.py │ └── modelling_RW.py │ ├── flamingo │ ├── __init__.py │ ├── config.json │ ├── configuration_flamingo.py │ ├── converting_flamingo_to_bf16.py │ ├── converting_flamingo_to_hf.py │ ├── converting_flamingo_to_lora.py │ ├── flamingo-falcon-7B.json │ ├── flamingo-llama2-chat-13B.json │ ├── flamingo-llama2-chat-7B.json │ ├── flamingo-mpt-1B-redpajama.json │ ├── flamingo-mpt-30B-bf16.json │ ├── flamingo-mpt-30B.json │ ├── flamingo-mpt-7B.json │ ├── flamingo-vicuna-33B-v1.3.json │ ├── flamingo-vicuna-7B-v1.3.json │ ├── injecting_falcon_into_flamingo.py │ ├── injecting_llama2_into_flamingo.py │ ├── injecting_mpt-1B-redpajama_into_flamingo.py │ ├── injecting_mpt_into_flamingo.py │ ├── injecting_vicuna_into_flamingo.py │ ├── modeling_flamingo.py │ ├── utils.py │ └── utils │ │ ├── converting_flamingo_to_bf16.py │ │ ├── converting_flamingo_to_hf.py │ │ ├── converting_flamingo_to_lora.py │ │ ├── flamingo-falcon-7B.json │ │ ├── flamingo-llama2-chat-13B.json │ │ ├── flamingo-llama2-chat-7B.json │ │ ├── flamingo-mpt-1B-redpajama.json │ │ ├── flamingo-mpt-30B-bf16.json │ │ ├── flamingo-mpt-30B.json │ │ ├── flamingo-mpt-7B.json │ │ ├── flamingo-vicuna-33B-v1.3.json │ │ ├── flamingo-vicuna-7B-v1.3.json │ │ ├── injecting_falcon_into_flamingo.py │ │ ├── injecting_llama2_into_flamingo.py │ │ ├── injecting_mpt-1B-redpajama_into_flamingo.py │ │ ├── injecting_mpt_into_flamingo.py │ │ └── injecting_vicuna_into_flamingo.py │ ├── fuyu │ ├── modeling_fuyu.py │ ├── modeling_persimmon.py │ └── processing_fuyu.py │ ├── mpt │ ├── __init__.py │ ├── adapt_tokenizer.py │ ├── attention.py │ ├── blocks.py │ ├── configuration_mpt.py │ ├── custom_embedding.py │ ├── flash_attn_triton.py │ ├── hf_prefixlm_converter.py │ ├── meta_init_context.py │ ├── modeling_mpt.py │ ├── norm.py │ └── param_init_fns.py │ ├── mpt_redpajama │ ├── __init__.py │ ├── attention.py │ ├── configuration_mosaic_gpt.py │ ├── gpt_blocks.py │ ├── low_precision_layernorm.py │ ├── mosaic_gpt.py │ └── param_init_fns.py │ └── otter │ ├── Otter-MPT7B-config.json │ ├── Otter-MPT7B-config.json~0d12192f665f5e9da1ecb2f23d6a360eb7753771 │ ├── Otter-MPT7B-config.json~HEAD │ ├── __init__.py │ ├── config.json │ ├── configuration_otter.py │ ├── converting_flamingo_to_otter.py │ ├── converting_otter_fp32_to_fp16.py │ ├── converting_otter_pt_to_hf.py │ ├── converting_otter_to_lora.py │ ├── flamingo_pt2otter_hf.py │ ├── modeling_otter.py │ └── utils │ ├── Otter-MPT7B-config.json │ ├── config.json │ ├── converting_flamingo_to_otter.py │ ├── converting_otter_fp32_to_fp16.py │ ├── converting_otter_pt_to_hf.py │ ├── converting_otter_to_lora.py │ └── flamingo_pt2otter_hf.py ├── unit_tests ├── __init__.py ├── test_mmc4_dataset.py └── test_prerun.py └── xformers_model ├── __init__.py ├── clip.py └── llama.py /.gitattributes: -------------------------------------------------------------------------------- 1 | 
example_unified_data/negative_sample/all_captions.txt filter=lfs diff=lfs merge=lfs -text 2 | LAVIS/coco-caption/annotations/captions_val2014.json filter=lfs diff=lfs merge=lfs -text 3 | LAVIS/coco-caption/annotations/caption_flickr30k.json filter=lfs diff=lfs merge=lfs -text -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | Before you open an issue, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open an issue, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | - [ ] Assign an issue type tag (label): 8 | - `dataset` (mimic-it download, usage, etc.), 9 | - `demo` (online demo), `doc` (readme, wiki, paper, video etc.), 10 | - `evaluation` (evaluation result, performance of Otter etc.), 11 | - `model` (model configuration, components, etc.), 12 | - `train` (training configuration, process, code, etc.) 13 | 14 | Thank you for your contributions! 15 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Before you open a pull-request, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open a pull-request, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | - [ ] Assign an issue type tag (label): 8 | - `dataset` (mimic-it download, usage, etc.), 9 | - `demo` (online demo), 10 | - `doc` (readme, wiki, paper, video, etc.), 11 | - `evaluation` (evaluation result, performance of Otter, etc.), 12 | - `model` (model configuration, components, etc.), 13 | - `train` (training configuration, process, code, etc.) 14 | 15 | Thank you for your contributions! 
16 | -------------------------------------------------------------------------------- /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: psf/black@stable -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *.py[cod] 3 | __pycache__/ 4 | *$py.class 5 | *.sage.py 6 | 7 | # Special and Backup files 8 | *.bak 9 | *.log 10 | *.tsv 11 | *.gz 12 | *.zip 13 | *.dat 14 | *.dir 15 | *.html 16 | *.mp4 17 | *.MP4 18 | *.png 19 | *.pt 20 | *.bin 21 | *.mo 22 | *.pot 23 | *.manifest 24 | *.spec 25 | *.egg 26 | *.so 27 | checkpoint 28 | LICENSE 29 | 30 | # Editor and IDE configurations 31 | .vscode/ 32 | .spyderproject 33 | .spyproject 34 | .ropeproject 35 | 36 | # Packaging and Distribution 37 | *.egg-info/ 38 | dist/ 39 | build/ 40 | develop-eggs/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib/ 45 | lib64/ 46 | sdist/ 47 | wheels/ 48 | parts/ 49 | share/python-wheels/ 50 | .installed.cfg 51 | MANIFEST 52 | 53 | # Unit Test and Coverage 54 | .coverage* 55 | .cache 56 | *.cover 57 | *.py,cover 58 | .htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | mypy_cache/ 65 | .dmypy.json 66 | dmypy.json 67 | .pyre/ 68 | .pytype/ 69 | cython_debug/ 70 | */__pycache__/** 71 | */.pytest_cache/** 72 | */.mypy_cache/** 73 | */.cache/** 74 | */.coverage.* 75 | */.c 76 | 77 | # Environments and Dependencies 78 | .env 79 | .venv 80 | .env.bak/ 81 | .venv.bak/ 82 | env/ 83 | venv/ 84 | ENV/ 85 | pip-log.txt 86 | pip-delete-this-directory.txt 87 | Pipfile.lock 88 | poetry.lock 89 | __pypackages__/ 90 | 91 | # Web Frameworks 92 | local_settings.py 93 | db.sqlite3 94 | db.sqlite3-journal 95 | instance/ 96 | .webassets-cache 97 | .scrapy 98 | .site 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # Documentation and Notebooks 103 | docs/_build/ 104 | .ipynb_checkpoints 105 | profile_default/ 106 | ipython_config.py 107 | 108 | # Project specific 109 | output/ 110 | debug*/ 111 | wandb/ 112 | archived/ 113 | amlt/ 114 | scripts/ 115 | nginx/ 116 | logs/ 117 | ofa_compress/ 118 | train_*.sh 119 | gpt_playground/ 120 | data/ 121 | azure/ 122 | .deepspeed_env 123 | checkpoints/ 124 | pipeline/serve/examples/ 125 | mimic-it/syphus/annotations/** 126 | tools/ 127 | otter9B-mpt7b-0705/ 128 | tokenizer_checklist.chk 129 | tokenizer.model 130 | download.sh 131 | USE_POLICY.md 132 | 133 | # Miscellaneous 134 | *.pyc 135 | open_flamingo.egg-info 136 | llama-7b-hf/* 137 | cache/ 138 | *.code-workspace 139 | pipeline/benchmarks/ckpts/* 140 | pipeline/benchmarks/models/Ask_Anything 141 | pipeline/benchmarks/models/FrozenBiLM 142 | pipeline/benchmarks/models/LLaMA_Adapter 143 | 144 | shared_scripts/gcp_instance/** 145 | shared_scripts/shai_instance/** 146 | pipeline/benchmarks/models/Video_ChatGPT 147 | pipeline/benchmarks/models/LLaVA 148 | pipeline/benchmarks/models/llava 149 | pipeline/benchmarks/evaluation_result 150 | pipeline/serve/user_logs 151 | config.yaml 152 | 153 | azure_storage/ 154 | checkpoints/ 155 | *.ttf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 4 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--yaml-path", 7 | action="store", 8 | default="default_yaml_path.yaml", 9 | help="Path to the YAML file", 10 | ) 11 | 12 | 13 | @pytest.fixture 14 | def yaml_path(request): 15 | return request.config.getoption("--yaml-path") 16 | -------------------------------------------------------------------------------- /docs/benchmark_eval.md: -------------------------------------------------------------------------------- 1 | # Welcome to the benchmark evaluation page! 2 | 3 | The evaluation pipeline is designed to be one-click and easy to use. However, you may encounter some problems when running models (e.g., LLaVA, LLaMA-Adapter) that require you to clone their repos to a local path. Please feel free to contact us if you have any questions. 4 | 5 | We support the following benchmarks: 6 | - MagnifierBench 7 | - MMBench 8 | - MM-VET 9 | - MathVista 10 | - POPE 11 | - MME 12 | - ScienceQA 13 | - SeedBench 14 | 15 | And the following models: 16 | - LLaVA 17 | - Fuyu 18 | - OtterHD 19 | - Otter-Image 20 | - Otter-Video 21 | - Idefics 22 | - LLaMA-Adapter 23 | - Qwen-VL 24 | 25 | Many more are supported; see `/pipeline/benchmarks/models`: 26 | 27 | https://github.com/Luodian/Otter/tree/main/pipeline/benchmarks/models 28 | 29 | Create a YAML file `benchmark.yaml` with the content below: 30 | ```yaml 31 | datasets: 32 | - name: magnifierbench 33 | split: test 34 | data_path: Otter-AI/MagnifierBench 35 | prompt: Answer with the option letter from the given choices directly.
36 | api_key: [You GPT-4 API] 37 | - name: mme 38 | split: test 39 | - name: pope 40 | split: test 41 | default_output_path: ./logs 42 | - name: mmvet 43 | split: test 44 | api_key: [You GPT-4 API] 45 | gpt_model: gpt-4-0613 46 | - name: mathvista 47 | split: test 48 | api_key: [You GPT-4 API] 49 | gpt_model: gpt-4-0613 50 | - name: mmbench 51 | split: test 52 | models: 53 | - name: fuyu 54 | model_path: adept/fuyu-8b 55 | ``` 56 | 57 | Then run 58 | 59 | ```python 60 | python -m pipeline.benchmarks.evaluate --confg benchmark.yaml 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/credits.md: -------------------------------------------------------------------------------- 1 | # Credits 🌟 2 | 3 | The Otter/MIMIC-IT/OtterHD project is a collaborative effort involving many distinguished contributors. Below are their contributions across various aspects of the project. 4 | 5 | ## Otter/MIMIC-IT 🚀 6 | - **[Otter Team Lead/Model/Training]**: [Bo Li](https://brianboli.com/) 🧠💻, [Yuanhan Zhang](https://zhangyuanhan-ai.github.io/) 🧠💻 7 | - **[MIMIC-IT Dataset Construction]**: [Liangyu Chen](https://cliangyu.com/) 📊, [Jinghao Wang](https://king159.github.io/) 📊🛠️, [Fanyi Pu](https://pufanyi.github.io/) 📊🛠️, [Jingkang Yang](https://jingkang50.github.io/) 📝 8 | - **[Training Dataset Maintenance]**: [Fanyi Pu](https://pufanyi.github.io/) 📚, [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🛠️ 9 | - **[Evaluation Suite Maintenance]**: [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🖥️🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 10 | 11 | ## OtterHD 🎥 12 | - **[OtterHD Model/Training/Optimization]**: [Bo Li](https://brianboli.com/) 🧠💻, [Peiyuan Zhang](https://www.linkedin.com/in/lance-peiyuan-zhang-5b2886194/?originalSubdomain=sg) 🧠⚙️ 13 | - **[Magnifier Dataset Construction]**: [Jingkang Yang](https://jingkang50.github.io/) 🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 14 | - **[Evaluation Suite Maintenance]**: [Joshua Adrian Cahyono](https://www.linkedin.com/in/joshua-adrian-cahyono-5230b814b/?originalSubdomain=sg) 🖥️🛠️, [Fanyi Pu](https://pufanyi.github.io/) 🖥️ 15 | 16 | ## Academic/Directional Guidance 📚 17 | - [Ziwei Liu](https://liuziwei7.github.io/) 👨‍🏫, [Chunyuan Li](https://chunyuan.li/) 👨‍🏫. 18 | 19 | --- 20 | 21 | Emojis to symbolize the type of contribution (e.g., brain for leadership, tools for construction, books for maintenance, etc.), and should make the document more visually appealing and enjoyable to read. Remember to check for any potential accessibility issues that might arise from using emojis, as some readers might rely on screen readers or other assistive technologies. 22 | -------------------------------------------------------------------------------- /docs/huggingface_compatible.md: -------------------------------------------------------------------------------- 1 | ## 🤗 Hugging Face Model 2 | 3 | You can use the 🦩 Flamingo model / 🦦 Otter model as a 🤗 Hugging Face model with only a few lines! One-click and then model configs/weights are downloaded automatically. 
4 | 5 | ``` python 6 | from flamingo import FlamingoModel 7 | flamingo_model = FlamingoModel.from_pretrained("luodian/openflamingo-9b-hf", device_map="auto") 8 | 9 | from otter import OtterModel 10 | otter_model = OtterModel.from_pretrained("luodian/otter-9b-hf", device_map="auto") 11 | ``` 12 | 13 | The original [OpenFlamingo](https://github.com/mlfoundations/open_flamingo) was developed with [DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) (DDP) on an A100 cluster. Loading OpenFlamingo-9B onto a GPU requires **at least 33G of GPU memory**, which is only available on A100 GPUs. 14 | 15 | In order to allow more researchers without access to A100 machines to try training OpenFlamingo, we wrap the OpenFlamingo model into a 🤗 Hugging Face model ([Jinghao](https://king159.github.io/) has submitted a [PR](https://github.com/huggingface/transformers/pull/23063) to huggingface/transformers!). Via `device_map="auto"`, the large model is sharded across multiple GPUs when loading and training. This can help researchers who do not have access to A100-80G GPUs achieve similar training throughput; we tested training on 4x RTX-3090-24G GPUs and model deployment on 2x RTX-3090-24G GPUs. Specific details are below (they may vary depending on CPU and disk performance, as we conducted training on different machines). 16 | 17 |
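Below is a minimal sketch (not part of the original docs) of how to cap per-GPU memory and inspect the resulting placement when loading with `device_map="auto"`. The `max_memory` argument and the `hf_device_map` attribute are standard 🤗 Transformers/Accelerate features; the `OtterModel` import and the memory figures are assumptions based on the example above.

``` python
import torch
from otter import OtterModel  # assumed importable, as in the example above

# Shard the checkpoint across the visible GPUs, capping each one at ~22GiB
# (e.g. two RTX-3090-24G cards) and spilling any remainder to CPU RAM.
otter_model = OtterModel.from_pretrained(
    "luodian/otter-9b-hf",
    device_map="auto",
    max_memory={0: "22GiB", 1: "22GiB", "cpu": "64GiB"},
    torch_dtype=torch.bfloat16,  # roughly halves memory versus fp32
)

# Inspect which device (GPU index, "cpu", or "disk") each submodule landed on.
print(otter_model.hf_device_map)
```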
18 | 19 |
20 | 21 | 25 | 26 | Our Otter model is also developed in this way and it's deployed on the 🤗 Hugging Face model hub. Our model can be hosted on two RTX-3090-24G GPUs and achieve a similar speed to one A100-80G machine. -------------------------------------------------------------------------------- /docs/server_host.md: -------------------------------------------------------------------------------- 1 | ## 🪩 Serving Demo 2 | 3 | We will show you how to host a demo on your own computer using gradio. 4 | 5 | ## Preparation 6 | 7 | ### Warnings: Newest `gradio` and `gradio_client` versions may cause errors ❗❗❗ 8 | 9 | Please keep the packages fixed with the following versions (my local verified model serving environment). 10 | ``` 11 | braceexpand==0.1.7 12 | einops==0.7.0 13 | fastapi==0.104.1 14 | gradio==4.7.1 15 | horovod==0.27.0 16 | huggingface_hub==0.14.0 17 | ijson==3.2.3 18 | importlib_metadata==6.6.0 19 | inflection==0.5.1 20 | markdown2==2.4.8 21 | natsort==8.4.0 22 | nltk==3.8.1 23 | numpy==1.26.2 24 | openai==1.3.7 25 | opencv_python==4.8.1.78 26 | opencv_python_headless==4.8.1.78 27 | orjson==3.9.10 28 | packaging==23.2 29 | Pillow==10.1.0 30 | pycocoevalcap==1.2 31 | pycocotools==2.0.7 32 | Requests==2.31.0 33 | tqdm==4.65.0 34 | transformers==4.35.0 35 | uvicorn==0.24.0.post1 36 | webdataset==0.2.79 37 | ``` 38 | 39 | ### Download the checkpoints 40 | 41 | The 🦦 Otter checkpoint and the 🦩 Open Flamingo checkpoint can be auto-downloaded with the code below. 42 | 43 | ## Start Demo 44 | 45 | ### Launch a controller 46 | 47 | ```Shell 48 | python -m pipeline.serve.controller --host 0.0.0.0 --port 10000 49 | ``` 50 | 51 | ### Launch a model worker 52 | 53 | ```Shell 54 | # Init our 🦦 Otter model on GPU 55 | CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model_name otter --checkpoint_path luodian/otter-9b-hf --num_gpus 2 --limit_model_concurrency 200 56 | # Init our 🦦 Otter video model on CPU 57 | CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40002 --worker http://localhost:40002 --model_name otter_video --checkpoint_path checkpoint/otter9B_DC_fullset_16frames/ --num_gpus 2 --limit_model_concurrency 200 --load_bit 16 58 | # Init original open flamingo model on GPU 59 | CUDA_VISIBLE_DEVICES=2,3 python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model_name open_flamingo --checkpoint_path luodian/openflamingo-9b-hf --num_gpus 2 --limit_model_concurrency 200 60 | 61 | # Init original open flamingo model on CPU 62 | python -m pipeline.serve.model_worker --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model_name open_flamingo_original --checkpoint_path luodian/openflamingo-9b-hf --num_gpus 0 63 | ``` 64 | 65 | Wait until the process finishes loading the model and you see "Uvicorn running on ...". 66 | 67 | ### Launch a gradio web server 68 | 69 | ```Shell 70 | # Image demo 71 | python -m pipeline.serve.gradio_web_server --controller http://localhost:10000 --port 7861 72 | # Video demo 73 | python -m pipeline.serve.gradio_web_server_video --controller http://localhost:10000 --port 7862 74 | ``` 75 | 76 | Now, you can open your browser and chat with the model! 
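For convenience, the three launch steps above can also be chained from a single Python script. The sketch below is not part of the repo: it simply reuses the commands, ports, and checkpoint path from the examples above, and the sleep durations are rough placeholders; watch the logs for "Uvicorn running on ..." before connecting.

``` python
import subprocess
import time

# Launch the controller, one 🦦 Otter image worker, and the image web demo,
# using the same commands and ports documented above.
controller = subprocess.Popen(
    "python -m pipeline.serve.controller --host 0.0.0.0 --port 10000", shell=True
)
time.sleep(10)  # give the controller a moment to start

worker = subprocess.Popen(
    "CUDA_VISIBLE_DEVICES=0,1 python -m pipeline.serve.model_worker "
    "--controller http://localhost:10000 --port 40000 --worker http://localhost:40000 "
    "--model_name otter --checkpoint_path luodian/otter-9b-hf "
    "--num_gpus 2 --limit_model_concurrency 200",
    shell=True,
)
time.sleep(120)  # wait until the worker prints "Uvicorn running on ..."

web = subprocess.Popen(
    "python -m pipeline.serve.gradio_web_server --controller http://localhost:10000 --port 7861",
    shell=True,
)
# Note: the controller and worker keep running in the background; call
# controller.terminate() and worker.terminate() when you are done.
web.wait()
```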
77 | 78 | ### Examples 79 | If you encounter error stating `FileNotFoundError: [Errno 2] No such file or directory: '/home/luodian/projects/Otter/pipeline/serve/examples/Apple Vision Pro - Reveal Trailer.mp4'` 80 | 81 | That's because we didnt upload the video examples on Github. You could visit the following [folder](https://entuedu-my.sharepoint.com/:f:/g/personal/libo0013_e_ntu_edu_sg/EjjDhJm4G35EgVHo0Pxi7dEBM7rqdN3e0ZcBCskWuIubUQ?e=C58jI3) to download our used examples and put them to the right place. 82 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: otter 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.9 6 | - conda-forge::openjdk 7 | - pip 8 | - pip: 9 | - -r requirements.txt -------------------------------------------------------------------------------- /mimic-it/convert-it/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/convert-it/__init__.py -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/2d.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from abstract_dataset import AbstractDataset 4 | 5 | 6 | class Llava(AbstractDataset): 7 | def __init__( 8 | self, 9 | name: str = "Llava", 10 | short_name="LA", 11 | *, 12 | image_root: str, 13 | image_path: str, 14 | num_threads: int, 15 | ): 16 | """ 17 | Initializes a Llava in-context dataset. 18 | 19 | Args: 20 | name (str): The name of the dataset. Defaults to "Llava". 21 | short_name (str): The short name of the dataset. Defaults to "LA". 22 | image_root (str): The root path to the COCO image train split. 23 | image_path (str): The path to the JSON file containing the dataset images. 24 | The images can be downloaded from: 25 | https://drive.google.com/file/d/1OVb4_3Uec_xbyUk90aWC6LFpKsIOtR7v/view?usp=sharing. 26 | num_threads (int): The number of threads to use for processing the images. 27 | """ 28 | self.image_root = image_root 29 | super().__init__(name, short_name, image_path, num_threads) 30 | 31 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 32 | """ 33 | Loads the images from the dataset. 34 | 35 | Args: 36 | image_path (str): The path to the JSON file containing the dataset images. 37 | num_threads (int): The number of threads to use for processing the images. 38 | 39 | Returns: 40 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are bytes objects representing the images. 
41 | """ 42 | 43 | def read_image(file_name) -> bytes: 44 | with open(file_name, "rb") as f: 45 | return f.read() 46 | 47 | images = {} 48 | with open(image_path) as f: 49 | image_ids = json.load(f).keys() 50 | 51 | for cur_image_id in image_ids: 52 | images[cur_image_id] = read_image(f"{self.image_root}/{cur_image_id}.jpg") 53 | 54 | return images 55 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/3d.py: -------------------------------------------------------------------------------- 1 | from abstract_dataset import AbstractDataset 2 | 3 | 4 | class SceneNavigation(AbstractDataset): 5 | def __init__( 6 | self, 7 | name: str = "SceneNavigation", 8 | short_name="SN", 9 | *, 10 | image_path: str, 11 | num_threads: int, 12 | ): 13 | """ 14 | Initializes a SceneNavigation dataset. 15 | 16 | Args: 17 | name (str): The name of the dataset. Defaults to "SceneNavigation". 18 | short_name (str): The short name of the dataset. Defaults to "SN". 19 | image_path (str): The directory path of the folder named "scannet_frames_25k" obtained by downloading a compressed file from http://www.scan-net.org/ and extracting it. 20 | num_threads (int): The number of threads to use for processing the images. 21 | """ 22 | super().__init__(name, short_name, image_path, num_threads) 23 | 24 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 25 | """ 26 | Loads the images from the dataset. 27 | 28 | Args: 29 | image_path (str): The path to the directory containing the images downloaded from http://www.scan-net.org/. 30 | num_threads (int): The number of threads to use for processing the images. 31 | 32 | Returns: 33 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are byte strings representing the images. 34 | """ 35 | from datasets.utils.scene_navigation_utils import process_data 36 | 37 | return process_data(image_path, num_thread) 38 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/convert-it/datasets/__init__.py -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/fpv.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from glob import glob 4 | 5 | from abstract_dataset import AbstractDataset 6 | from image_utils import frame_video 7 | 8 | from tqdm import tqdm 9 | from concurrent.futures import ThreadPoolExecutor 10 | 11 | 12 | class EGO4D(AbstractDataset): 13 | def __init__( 14 | self, 15 | name: str = "EGO4D", 16 | short_name="E4D", 17 | *, 18 | image_path: str, 19 | num_threads: int, 20 | ): 21 | """ 22 | Initializes an EGO4D dataset. 23 | 24 | Args: 25 | name (str): The name of the dataset. Defaults to "EGO4D". 26 | short_name (str): The short name of the dataset. Defaults to "E4D". 27 | image_path (str): The directory path of the folder downloaded from https://ego4d-data.org/#download. 28 | num_threads (int): The number of threads to use for processing the images. 29 | """ 30 | super().__init__(name, short_name, image_path, num_threads) 31 | 32 | def _load_images(self, image_path: str, num_thread: int) -> dict[str, bytes]: 33 | """ 34 | Loads the images from the dataset. 
35 | 36 | Args: 37 | image_path (str): The path to the directory containing the images downloaded from https://ego4d-data.org/#download. 38 | num_threads (int): The number of threads to use for processing the images. 39 | 40 | Returns: 41 | dict[str, bytes]: A dictionary where the keys are image identifiers and the values are image bytes. 42 | 43 | Raises: 44 | FileNotFoundError: If the specified image path does not exist. 45 | """ 46 | video_paths = glob(os.path.join(image_path, "*")) 47 | 48 | def get_image(video_path): 49 | images = frame_video(video_path) 50 | images_dict = {} 51 | video_name = os.path.basename(video_path).split(".")[0] 52 | for index, image in enumerate(images): 53 | images_dict[f"{video_name}_{index:08d}"] = image 54 | return images_dict 55 | 56 | final_images_dict = {} 57 | 58 | with ThreadPoolExecutor(max_workers=num_thread) as executor: 59 | process_bar = tqdm( 60 | total=len(video_paths), 61 | unit="video", 62 | desc="Processing videos into images", 63 | ) 64 | for images_dict in executor.map(get_image, video_paths): 65 | final_images_dict.update(images_dict) 66 | process_bar.update() 67 | process_bar.close() 68 | 69 | return final_images_dict 70 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/utils/scene_navigation_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from glob import glob 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | from image_utils import process_image 8 | 9 | 10 | def process(cur_dir, img_root): 11 | """ 12 | Process images in a directory. 13 | 14 | Args: 15 | cur_dir (str): The name of the current directory. 16 | img_root (str): The root directory of the images. 17 | 18 | Returns: 19 | dict: A dictionary containing processed images. The keys are unique identifiers 20 | for each image, and the values are the processed images. 21 | 22 | """ 23 | root = os.path.join(img_root, cur_dir, "color") 24 | file_list = os.listdir(root) 25 | images = {} 26 | for cur_file in file_list: 27 | file_name = os.path.join(img_root, cur_dir, "color", cur_file) 28 | with open(file_name, "rb") as f: 29 | img = f.read() 30 | image_id = f"{cur_dir}_color_{cur_file[:-4]}" 31 | images[image_id] = process_image(img) 32 | return images 33 | 34 | 35 | def process_data(img_root: str, num_threads: int): 36 | """ 37 | Process images in parallel using multiple threads. 38 | 39 | Args: 40 | img_root (str): The root directory of the images. 41 | num_threads (int): The number of threads to use for parallel processing. 42 | 43 | Returns: 44 | dict: A dictionary containing processed images. The keys are unique identifiers 45 | for each image, and the values are the processed images. 
46 | 47 | """ 48 | keys_dir = glob(os.path.join(img_root, "scene*_00")) 49 | keys = list(map(os.path.basename, keys_dir)) 50 | all_images = {} 51 | process_bar = tqdm(total=len(keys), unit="image", desc="Loading images") 52 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 53 | for images in executor.map(process, keys, [img_root] * len(keys)): 54 | all_images.update(images) 55 | process_bar.update() 56 | process_bar.close() 57 | return all_images 58 | -------------------------------------------------------------------------------- /mimic-it/convert-it/datasets/utils/visual_story_telling_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | from image_utils import resize_image, create_folder 7 | 8 | 9 | def get_url(image: dict[str]): 10 | """ 11 | Retrieve the URL of the image. 12 | 13 | Args: 14 | image: A dictionary containing image information. 15 | 16 | Returns: 17 | The URL of the image. 18 | 19 | """ 20 | if "url_o" in image: 21 | return image["url_o"] 22 | else: 23 | return image["url_m"] 24 | 25 | 26 | def download_single_image(image: dict[str]) -> tuple[str, bytes]: 27 | """ 28 | Download a single image and resize it. 29 | 30 | Args: 31 | image: A dictionary containing image information. 32 | 33 | Returns: 34 | A tuple containing the image ID and the resized image as bytes. 35 | 36 | """ 37 | url = get_url(image) 38 | id = image["id"] 39 | try: 40 | pic = requests.get(url) 41 | return id, resize_image(pic.content) 42 | except: 43 | return id, None 44 | 45 | 46 | def download(images: list[dict[str]], num_threads: int): 47 | """ 48 | Download multiple images concurrently using thread pooling. 49 | 50 | Args: 51 | images: A list of dictionaries, each containing image information. 52 | num_threads: The number of threads to use for concurrent downloading. 53 | 54 | Returns: 55 | A dictionary mapping image IDs to their corresponding resized images as bytes. 
56 | 57 | """ 58 | output = {} 59 | process_bar = tqdm(total=len(images), unit="image", desc="Downloading images") 60 | expired_images = [] 61 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 62 | for id, image in executor.map(download_single_image, images): 63 | if image is not None: 64 | output[id] = image 65 | else: 66 | expired_images.append(id) 67 | process_bar.update(1) 68 | process_bar.close() 69 | create_folder("output") 70 | with open("output/expired_images.json", "w") as f: 71 | json.dump(expired_images, f, indent=4) 72 | return output 73 | -------------------------------------------------------------------------------- /mimic-it/convert-it/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import orjson 3 | 4 | from abstract_dataset import get_dataset_by_path 5 | from image_utils import get_json_data_generator, create_folder 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--name", type=str, required=True, help="Path to the dataset class.") 11 | parser.add_argument("--num_threads", type=int, default=8, help="Number of threads.") 12 | parser.add_argument("--image_path", help="Path to the prompt file.") 13 | parser.add_argument("--image_root", default=None, help="Path to the image root.") 14 | 15 | args = parser.parse_args() 16 | dataset_args = {} 17 | if args.image_path is not None: 18 | dataset_args["image_path"] = args.image_path 19 | if args.num_threads is not None: 20 | dataset_args["num_threads"] = args.num_threads 21 | if args.image_root is not None: 22 | dataset_args["image_root"] = args.image_root 23 | dataset = get_dataset_by_path(args.name, dataset_args) 24 | dataset_short_name = dataset.short_name 25 | dataset = dict(dataset) 26 | create_folder("output") 27 | 28 | # Open the output JSON file in text mode, since we'll be writing strings 29 | with open(f"output/{dataset_short_name}.json", "w") as f: 30 | # Write the opening brace for the JSON object 31 | f.write("{") 32 | 33 | # Use a flag to track whether a comma is needed before the next key-value pair 34 | need_comma = False 35 | 36 | # Iterate over the generator, which yields key-value pairs one at a time 37 | for image_key, base64_data in get_json_data_generator(dataset, dataset_short_name, args.num_threads): 38 | # Write a comma before the next key-value pair if needed 39 | if need_comma: 40 | f.write(", ") 41 | 42 | # Write the key-value pair as a string to the file 43 | f.write(f'"{image_key}": "{base64_data}"') 44 | 45 | # Set the flag to True so that a comma is written before the next key-value pair 46 | need_comma = True 47 | 48 | # Write the closing brace for the JSON object 49 | f.write("}") 50 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/3d.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the implementation of the SceneNavigation and SceneRef and Scene QA datasets. 
3 | """ 4 | 5 | import json 6 | import random 7 | 8 | from abstract_dataset import AbstractDataset 9 | 10 | 11 | class SceneNavigation(AbstractDataset): 12 | def __init__( 13 | self, 14 | name: str = "SceneNavigation", 15 | in_context_path: str = "prompts/scene_navigation.json", 16 | query_inputs_path: str = "annotations/scene_navigation/scan_info.json", 17 | ): 18 | super().__init__(name, in_context_path, query_inputs_path) 19 | 20 | def _load_query_inputs(self, path: str) -> list[str]: 21 | with open(path, "r") as f: 22 | json_data = json.load(f) 23 | results = [] 24 | counter = 0 25 | for scene_id, inner_dict in json_data.items(): 26 | # if counter > 7: 27 | # break 28 | descriptions = inner_dict["description"] 29 | random.shuffle(descriptions) 30 | real_descriptions = [] 31 | for cur_description in descriptions[:50]: 32 | real_descriptions.append(cur_description[1]) 33 | results.append( 34 | { 35 | "id": scene_id, 36 | "sentences": "\n".join(real_descriptions), 37 | } 38 | ) 39 | counter += 1 40 | return results 41 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/mimic-it/syphus/datasets/__init__.py -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/change.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the implementation of the SpotTheDifference and CleverChange datasets. 3 | """ 4 | 5 | import importlib 6 | import json 7 | 8 | from abstract_dataset import AbstractDataset 9 | 10 | 11 | class SpotTheDifference(AbstractDataset): 12 | def __init__( 13 | self, 14 | name: str = "SpotTheDifference", 15 | prompt_path: str = "prompts/spot_the_difference.json", 16 | query_inputs_path: str = "annotations/spot_the_difference/train.json", 17 | ): 18 | super().__init__(name, prompt_path, query_inputs_path) 19 | 20 | def _load_query_inputs(self, path: str) -> list[str]: 21 | with open(path, "r") as f: 22 | json_data = json.load(f) 23 | results = [] 24 | for inner_dict in json_data: 25 | file_id = inner_dict["img_id"] 26 | sentences = inner_dict["sentences"] 27 | results.append( 28 | { 29 | "id": file_id, 30 | "sentences": "\n".join(sentences), 31 | } 32 | ) 33 | return results 34 | 35 | 36 | class CocoSpotTheDifference(AbstractDataset): 37 | def __init__( 38 | self, 39 | name: str = "CocoSpotTheDifference", 40 | prompt_path: str = "prompts.coco_spot_the_difference_prompt", 41 | query_inputs_path: str = "annotations/coco_spot_the_difference/csd_query.json", 42 | ): 43 | super().__init__(name, prompt_path, query_inputs_path) 44 | 45 | def _load_query_inputs(self, path: str) -> list[dict[str, str]]: 46 | with open(path) as f: 47 | json_data = json.load(f) 48 | results = [] 49 | for file_id, inner_dict in json_data.items(): 50 | sentences = inner_dict["sentences"] 51 | results.append( 52 | { 53 | "id": file_id, 54 | "sentences": sentences, 55 | } 56 | ) 57 | return results 58 | 59 | def _load_prompt(self, path: str) -> dict[str, str]: 60 | prompt_file = importlib.import_module(path) 61 | return { 62 | "system_message": prompt_file.system_message, 63 | "in_context": prompt_file.in_context, 64 | } 65 | -------------------------------------------------------------------------------- /mimic-it/syphus/datasets/translate.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | 4 | from abstract_dataset import AbstractDataset 5 | 6 | 7 | class TranslationDataset(AbstractDataset): 8 | def __init__( 9 | self, 10 | name: str = "Translations", 11 | prompt_path: str = "prompts.translation_prompt", 12 | query_inputs_path: str = None, 13 | ): 14 | super().__init__(name, prompt_path, query_inputs_path) 15 | 16 | def _load_prompt(self, path: str) -> dict[str, str]: 17 | prompt_file = importlib.import_module(path) 18 | return { 19 | "system_message": prompt_file.system_message, 20 | "in_context": prompt_file.in_context, 21 | } 22 | 23 | def _load_query_inputs(self, path: str) -> list[str]: 24 | with open(path, "r") as f: 25 | json_data = json.load(f)["data"] 26 | temp = [] 27 | for file_id, i in json_data.items(): 28 | all_string = "" + i["instruction"] + "" + "" + i["answer"] + "" 29 | temp.extend([{"id": file_id, "sentences": all_string}]) 30 | return temp 31 | -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_dia.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are now my dialogue generation assistant, you have done a lot of video understanding and mastered this ability, I will give you a 'description' of the counter-intuitive video and a 'counter-intuitive reason' of why it is counter-intuitive.\nWhere 'description' is an objective description of a video, including what happens in the video and 'counter-intuitive reason' is an explanation of why the video is counter-intuitive.\nYou're going to use that information to generate a question and answer for me, using the text that I give you, start with descriptive questions like what, when, which, who, where, and then start with why, how, and then you're going to give some open questions,\nThat is to say, the question is gradual, and you should try to use the previous information later. All of the above answers are short sentences, and try not to exceed 30 words" 3 | , 4 | "in_context": [ 5 | { 6 | "role": "user", 7 | "content": "description: The barber gave the woman a haircut. After the woman cut her hair, her hair stood on end. Next to her appeared a parrot with the same hairstyle as the woman\n why counter-intuitive: Women look funny when they get their hair cut and look like parrot hair." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": [ 12 | { 13 | "question": "What is happening in the video?", 14 | "answer": "A girl is getting her hair cut by a hairdresser. After the haircut, her hair stands up and a parrot appears next to her with the same hairstyle." 15 | }, 16 | { 17 | "question": "Why is the video humorous?", 18 | "answer": "The girl's hair standing up and matching the parrot's hairstyle is unexpected and funny." 19 | }, 20 | { 21 | "question": "What this girl's mood might be like?", 22 | "answer": "She may be dissatisfied because generally speaking, a human's hair does not look like a parrot, which is not considered aesthetically pleasing." 23 | }, 24 | { 25 | "question": "Does this parrot feel humor because the girl's hair?", 26 | "answer": "Probably not, it does not feel this, probably because its intelligence can not understand the relationship between the girl's hair and its hair." 
27 | }, 28 | { 29 | "question": "Do you think the parrot intentionally matched the girl's hairstyle?", 30 | "answer": "It's unlikely that the parrot intentionally matched the girl's hairstyle. It's more likely a coincidence or the result of the parrot's natural feathers." 31 | }, 32 | { 33 | "question": "What do you think the girl will do afterwards?", 34 | "answer": "It's unlikely that the parrot intentionally matched the girl's hairstyle. It's more likely a coincidence or the result of the parrot's natural feathers." 35 | } 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_mcqa.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are now a data augmentation assistant. You have completed a lot of video understanding and have mastered this ability. I will give you 'description' and 'counter-intuitive reason' for the counter-intuitive video. You will help me to complete the understanding of this paragraph of text and generate 5 English question and answer pairs from it. The form is multiple choice. Each question has 5 choices, one of which is the correct answer and the other three are interference items.\n You need to mark the correct answer. The question can only be what, how, which or who. The answer is a word, phrase, or sentence of 15 words or less." 3 | , 4 | "in_context": [ 5 | { 6 | "role": "user", 7 | "content": "description: A white cat is lying flat inside the toilet bowl.\n counter-intuitive reason: The cat is so comfortable lying on its stomach in the toilet that it is both helpless and funny." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": [ 12 | { 13 | "Question": "What is the main subject of the video?", 14 | "Options 1": "A black cat", 15 | "Options 2": "A white cat", 16 | "Options 3": "A dog", 17 | "Options 4": "A bird", 18 | "Options 5": "A fish", 19 | "Answer": "2" 20 | }, 21 | { 22 | "Question": "How is the white cat positioned inside the toilet bowl?", 23 | "Options 1": "Standing on its hind legs", 24 | "Options 2": "Sleeping on its side", 25 | "Options 3": "Lying flat on its stomach", 26 | "Options 4": "Sitting upright", 27 | "Options 5": "Hanging upside down", 28 | "Answer": "3" 29 | }, 30 | { 31 | "Question": "Which word best describes the cat's behavior in the video?", 32 | "Options 1": "Annoyed", 33 | "Options 2": "Excited", 34 | "Options 3": "Curious", 35 | "Options 4": "Comfortable", 36 | "Options 5": "Scared", 37 | "Answer": "4" 38 | }, 39 | { 40 | "Question": "What is the cat's posture inside the toilet bowl?", 41 | "Options 1": "Upside down", 42 | "Options 2": "Leaning against the edge", 43 | "Options 3": "Sitting like a human", 44 | "Options 4": "Balanced on one leg", 45 | "Options 5": "Lying flat on its stomach", 46 | "answer": "5" 47 | }, 48 | { 49 | "Question": "Who or what finds the cat's position in the toilet bowl funny?", 50 | "Options 1": "The cat itself", 51 | "Options 2": "A person watching the video", 52 | "Options 3": "Another animal in the room", 53 | "Options 4": "Nobody finds it funny", 54 | "Options 5": "The cat\"s owner", 55 | "Answer": "2" 56 | } 57 | ] 58 | } 59 | ] 60 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/funqa_translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "Now I have two tasks for you to complete at once. Task 1. 
'CH2Eng Translation': Please translate the content into English. Task 2. 'Expansion_with_Eng': Please generate five synonyms according to the content just translated into English.", 3 | "in_context": [ 4 | { 5 | "role": "user", 6 | "content": "一只白色的猫咪整个趴在厕所蹲厕里。" 7 | }, 8 | { 9 | "role": "assistant", 10 | "content": [ 11 | { 12 | "CH2Eng Translation": "A white cat is lying flat inside the toilet bowl.", 13 | "Expansion_with_Eng_1": "A white cat resting entirely in a toilet bowl.", 14 | "Expansion_with_Eng_2": "The entire body of a white cat is sprawled across the toilet seat.", 15 | "Expansion_with_Eng_3": "The white cat is The white cat is lying flat within the toilet's squatting position.", 16 | "Expansion_with_Eng_4": "A white-coloured feline is stretched out inside the toilet bowl.", 17 | "Expansion_with_Eng_5": "The complete posture of a white cat can be seen in the toilet, lying down." 18 | } 19 | ] 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/spot_the_difference.json: -------------------------------------------------------------------------------- 1 | { 2 | "system_message": "You are an AI assistant playing the spot the difference game. You will be presented with several sentences that describe the differences between two images. The two images are frames extracted from the video-surveillance footage.\n\nYour response shall always contain a pair of question and answer. The question you generate should have definite and concrete answer according to the description. When generating answer, you should answer as if you are watching the images. You are not allowed to say \"according to descrption\" or similar words in your outputs. You do not have to provide a specific answer and you should not ask for more details.\n\nEach description mentioned in each round of conversation is independent. You should not assume that the descriptions in a new round of conversation are related to the previous context.\n\nIf there are no differences, you still have to follow the previous format.", 3 | "in_context": [ 4 | { 5 | "role": "user", 6 | "content": "the car in the upper left corner of the picture is gone\nthe two people in the upper part of the picture are gone\nthe person near the handicapped parking place is not in the second picture\nthere appears to be a figure maybe man that was near handicapped parking place straining near a pole in the bottom of the picture\nthe two people who were on th sidewalk are not in the second picture\nthere are two people in the seconde picture that are near the handicapped parking place" 7 | }, 8 | { 9 | "role": "assistant", 10 | "content": [ 11 | { 12 | "Question": "Could you summarize the differences between the two images?", 13 | "Answer": "The left image has five people and the right image has only three. The people are not the same. The car in the upper left corner of the picture is gone. The remaining elements in the two images are the same." 14 | }, 15 | { 16 | "Question": "Does the right image have more people than the left image?", 17 | "Answer": "Yes, the left image has two more people than the right image." 18 | }, 19 | { 20 | "Question": "Does the car in the upper left corner of the picture still exist in the second picture?", 21 | "Answer": "No, the car in the upper left corner of the picture is gone." 
22 | }, 23 | { 24 | "Question": "Does the right image have more cars than the left image?", 25 | "Answer": "Yes, the right image has one car while the left image has no car." 26 | }, 27 | { 28 | "Question": "Does the people in the left image the same as the people in the right image?", 29 | "Answer": "No, the people in the left image are not the same as the people in the right image." 30 | } 31 | ] 32 | }, 33 | { 34 | "role": "user", 35 | "content": "There is no difference between the two images." 36 | }, 37 | { 38 | "role": "assistant", 39 | "content": [ 40 | { 41 | "Question": "Could you summarize the differences between the two images?", 42 | "Answer": "There is no difference between the two images." 43 | } 44 | ] 45 | } 46 | ] 47 | } -------------------------------------------------------------------------------- /mimic-it/syphus/prompts/translation_prompt.py: -------------------------------------------------------------------------------- 1 | system_message = """As an AI assistant, you are an expert in translating English to natural Chinese(zh), Spanish(es), Japanese(ja), German(de), French(fr), Korean(ko), and Arabic(ar). You will provide professional translations without any grammar mistakes. Your translation should be in a tone of native speaker. The input format will be xxxxxx. You should translate the content inside the and tag. Your output should strictly follow the format of this json file:\n 2 | { 3 | "a": { 4 | "zh": "xxx", 5 | "es": "xxx", 6 | "ja": "xxx", 7 | "de": "xxx", 8 | "fr": "xxx", 9 | "ko": "xxx", 10 | "ar": "xxx" 11 | }, 12 | "b": { 13 | "zh": "xxx", 14 | "es": "xxx", 15 | "ja": "xxx", 16 | "de": "xxx", 17 | "fr": "xxx", 18 | "ko": "xxx", 19 | "ar": "xxx" 20 | } 21 | }""" 22 | in_context = [ 23 | { 24 | "role": "user", 25 | "content": "Hey, bro, I feel like having a cup of tea here!Having tea is a great choice! Right now, you have two options in front of you: green tea and oolong tea. Which one would you like to have? Of course, if you want to enhance the tea-drinking experience even more, you can dim the lights here and play some classic Chinese music on your phone!", 26 | }, 27 | { 28 | "role": "assistant", 29 | "content": """{ 30 | "a": { 31 | "zh": "嘿,兄弟,我想在这里喝口茶!", 32 | "es": "¡eh, tío, me apetece tomar una taza de café aquí!", 33 | "ja": "ねえ、ここでお茶を飲みたいな!", 34 | "de": "Hey, ich möchte hier gerne eine Tasse Tee trinken!", 35 | "fr": "Bonjour, je voudrais prendre une tasse de thé ici!", 36 | "ko": "헤이, 브로, 난 여기에 자 한 잔 바시고 싶어!", 37 | "ar": "يا أخي ، أشعر وكأنني أرغب في شرب كوب من الشاي هنا!", 38 | }, 39 | "b":{ 40 | "zh": "嗨!喝茶是个不错的选择!现在你面前有绿茶和乌龙茶,你想喝哪种呢?当然,如果你想有更好的品茶氛围的话,你可以把这里的灯光调暗,再用你的手机来点经典的中国音乐", 41 | "es": "¡Tomar té es una muy buena elección! ¿Cuál te gustaría tomar? Por supuesto, si quieres disfrutar aún más de la experiencia de tomar té, puedes atenuar las luces de aquí y reproducir música clásica china desde tu teléfono móvil.", 42 | "ja": "お茶を飲むのはいい選択だよ!今、目の前には緑茶とウーロン茶がありますが、どちらが飲みたいですか?もちろん、もっと良いお茶の時間を過ごしたいなら、ここでの照明を暗くして、あなたの携帯で中国のクラシック音楽を流すといいよ!", 43 | "de": "Tee zu trinken ist eine gute Wahl! Vor dir hast du jetzt die Wahl zwischen grünem Tee und Oolong-Tee. Welche Sorte möchtest du trinken? Natürlich, wenn du ein noch besseres Teetrinkerlebnis haben möchtest, kannst du hier das Licht abdunkeln und mit deinem Handy klassische chinesische Musik abspielen!", 44 | "fr": "Boire du thé est un excellent choix ! Maintenant, vous avez deux options devant vous, le thé vert et le thé oolong. Lequel aimeriez-vous prendre? 
Bien sûr, si vous souhaitez améliorer votre expérience de dégustation, vous pouvez baisser les lumières ici et écouter de la musique chinoise classique sur votre téléphone!", 45 | "ko": "하이, 좋은 선택이야. 지금 녹차와 우롱차가 있는데 뭘 마시고 싶어? 물론 차를 더욱 풍미있게 즐기고 싶으면 여기 조명을 어둡게 조절하고, 핸드폰으로 중국 클래식 음악을 틀어도 좋아!", 46 | "ar": "شرب الشاي هو اختيار رائع! الآن ، لديك خياران أمامك: الشاي الأخضر والشاي الأوولونغ. أيهما تفضل؟ بالطبع ، إذا أردت تعزيز تجربة شرب الشاي أكثر ، يمكنك خفت الأنوار هنا وتشغيل بعض الموسيقى الصينية الكلاسيكية على هاتفك!" 47 | } 48 | } 49 | """, 50 | }, 51 | ] 52 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: false 4 | machine_rank: 0 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: 1 8 | num_processes: 2 9 | rdzv_backend: static 10 | same_network: false 11 | tpu_use_cluster: false 12 | tpu_use_sudo: false 13 | use_cpu: false 14 | main_process_port: 20685 15 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: no 3 | downcast_bf16: true 4 | machine_rank: 0 5 | main_training_function: main 6 | mixed_precision: bf16 7 | num_machines: 1 8 | num_processes: 1 9 | rdzv_backend: static 10 | same_network: true 11 | tpu_use_cluster: false 12 | tpu_use_sudo: false 13 | use_cpu: false 14 | main_process_port: 20687 15 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 4 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero2_slurm.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | 
offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | use_cpu: false 19 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: 20333 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 8 19 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3_offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: 20333 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 8 19 | use_cpu: false -------------------------------------------------------------------------------- /pipeline/accelerate_configs/accelerate_config_zero3_slurm.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_accumulation_steps: 2 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: true 9 | zero3_save_16bit_model: true 10 | zero_stage: 3 11 | distributed_type: DEEPSPEED 12 | fsdp_config: {} 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 2 16 | num_processes: 16 17 | use_cpu: false 18 | -------------------------------------------------------------------------------- /pipeline/accelerate_configs/ds_zero3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } 
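The YAML files above are standard Hugging Face Accelerate launcher configs (DDP, ZeRO-1/2/3, CPU offload, single-node and Slurm variants), and ds_zero3_config.json is the matching raw DeepSpeed config whose "auto" fields are resolved at launch time. As a rough orientation only, here is a minimal sketch, not part of the repository, of how such files are typically consumed through `accelerate launch --config_file ...`; the training-script path is a placeholder and the helper name is invented for illustration.

```
# Hypothetical launcher sketch (not part of this repo): pass one of the configs in
# pipeline/accelerate_configs/ to `accelerate launch`, which spawns `num_processes`
# workers and configures DeepSpeed from the zero_stage / offload_* fields in the YAML.
import subprocess
from pathlib import Path

CONFIG_DIR = Path("pipeline/accelerate_configs")


def launch_training(config_name: str, train_script: str, *script_args: str) -> None:
    """Run `accelerate launch` with one of the bundled Accelerate config files."""
    cmd = [
        "accelerate",
        "launch",
        "--config_file",
        str(CONFIG_DIR / config_name),
        train_script,
        *script_args,
    ]
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    # Single-node ZeRO-3 with bf16, as declared in accelerate_config_zero3.yaml.
    # "pipeline/train/your_train_script.py" is a placeholder, not a file in the repo.
    launch_training("accelerate_config_zero3.yaml", "pipeline/train/your_train_script.py")
```

The `*_slurm` variants mainly change `num_machines`/`num_processes` and the offload settings, so the same invocation pattern applies when the job is scheduled on a cluster.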
-------------------------------------------------------------------------------- /pipeline/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | config.yaml -------------------------------------------------------------------------------- /pipeline/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/datasets/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/base_eval_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from PIL import Image 3 | from typing import Dict, List, Any 4 | 5 | import importlib 6 | 7 | AVAILABLE_EVAL_DATASETS: Dict[str, str] = { 8 | "mmbench": "MMBenchDataset", 9 | "mme": "MMEDataset", 10 | "mathvista": "MathVistaDataset", 11 | "mmvet": "MMVetDataset", 12 | "seedbench": "SEEDBenchDataset", 13 | "pope": "PopeDataset", 14 | "scienceqa": "ScienceQADataset", 15 | "magnifierbench": "MagnifierBenchDataset", 16 | } 17 | 18 | 19 | class BaseEvalDataset(ABC): 20 | def __init__(self, name: str, dataset_path: str, *, max_batch_size: int = 1): 21 | self.name = name 22 | self.dataset_path = dataset_path 23 | self.max_batch_size = max_batch_size 24 | 25 | def evaluate(self, model, **kwargs): 26 | return self._evaluate(model, **kwargs) 27 | # batch = min(model.max_batch_size, self.max_batch_size) 28 | # if batch == 1: 29 | # return self._evaluate(model, **kwargs) 30 | # else: 31 | # kwargs["batch"] = batch 32 | # return self._evaluate(model, **kwargs) 33 | 34 | @abstractmethod 35 | def _evaluate(self, model: str): 36 | pass 37 | 38 | 39 | def load_dataset(dataset_name: str, dataset_args: Dict[str, str] = {}) -> BaseEvalDataset: 40 | assert dataset_name in AVAILABLE_EVAL_DATASETS, f"{dataset_name} is not an available eval dataset." 41 | module_path = "pipeline.benchmarks.datasets." 
+ dataset_name 42 | dataset_formal_name = AVAILABLE_EVAL_DATASETS[dataset_name] 43 | imported_module = importlib.import_module(module_path) 44 | dataset_class = getattr(imported_module, dataset_formal_name) 45 | print(f"Imported class: {dataset_class}") 46 | # import pdb;pdb.set_trace() 47 | # get dataset args without "name" 48 | init_args = dataset_args.copy() 49 | init_args.pop("name") 50 | return dataset_class(**init_args) 51 | -------------------------------------------------------------------------------- /pipeline/benchmarks/datasets/seedbench.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | from .base_eval_dataset import BaseEvalDataset 4 | from datasets import load_dataset 5 | import json 6 | import os 7 | import datetime 8 | 9 | 10 | class SEEDBenchDataset(BaseEvalDataset): 11 | def __init__(self, data_path: str = "Otter-AI/SEEDBench", split="test", default_output_path="./logs", cache_dir=None): 12 | super().__init__("SEEDBenchDataset", data_path) 13 | print("Loading dataset from", data_path) 14 | self.data = load_dataset(data_path, split=split, cache_dir=cache_dir) 15 | self.default_output_path = default_output_path 16 | if not os.path.exists(default_output_path): 17 | os.makedirs(default_output_path) 18 | 19 | def _evaluate(self, model): 20 | count = 0 21 | num_correct = 0 22 | cur_datetime = datetime.datetime.now().strftime("%Y%m%d-%H%M") 23 | output_path = os.path.join(self.default_output_path, f"seedbench_{model.name}_test_submit_{cur_datetime}.json") 24 | output_f = open(output_path, "a") 25 | with tqdm(total=len(self.data), desc="Evaluating") as pbar: 26 | for data_dict in self.data: 27 | image = data_dict["image"] 28 | question = data_dict["question"] + " There are several options:" 29 | option_index = ["A", "B", "C", "D"] 30 | for cur_idx in range(4): 31 | question += f" {option_index[cur_idx]}. {data_dict[f'choice_{option_index[cur_idx].lower()}']}" 32 | 33 | answer = data_dict["answer"] 34 | options = [ 35 | data_dict["choice_a"], 36 | data_dict["choice_b"], 37 | data_dict["choice_c"], 38 | data_dict["choice_d"], 39 | ] 40 | 41 | option_losses = [] 42 | for idx, option in enumerate(options): 43 | option = option_index[idx] + ". 
" + option 44 | loss = model.eval_forward(question, option, image) 45 | option_losses.append(loss.item()) 46 | 47 | prediction_idx = np.argmin(option_losses) 48 | prediction = ["A", "B", "C", "D"][prediction_idx] 49 | if prediction == answer: 50 | num_correct += 1 51 | count += 1 52 | 53 | answer_record = {"question_id": data_dict["question_id"], "prediction": prediction} 54 | output_f.write(json.dumps(answer_record) + "\n") 55 | 56 | answer_record = {"question_id": data_dict["question_id"], "prediction": prediction} 57 | output_f.write(json.dumps(answer_record) + "\n") 58 | 59 | accuracy = num_correct / count * 100 60 | pbar.set_postfix(accuracy=f"{accuracy:.2f}") 61 | pbar.update(1) 62 | 63 | accuracy = num_correct / count * 100 64 | print(f"Accuracy: {accuracy:.2f}%") 65 | return accuracy 66 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/models/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/models/base_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from PIL import Image 3 | from typing import Dict 4 | 5 | import importlib 6 | 7 | AVAILABLE_MODELS: Dict[str, str] = { 8 | "video_chat": "VideoChat", 9 | "otter_video": "OtterVideo", 10 | "llama_adapter": "LlamaAdapter", 11 | "mplug_owl": "mPlug_owl", 12 | "video_chatgpt": "Video_ChatGPT", 13 | "otter_image": "OtterImage", 14 | "frozen_bilm": "FrozenBilm", 15 | "idefics": "Idefics", 16 | "fuyu": "Fuyu", 17 | "otterhd": "OtterHD", 18 | "instructblip": "InstructBLIP", 19 | "qwen_vl": "QwenVL", 20 | "llava_model": "LLaVA_Model", 21 | "instructblip": "InstructBLIP", 22 | "gpt4v": "OpenAIGPT4Vision", 23 | } 24 | 25 | 26 | class BaseModel(ABC): 27 | def __init__(self, model_name: str, model_path: str, *, max_batch_size: int = 1): 28 | self.name = model_name 29 | self.model_path = model_path 30 | self.max_batch_size = max_batch_size 31 | 32 | @abstractmethod 33 | def generate(self, **kwargs): 34 | pass 35 | 36 | @abstractmethod 37 | def eval_forward(self, **kwargs): 38 | pass 39 | 40 | 41 | def load_model(model_name: str, model_args: Dict[str, str]) -> BaseModel: 42 | assert model_name in AVAILABLE_MODELS, f"{model_name} is not an available model." 43 | module_path = "pipeline.benchmarks.models." 
+ model_name 44 | model_formal_name = AVAILABLE_MODELS[model_name] 45 | imported_module = importlib.import_module(module_path) 46 | model_class = getattr(imported_module, model_formal_name) 47 | print(f"Imported class: {model_class}") 48 | model_args.pop("name") 49 | return model_class(**model_args) 50 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/frozen_bilm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/models/frozen_bilm.py -------------------------------------------------------------------------------- /pipeline/benchmarks/models/fuyu.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from transformers import AutoTokenizer, FuyuImageProcessor 3 | from transformers import FuyuForCausalLM 4 | from src.otter_ai.models.fuyu.processing_fuyu import FuyuProcessor 5 | from PIL import Image 6 | from .base_model import BaseModel 7 | import torch 8 | import numpy as np 9 | import warnings 10 | import io 11 | import base64 12 | import math 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | 17 | def get_pil_image(raw_image_data) -> Image.Image: 18 | if isinstance(raw_image_data, Image.Image): 19 | return raw_image_data 20 | 21 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 22 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 23 | 24 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 25 | image_bytes = base64.b64decode(raw_image_data) 26 | return Image.open(io.BytesIO(image_bytes)) 27 | 28 | else: 29 | raise ValueError("Unsupported image data format") 30 | 31 | 32 | class Fuyu(BaseModel): 33 | def __init__(self, model_path: str = "adept/fuyu-8b", cuda_id: int = 0, resolution: int = -1, max_new_tokens=256): 34 | super().__init__("fuyu", model_path) 35 | self.resolution = resolution 36 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 37 | self.model = FuyuForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(self.device) 38 | self.tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") 39 | self.image_processor = FuyuImageProcessor() 40 | self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer) 41 | self.max_new_tokens = max_new_tokens 42 | self.bad_words_list = ["User:", "Assistant:"] 43 | self.bad_words_ids = self.tokenizer(self.bad_words_list, add_special_tokens=False).input_ids 44 | 45 | def generate(self, text_prompt: str, raw_image_data: str): 46 | raw_image_data = get_pil_image(raw_image_data) 47 | raw_image_data = raw_image_data.convert("RGB") 48 | # make sure the image is in RGB format and resize to match the width 49 | if self.resolution != -1: 50 | width, height = raw_image_data.size 51 | short_edge = min(width, height) 52 | scaling_factor = self.resolution / short_edge 53 | new_width = math.ceil(width * scaling_factor) 54 | new_height = math.ceil(height * scaling_factor) 55 | raw_image_data = raw_image_data.resize((new_width, new_height), Image.ANTIALIAS) 56 | # formated_prompt = f"User: {text_prompt} Assistant:" 57 | model_inputs = self.processor(text=text_prompt, images=[raw_image_data], device=self.device) 58 | for k, v in model_inputs.items(): 59 | model_inputs[k] = v.to(self.device) 60 | 61 | model_inputs["image_patches"] = 
model_inputs["image_patches"].to(dtype=next(self.model.parameters()).dtype) 62 | generation_output = self.model.generate(**model_inputs, max_new_tokens=self.max_new_tokens, pad_token_id=self.tokenizer.eos_token_id, bad_words_ids=self.bad_words_ids) 63 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 64 | return generation_text[0].split("\x04")[1].strip(" ").strip("\n") 65 | 66 | def eval_forward(self, **kwargs): 67 | return super().eval_forward(**kwargs) 68 | 69 | 70 | if __name__ == "__main__": 71 | model = Fuyu() 72 | print(model.generate("Generate a coco-style caption.\n", Image.open("/home/luodian/projects/Otter/archived/test_images/rabbit.png").convert("RGB"))) 73 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/gpt4v.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from .base_model import BaseModel 4 | from PIL import Image 5 | import io 6 | import time 7 | 8 | 9 | def get_pil_image(raw_image_data) -> Image.Image: 10 | if isinstance(raw_image_data, Image.Image): 11 | return raw_image_data 12 | 13 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 14 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 15 | 16 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 17 | image_bytes = base64.b64decode(raw_image_data) 18 | return Image.open(io.BytesIO(image_bytes)) 19 | 20 | else: 21 | raise ValueError("Unsupported image data format") 22 | 23 | 24 | class OpenAIGPT4Vision(BaseModel): 25 | def __init__(self, api_key: str, max_new_tokens: int = 256): 26 | super().__init__("openai-gpt4", "gpt-4-vision-preview") 27 | self.api_key = api_key 28 | self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} 29 | self.max_new_tokens = max_new_tokens 30 | 31 | @staticmethod 32 | def encode_image_to_base64(raw_image_data) -> str: 33 | if isinstance(raw_image_data, Image.Image): 34 | buffered = io.BytesIO() 35 | raw_image_data.save(buffered, format="JPEG") 36 | return base64.b64encode(buffered.getvalue()).decode("utf-8") 37 | raise ValueError("The input image data must be a PIL.Image.Image") 38 | 39 | def generate(self, text_prompt: str, raw_image_data): 40 | raw_image_data = get_pil_image(raw_image_data).convert("RGB") 41 | base64_image = self.encode_image_to_base64(raw_image_data) 42 | 43 | payload = { 44 | "model": "gpt-4-vision-preview", 45 | "messages": [ 46 | { 47 | "role": "user", 48 | "content": [ 49 | {"type": "text", "text": text_prompt}, 50 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}, 51 | ], 52 | } 53 | ], 54 | "max_tokens": self.max_new_tokens, 55 | } 56 | 57 | retry = True 58 | retry_times = 0 59 | while retry and retry_times < 5: 60 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) 61 | if response.status_code == 200: 62 | response_data = response.json() 63 | return response_data["choices"][0]["message"]["content"] 64 | else: 65 | print(f"Failed to connect to OpenAI API: {response.status_code} - {response.text}. 
Retrying...") 66 | time.sleep(10) 67 | retry_times += 1 68 | return "Failed to connect to OpenAI GPT4V API" 69 | 70 | def eval_forward(self, **kwargs): 71 | return super().eval_forward(**kwargs) 72 | 73 | 74 | if __name__ == "__main__": 75 | # Use your own API key here 76 | api_key = "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" 77 | model = OpenAIGPT4Vision(api_key) 78 | image = Image.open("/home/luodian/projects/Otter/archived/data/G4_IMG_00001.png").convert("RGB") 79 | print(model.generate("What’s in this image?", image)) 80 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/instructblip.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration 2 | from PIL import Image 3 | from .base_model import BaseModel 4 | import torch 5 | import numpy as np 6 | import warnings 7 | import io 8 | import base64 9 | 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | def get_pil_image(raw_image_data) -> Image.Image: 14 | if isinstance(raw_image_data, Image.Image): 15 | return raw_image_data 16 | 17 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 18 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 19 | 20 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 21 | image_bytes = base64.b64decode(raw_image_data) 22 | return Image.open(io.BytesIO(image_bytes)) 23 | 24 | else: 25 | raise ValueError("Unsupported image data format") 26 | 27 | 28 | class InstructBLIP(BaseModel): 29 | def __init__(self, model_path: str = "Salesforce/instructblip-vicuna-7b", cuda_id: int = 0, max_new_tokens=32): 30 | super().__init__("instructblip", model_path) 31 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 32 | self.model = InstructBlipForConditionalGeneration.from_pretrained(model_path).to(self.device) 33 | self.processor = InstructBlipProcessor.from_pretrained(model_path) 34 | self.max_new_tokens = max_new_tokens 35 | 36 | def generate(self, text_prompt: str, raw_image_data: str): 37 | raw_image_data = get_pil_image(raw_image_data) 38 | raw_image_data = raw_image_data.convert("RGB") 39 | formatted_prompt = f"{text_prompt}\nAnswer:" 40 | # Accordling to https://huggingface.co/Salesforce/instructblip-vicuna-7b . 
Seems that is is no special prompt format for instruct blip 41 | model_inputs = self.processor(images=raw_image_data, text=formatted_prompt, return_tensors="pt").to(self.device) 42 | # We follow the recommended parameter here:https://huggingface.co/Salesforce/instructblip-vicuna-7b 43 | generation_output = self.model.generate(**model_inputs, do_sample=False, max_new_tokens=self.max_new_tokens, min_length=1) 44 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 45 | return generation_text[0] 46 | 47 | def eval_forward(self, question, answer, image): 48 | raise NotImplementedError 49 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/llama_adapter.py: -------------------------------------------------------------------------------- 1 | from .LLaMA_Adapter.imagebind_LLM.ImageBind import data as data_utils 2 | from .LLaMA_Adapter.imagebind_LLM import llama 3 | 4 | from .base_model import BaseModel 5 | 6 | import os 7 | 8 | 9 | llama_dir = "/mnt/petrelfs/share_data/zhangyuanhan/llama_adapter_v2_multimodal" 10 | 11 | 12 | class LlamaAdapter(BaseModel): 13 | # checkpoint will be automatically downloaded 14 | def __init__(self, model_path: str): 15 | super().__init__("llama_adapter", model_path) 16 | self.model = llama.load(model_path, llama_dir) 17 | self.model.eval() 18 | 19 | def generate(self, input_data): 20 | inputs = {} 21 | video_dir = input_data.get("video_root", "") 22 | image = data_utils.load_and_transform_video_data([input_data["video_path"]], device="cuda") 23 | inputs["Image"] = [image, 1] 24 | 25 | object_description = input_data["object_description"] 26 | if object_description != "None": 27 | context = f"Given context:{object_description}. " 28 | else: 29 | context = "" 30 | prompts_input = context + input_data["question"] 31 | 32 | results = self.model.generate(inputs, [llama.format_prompt(prompts_input)], max_gen_len=256) 33 | result = results[0].strip() 34 | return result 35 | 36 | 37 | if __name__ == "__main__": 38 | model = LlamaAdapter("", "") 39 | data = { 40 | "video_idx": "03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4", 41 | "question": "What is in this image?", 42 | } 43 | print(model.generate(data)) 44 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/llava_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torchvision.transforms as T 4 | from torchvision.io import read_video 5 | 6 | from .base_model import BaseModel 7 | from .llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 8 | from .llava.conversation import conv_templates, SeparatorStyle 9 | from .llava.model.builder import load_pretrained_model 10 | from .llava.utils import disable_torch_init 11 | from .llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 12 | 13 | default_model_path = "liuhaotian/llava-v1.5-7b" 14 | 15 | 16 | class LLaVA_Model(BaseModel): 17 | def __init__( 18 | self, 19 | model_path: str = default_model_path, 20 | model_base: str = None, 21 | model_name: str = "llava-v1.5", 22 | conv_mode: str = "llava_v1", 23 | ): 24 | super().__init__(model_name, model_path) 25 | init_model_name = get_model_name_from_path(model_path) 26 | self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(model_path, model_base, init_model_name) 
27 | self.conv_mode = conv_mode 28 | 29 | def generate(self, text_prompt: str, raw_image_data: str): 30 | if self.model.config.mm_use_im_start_end: 31 | prompts_input = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + text_prompt 32 | else: 33 | prompts_input = DEFAULT_IMAGE_TOKEN + "\n" + text_prompt 34 | 35 | input_data = self.image_processor.preprocess(raw_image_data, return_tensors="pt")["pixel_values"][0] 36 | 37 | conv = conv_templates[self.conv_mode].copy() 38 | conv.append_message(conv.roles[0], prompts_input) 39 | conv.append_message(conv.roles[1], None) 40 | prompt = conv.get_prompt() 41 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() 42 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 43 | keywords = [stop_str] 44 | stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) 45 | 46 | with torch.inference_mode(): 47 | output_ids = self.model.generate( 48 | input_ids, 49 | images=input_data.unsqueeze(0).half().cuda(), 50 | do_sample=True, 51 | temperature=0.2, 52 | top_p=None, 53 | num_beams=1, 54 | # no_repeat_ngram_size=3, 55 | max_new_tokens=512, 56 | use_cache=True, 57 | ) 58 | 59 | input_token_len = input_ids.shape[1] 60 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 61 | if n_diff_input_output > 0: 62 | print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids") 63 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 64 | outputs = outputs.strip() 65 | if outputs.endswith(stop_str): 66 | outputs = outputs[: -len(stop_str)] 67 | outputs = outputs.strip() 68 | 69 | return outputs 70 | 71 | def eval_forward(self, text_prompt: str, raw_image_data: str): 72 | pass 73 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/mplug_owl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from transformers import AutoTokenizer 5 | from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration 6 | from mplug_owl_video.processing_mplug_owl import ( 7 | MplugOwlImageProcessor, 8 | MplugOwlProcessor, 9 | ) 10 | 11 | from .base_model import BaseModel 12 | 13 | pretrained_ckpt = "MAGAer13/mplug-owl-llama-7b-video" 14 | 15 | 16 | class mPlug_owl(BaseModel): 17 | def __init__(self, model_path: str): 18 | super().__init__("mplug_owl", model_path) 19 | self.model = MplugOwlForConditionalGeneration.from_pretrained( 20 | pretrained_ckpt, 21 | torch_dtype=torch.bfloat16, 22 | ) 23 | self.image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt) 24 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt) 25 | self.processor = MplugOwlProcessor(self.image_processor, self.tokenizer) 26 | self.model.eval() 27 | 28 | def format_prompt(self, question): 29 | prompts = [f" <|video|> Question : {question} Answer : "] 30 | return prompts 31 | 32 | def generate(self, input_data: dict): 33 | questions = input_data["question"] 34 | video_dir = input_data.get("video_root", "") 35 | video_list = input_data["video_path"] 36 | generate_kwargs = {"do_sample": True, "top_k": 5, "max_length": 512} 37 | 38 | object_description = input_data["object_description"] 39 | if object_description != "None": 40 | context = f"Given context:{object_description}. 
" 41 | else: 42 | context = "" 43 | prompts_input = context + input_data["question"] 44 | 45 | prompt = self.format_prompt(prompts_input) 46 | inputs = self.processor(text=prompt, videos=video_list, num_frames=4, return_tensors="pt") 47 | inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()} 48 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 49 | with torch.no_grad(): 50 | res = self.model.generate(**inputs, **generate_kwargs) 51 | sentence = self.tokenizer.decode(res.tolist()[0], skip_special_tokens=True) 52 | return sentence 53 | 54 | 55 | if __name__ == "__main__": 56 | model = mPlug_owl("") 57 | device = torch.device("cuda") 58 | model.model = model.model.to(device) 59 | data = { 60 | "video_idx": ["./data_source/multi_hop_reasoning/03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4"], 61 | "question": "What is in this image?", 62 | } 63 | print(model.generate(data)) 64 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/otterhd.py: -------------------------------------------------------------------------------- 1 | from transformers import FuyuForCausalLM, AutoTokenizer, FuyuImageProcessor, FuyuProcessor 2 | from PIL import Image 3 | from .base_model import BaseModel 4 | import torch 5 | import numpy as np 6 | import warnings 7 | import io 8 | import base64 9 | 10 | warnings.filterwarnings("ignore") 11 | 12 | 13 | def get_pil_image(raw_image_data) -> Image.Image: 14 | if isinstance(raw_image_data, Image.Image): 15 | return raw_image_data 16 | 17 | elif isinstance(raw_image_data, dict) and "bytes" in raw_image_data: 18 | return Image.open(io.BytesIO(raw_image_data["bytes"])) 19 | 20 | elif isinstance(raw_image_data, str): # Assuming this is a base64 encoded string 21 | image_bytes = base64.b64decode(raw_image_data) 22 | return Image.open(io.BytesIO(image_bytes)) 23 | 24 | else: 25 | raise ValueError("Unsupported image data format") 26 | 27 | 28 | import math 29 | 30 | 31 | class OtterHD(BaseModel): 32 | def __init__(self, model_path: str = "Otter-AI/OtterHD-8B", cuda_id: int = 0, resolution: int = -1, max_new_tokens=256): 33 | super().__init__("otterhd", model_path) 34 | self.resolution = resolution 35 | self.device = f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu" 36 | self.model = FuyuForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=self.device) 37 | self.model.eval() 38 | self.tokenizer = AutoTokenizer.from_pretrained(model_path) 39 | self.image_processor = FuyuImageProcessor() 40 | self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer) 41 | self.max_new_tokens = max_new_tokens 42 | 43 | def generate(self, text_prompt: str, raw_image_data: str): 44 | raw_image_data = get_pil_image(raw_image_data) 45 | # make sure the image is in RGB format and resize to match the width 46 | raw_image_data = raw_image_data.convert("RGB") 47 | if self.resolution != -1: 48 | width, height = raw_image_data.size 49 | short_edge = min(width, height) 50 | scaling_factor = self.resolution / short_edge 51 | new_width = math.ceil(width * scaling_factor) 52 | new_height = math.ceil(height * scaling_factor) 53 | raw_image_data = raw_image_data.resize((new_width, new_height), Image.ANTIALIAS) 54 | 55 | formated_prompt = f"User: {text_prompt} Assistant:" 56 | model_inputs = self.processor(text=formated_prompt, images=[raw_image_data], device=self.device) 57 | for k, v in model_inputs.items(): 58 | model_inputs[k] = v.to(self.device, 
non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v] 59 | 60 | model_inputs["image_patches"][0] = model_inputs["image_patches"][0].to(dtype=next(self.model.parameters()).dtype) 61 | generation_output = self.model.generate(**model_inputs, max_new_tokens=self.max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) 62 | generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True) 63 | response = generation_text[0].split("\x04")[1].strip(" ").strip("\n") 64 | return response 65 | 66 | def eval_forward(self, text_prompt: str, image_path: str): 67 | # Similar to the Idefics' eval_forward but adapted for Fuyu 68 | pass 69 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from transformers.generation import GenerationConfig 6 | 7 | from .base_model import BaseModel 8 | 9 | default_path = "Qwen/Qwen-VL-Chat" 10 | 11 | 12 | class QwenVL(BaseModel): 13 | def __init__(self, model_name: str = "qwen_vl", model_path: str = default_path): 14 | super().__init__(model_name, model_path) 15 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 17 | self.model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 18 | self.temp_dir = ".log/temp" 19 | if not os.path.exists(self.temp_dir): 20 | os.makedirs(self.temp_dir) 21 | 22 | def generate(self, text_prompt: str, raw_image_data: str): 23 | image_path = os.path.join(self.temp_dir, "temp.jpg") 24 | raw_image_data.save(image_path) 25 | query = [] 26 | query.append({"image": image_path}) 27 | query.append({"text": text_prompt}) 28 | query = self.tokenizer.from_list_format(query) 29 | response, history = self.model.chat(self.tokenizer, query=query, history=None) 30 | return response 31 | 32 | def eval_forward(self, text_prompt: str, image_path: str): 33 | # Similar to the Idefics' eval_forward but adapted for QwenVL 34 | pass 35 | -------------------------------------------------------------------------------- /pipeline/benchmarks/models/video_chatgpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .video_chatgpt.eval.model_utils import load_video, initialize_model 3 | from .video_chatgpt.inference import video_chatgpt_infer 4 | 5 | from .base_model import BaseModel 6 | 7 | model_name = "/mnt/lustre/yhzhang/kaichen/video_ChatGPT/LLaVA-Lightening-7B-v1-1." 8 | projection_path = "/mnt/lustre/yhzhang/kaichen/video_ChatGPT/video_chatgpt-7B.bin" 9 | 10 | 11 | class Video_ChatGPT(BaseModel): 12 | def __init__(self, model_path: str): 13 | super().__init__("video_chatgpt", model_path) 14 | ( 15 | self.model, 16 | self.vision_tower, 17 | self.tokenizer, 18 | self.image_processor, 19 | self.video_token_len, 20 | ) = initialize_model(model_name, projection_path) 21 | 22 | def generate(self, input_data: dict): 23 | video_dir = input_data.get("video_root", "") 24 | video_frames = load_video(input_data["video_path"]) 25 | 26 | object_description = input_data["object_description"] 27 | if object_description != "None": 28 | context = f"Given context:{object_description}. 
" 29 | else: 30 | context = "" 31 | prompts_input = context + input_data["question"] 32 | 33 | output = video_chatgpt_infer( 34 | video_frames, prompts_input, conv_mode="video-chatgpt_v1", model=self.model, vision_tower=self.vision_tower, tokenizer=self.tokenizer, image_processor=self.image_processor, video_token_len=self.video_token_len 35 | ) 36 | return output 37 | 38 | 39 | if __name__ == "__main__": 40 | model = Video_ChatGPT("") 41 | device = torch.device("cuda") 42 | model.model = model.model.to(device) 43 | model.vision_tower = model.vision_tower.to(device) 44 | data = { 45 | "video_idx": "./data_source/multi_hop_reasoning/03f2ed96-1719-427d-acf4-8bf504f1d66d.mp4", 46 | "question": "What is in this image?", 47 | } 48 | print(model.generate(data)) 49 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/README.md: -------------------------------------------------------------------------------- 1 | # OpenFlamingo Evaluation Suite 2 | 3 | This is the evaluation module of OpenFlamingo. It contains a set of utilities for evaluating multimodal models on various benchmarking datasets. 4 | 5 | *This module is a work in progress! We will be updating this README as it develops. In the meantime, if you notice an issue, please file a Bug Report or Feature Request [here](https://github.com/mlfoundations/open_flamingo/issues/new/choose).* 6 | 7 | ## Supported datasets 8 | 9 | |Dataset|Task|Metric|Evaluation method| 10 | |-------|----|------|-----------------| 11 | |[COCO](https://arxiv.org/abs/1405.0312)|Captioning|CIDEr|Generation| 12 | |[Flickr-30K](https://aclanthology.org/Q14-1006/)|Captioning|CIDEr|Generation| 13 | |[VQAv2](https://arxiv.org/abs/1612.00837v3)|VQA|VQA accuracy|Generation| 14 | |[OK-VQA](https://arxiv.org/abs/1906.00067)|VQA|VQA accuracy|Generation| 15 | |[TextVQA](https://arxiv.org/abs/1904.08920)|VQA|VQA accuracy|Generation| 16 | |[VizWiz](https://arxiv.org/abs/1802.08218)|VQA|VQA accuracy|Generation| 17 | |[Hateful Memes](https://arxiv.org/abs/2005.04790)|Classification|ROC AUC|Logprobs| 18 | |[ImageNet](https://arxiv.org/abs/1409.0575)|Classification|Top-1 accuracy|Logprobs| 19 | 20 | When evaluating a model using `num_shots` shots, we sample the exemplars from the training split. Performance is evaluated on a disjoint test split, subsampled to `--num_samples` examples (or using the full test split if `--num_samples=-1`). 21 | 22 | Warning: we have found that classification evaluation does not work as well as expected for MPT models (e.g. OpenFlamingo-3B, OpenFlamingo-9B) and we are not sure why yet. We will update this README when we have more information. 23 | 24 | ## Sample scripts 25 | Our codebase uses DistributedDataParallel to parallelize evaluation by default, so please make sure to set the `MASTER_ADDR` and `MASTER_PORT` environment variables or use `torchrun`. We provide a sample Slurm evaluation script in `open_flamingo/open_flamingo/scripts/run_eval.sh`. 26 | 27 | We also support evaluating at a lower precision using the `--precision` flag. We find minimal difference between evaluating at full precision vs. amp_bf16. 
28 | 29 | To evaluate one of our pretrained checkpoints, we suggest first downloading a local copy of the weights, as follows: 30 | 31 | ``` 32 | # grab model checkpoint from huggingface hub 33 | from huggingface_hub import hf_hub_download 34 | HF_TOKEN="" 35 | 36 | checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt") 37 | checkpoint_path= hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", 38 | "checkpoint.pt", 39 | local_dir="openflamingo/OpenFlamingo-3B-vitl-mpt1b", 40 | cache_dir="openflamingo/OpenFlamingo-3B-vitl-mpt1b", 41 | local_dir_use_symlinks=False, 42 | token=HF_TOKEN) 43 | print(checkpoint_path) 44 | ## openflamingo/OpenFlamingo-3B-vitl-mpt1b/checkpoint.pt 45 | ``` 46 | 47 | This should place the OpenFlamingo model at the expected location in the evaluation script. 48 | 49 | For TextVQA and VizWiz we expect annotations to be formatted differently than the original datasets. We provide the custom annotations in `open_flamingo/open_flamingo/eval/data/`. -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/public_datasets_suite/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/coco_metric.py: -------------------------------------------------------------------------------- 1 | from pycocoevalcap.eval import COCOEvalCap 2 | from pycocotools.coco import COCO 3 | 4 | 5 | def compute_cider( 6 | result_path, 7 | annotations_path, 8 | ): 9 | # create coco object and coco_result object 10 | coco = COCO(annotations_path) 11 | coco_result = coco.loadRes(result_path) 12 | 13 | # create coco_eval object by taking coco and coco_result 14 | coco_eval = COCOEvalCap(coco, coco_result) 15 | coco_eval.params["image_id"] = coco_result.getImgIds() 16 | coco_eval.evaluate() 17 | 18 | return coco_eval.eval 19 | 20 | 21 | def postprocess_captioning_generation(predictions): 22 | return predictions.split("Output", 1)[0] 23 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: otter 3 | path: /data/bli/checkpoints/OTTER-Image-MPT7B 4 | checkpoint: /data/bli/checkpoints/OTTER-Image-MPT7B/final_weights.pt 5 | device_map: auto 6 | precision: fp32 7 | batch_size: 8 8 | 9 | dataset: 10 | coco: 11 | test: true 12 | train_image: /path/to/mscoco_karpathy/train2014 13 | val_image: /path/to/mscoco_karpathy/val2014 14 | karpathy: /path/to/mscoco_karpathy/dataset_coco.json 15 | annotations: /path/to/mscoco_karpathy/annotations/captions_val2014.json 16 | vqav2: 17 | test: false 18 | train_image: /path/to/vqav2/train2014 19 | train_annotations: /path/to/vqav2/v2_mscoco_train2014_annotations.json 20 | train_questions: /path/to/vqav2/v2_OpenEnded_mscoco_train2014_questions.json 21 | test_image: /path/to/vqav2/val2014 22 | test_annotations: /path/to/vqav2/v2_mscoco_val2014_annotations.json 23 | test_questions: /path/to/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 24 | flickr: 25 | test: false 26 | image: /path/to/flickr30k/flickr30k-images 27 | karpathy: /path/to/flickr30k/dataset_flickr30k.json 28 | annotations: 
/path/to/flickr30k/dataset_flickr30k_coco_style.json 29 | ok_vqa: 30 | test: false 31 | train_image: /path/to/okvqa/train2014 32 | train_annotations: /path/to/okvqa/mscoco_train2014_annotations.json 33 | train_questions: /path/to/okvqa/OpenEnded_mscoco_train2014_questions.json 34 | test_image: /path/to/okvqa/val2014 35 | test_annotations: /path/to/okvqa/mscoco_val2014_annotations.json 36 | test_questions: /path/to/okvqa/OpenEnded_mscoco_val2014_questions.json 37 | textvqa: 38 | test: false 39 | train_image: /path/to/textvqa/train_images 40 | train_annotations: /path/to/textvqa/train_annotations_vqa_format.json 41 | train_questions: /path/to/textvqa/train_questions_vqa_format.json 42 | test_image: /path/to/textvqa/val_images 43 | test_annotations: /path/to/textvqa/val_annotations_vqa_format.json 44 | test_questions: /path/to/textvqa/val_questions_vqa_format.json 45 | vizwiz: 46 | test: false 47 | train_image: /path/to/vizwiz/train 48 | train_annotations: /path/to/vizwiz/train_annotations_vqa_format.json 49 | train_questions: /path/to/vizwiz/train_questions_vqa_format.json 50 | test_image: /path/to/vizwiz/val 51 | test_annotations: /path/to/vizwiz/val_annotations_vqa_format.json 52 | test_questions: /path/to/vizwiz/val_questions_vqa_format.json 53 | hateful_memes: 54 | test: false 55 | image: /path/to/hateful_memes/img 56 | train_annotations: /path/to/hateful_memes/train.json 57 | test_annotations: /path/to/hateful_memes/dev.json 58 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/eval_model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import argparse 3 | from typing import List 4 | from torch.nn.parallel import DistributedDataParallel as DDP 5 | from PIL import Image 6 | 7 | 8 | class BaseEvalModel(abc.ABC): 9 | """Base class encapsulating functionality needed to evaluate a model.""" 10 | 11 | def __init__(self, args: List[str]): 12 | """Initialize model. 13 | 14 | Args: 15 | args: arguments to model. These should be parsed, or if the model 16 | has no applicable arguments, an error should be thrown if `args` 17 | is non-empty. 18 | """ 19 | 20 | def init_distributed(self): 21 | """Wrap model as DDP.""" 22 | self.model = DDP(self.model, device_ids=[self.device]) 23 | 24 | def set_device(self, device): 25 | """Set device for model.""" 26 | self.device = device 27 | self.model = self.model.to(device) 28 | 29 | def get_outputs( 30 | self, 31 | batch_text: List[str], 32 | batch_images: List[List[Image.Image]], 33 | min_generation_length: int, 34 | max_generation_length: int, 35 | num_beams: int, 36 | length_penalty: float, 37 | ) -> List[str]: 38 | """Get outputs for a batch of images and text. 39 | 40 | Args: 41 | batch_text: list of text strings, with the text "" in place 42 | of any images to be included. 43 | batch_images: images to provide to model. Should be a list of lists, 44 | where each list contains the images for a single example. 45 | max_generation_length: maximum length of the generated caption. 46 | Defaults to 10. 47 | num_beams: number of beams to use for beam search. Defaults to 3. 48 | length_penalty: length penalty for beam search. Defaults to -2.0. 49 | 50 | Returns: 51 | List of decoded output strings. 52 | """ 53 | 54 | def vqa_prompt(self, question, answer=None) -> str: 55 | """Get the prompt to use for VQA evaluation. If the answer is not provided, it should be left blank to be generated by the model. 
56 | 57 | Returns: 58 | The prompt to use for VQA. 59 | """ 60 | 61 | def caption_prompt(self, caption=None) -> str: 62 | """Get the prompt to use for caption evaluation. If the caption is not provided, it should be left blank to be generated by the model. 63 | 64 | Returns: 65 | The prompt to use for captioning. 66 | """ 67 | 68 | def classification_prompt(self, class_str=None) -> str: 69 | """Get the prompt to use for classification evaluation. If the class_str is not provided, it should be left blank to be generated by the model. 70 | 71 | Returns: 72 | The prompt to use for classification. 73 | """ 74 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/get_args.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "file_name = \"./run_eval_coco.sh\"\n", 19 | "args = []\n", 20 | "with open(file_name, \"r\") as f:\n", 21 | " lines = f.readlines()\n", 22 | " for line in lines:\n", 23 | " line = line.strip()\n", 24 | " if line.endswith(\"\\\\\"):\n", 25 | " line = line[:-1].strip()\n", 26 | " if line.startswith(\"--\"):\n", 27 | " args.append(line)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "print(json.dumps(args, indent=4))" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.10.12" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/benchmarks/public_datasets_suite/models/__init__.py -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/blip.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from PIL import Image 4 | import torch 5 | 6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration 7 | from pipeline.benchmarks.public_datasets_suite.eval_model import BaseEvalModel 8 | from pipeline.benchmarks.public_datasets_suite.models.utils import unwrap_model 9 | 10 | 11 | class EvalModel(BaseEvalModel): 12 | """BLIP-2 model evaluation. 13 | 14 | Attributes: 15 | model (nn.Module): Underlying Torch model. 16 | tokenizer (transformers.PreTrainedTokenizer): Tokenizer for model. 
17 | device: Index of GPU to use, or the string "cpu" 18 | """ 19 | 20 | def __init__(self, model_args): 21 | assert "processor_path" in model_args and "lm_path" in model_args and "device" in model_args, "BLIP-2 requires processor_path, lm_path, and device arguments to be specified" 22 | 23 | self.device = int(model_args["device"]) if ("device" in model_args and model_args["device"] >= 0) else "cpu" 24 | self.processor = Blip2Processor.from_pretrained(model_args["processor_path"]) 25 | self.model = Blip2ForConditionalGeneration.from_pretrained(model_args["lm_path"]) 26 | self.model.to(self.device) 27 | self.model.eval() 28 | self.processor.tokenizer.padding_side = "left" 29 | 30 | def _prepare_images(self, batch: List[List[torch.Tensor]]) -> torch.Tensor: 31 | """Preprocess images and stack them. 32 | 33 | Args: 34 | batch: A list of lists of images. 35 | 36 | Returns: 37 | A Tensor of shape 38 | (batch_size, channels, height, width). 39 | """ 40 | batch_images = None 41 | assert all(len(example) == 1 for example in batch), "BLIP-2 only supports one image per example" 42 | 43 | for example in batch: 44 | assert len(example) == 1, "BLIP-2 only supports one image per example" 45 | batch_images = torch.cat( 46 | [ 47 | batch_images, 48 | self.processor.image_processor(example, return_tensors="pt")["pixel_values"], 49 | ] 50 | if batch_images is not None 51 | else [self.processor.image_processor(example, return_tensors="pt")["pixel_values"]], 52 | dim=0, 53 | ) 54 | return batch_images 55 | 56 | def get_outputs( 57 | self, 58 | batch_text: List[str], 59 | batch_images: List[List[Image.Image]], 60 | max_generation_length: int, 61 | num_beams: int, 62 | length_penalty: float, 63 | ) -> List[str]: 64 | encodings = self.processor.tokenizer( 65 | batch_text, 66 | padding="longest", 67 | truncation=True, 68 | return_tensors="pt", 69 | max_length=2000, 70 | ) 71 | input_ids = encodings["input_ids"] 72 | attention_mask = encodings["attention_mask"] 73 | 74 | with torch.inference_mode(): 75 | outputs = unwrap_model(self.model).generate( 76 | self._prepare_images(batch_images).to(self.device), 77 | input_ids.to(self.device), 78 | attention_mask=attention_mask.to(self.device), 79 | max_new_tokens=max_generation_length, 80 | min_new_tokens=8, 81 | num_beams=num_beams, 82 | length_penalty=length_penalty, 83 | ) 84 | 85 | return self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True) 86 | 87 | def get_vqa_prompt(self, question, answer=None) -> str: 88 | return f"Question:{question} Short answer:{answer if answer is not None else ''}" 89 | 90 | def get_caption_prompt(self, caption=None) -> str: 91 | return f"A photo of {caption if caption is not None else ''}" 92 | 93 | def get_classification_prompt(self, class_str=None) -> str: 94 | raise NotImplementedError 95 | -------------------------------------------------------------------------------- /pipeline/benchmarks/public_datasets_suite/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def unwrap_model(model): 5 | """ 6 | Unwrap a model from a DataParallel or DistributedDataParallel wrapper. 
7 | """ 8 | if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): 9 | return model.module 10 | else: 11 | return model 12 | -------------------------------------------------------------------------------- /pipeline/demos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/pipeline/demos/__init__.py -------------------------------------------------------------------------------- /pipeline/demos/demo_utils.py: -------------------------------------------------------------------------------- 1 | import mimetypes 2 | import sys 3 | from typing import Union 4 | 5 | import requests 6 | from PIL import Image 7 | 8 | requests.packages.urllib3.disable_warnings() 9 | 10 | 11 | # --- Utility Functions --- 12 | def print_colored(text, color_code): 13 | end_code = "\033[0m" # Reset to default color 14 | print(f"{color_code}{text}{end_code}") 15 | 16 | 17 | def get_content_type(file_path): 18 | content_type, _ = mimetypes.guess_type(file_path) 19 | return content_type 20 | 21 | 22 | def get_image(url: str) -> Union[Image.Image, list]: 23 | if not url.strip(): # Blank input, return a blank Image 24 | return Image.new("RGB", (224, 224)) # Assuming 224x224 is the default size for the model. Adjust if needed. 25 | elif "://" not in url: # Local file 26 | content_type = get_content_type(url) 27 | else: # Remote URL 28 | content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type") 29 | 30 | if "image" in content_type: 31 | if "://" not in url: # Local file 32 | return Image.open(url) 33 | else: # Remote URL 34 | return Image.open(requests.get(url, stream=True, verify=False).raw) 35 | else: 36 | raise ValueError("Invalid content type. Expected image.") 37 | -------------------------------------------------------------------------------- /pipeline/demos/inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import sys 5 | 6 | import requests 7 | import yaml 8 | 9 | from .demo_models import TestIdefics, TestOtter, TestOtterHD 10 | from .demo_utils import get_image, print_colored 11 | 12 | requests.packages.urllib3.disable_warnings() 13 | 14 | import pytz 15 | 16 | # Initialize the time zone 17 | utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc. 
18 | # Get the current time in UTC 19 | utc_now = pytz.utc.localize(datetime.datetime.utcnow()) 20 | # Convert to UTC+8 21 | utc_plus_8_time = utc_now.astimezone(utc_plus_8) 22 | 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--model_name", type=str, default="otter", required=True, help="The model name.") 27 | parser.add_argument("--checkpoint", type=str, help="The path to the checkpoint.") 28 | parser.add_argument("--output_dir", type=str, help="The dir path to the output file.", default="./logs") 29 | parser.add_argument("--yaml_file", type=str, help="The dir path to the eval yaml, contains question, answer pairs.", default="") 30 | args = parser.parse_args() 31 | return args 32 | 33 | 34 | def eval_yaml(args, yaml_file, model): 35 | with open(yaml_file, "r") as file: 36 | test_data_list = yaml.safe_load(file) 37 | 38 | cur_date = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S") 39 | log_json_path = f"{args.output_dir}/inference_log_{cur_date}.json" 40 | log_json = { 41 | "model_name": args.model_name, 42 | "checkpoint": args.checkpoint, 43 | "results": {}, 44 | } 45 | for test_id, test_data in enumerate(test_data_list): 46 | image_path = test_data.get("image_path", "") 47 | question = test_data.get("question", "") 48 | 49 | image = get_image(image_path) 50 | no_image_flag = not bool(image_path) 51 | 52 | response = model.generate(prompt=question, image=image, no_image_flag=no_image_flag) 53 | 54 | # Print results to console 55 | print(f"image_path: {image_path}") 56 | print_colored(f"question: {question}", color_code="\033[92m") 57 | print_colored(f"answer: {response}", color_code="\033[94m") 58 | print("-" * 150) 59 | 60 | log_json["results"].update( 61 | { 62 | str(test_id).zfill(3): { 63 | "image_path": image_path, 64 | "question": question, 65 | "answer": response, 66 | } 67 | } 68 | ) 69 | 70 | with open(log_json_path, "w") as file: 71 | json.dump(log_json, file, indent=4, sort_keys=False) 72 | 73 | 74 | def main(): 75 | args = parse_args() 76 | if args.model_name == "otter": 77 | model = TestOtter(checkpoint=args.checkpoint) 78 | elif args.model_name == "otterhd": 79 | model = TestOtterHD(checkpoint=args.checkpoint) 80 | elif args.model_name == "idefics": 81 | model = TestIdefics(checkpoint=args.checkpoint) 82 | else: 83 | raise NotImplementedError(f"model_name: {args.model_name} is not implemented.") 84 | 85 | if args.yaml_file: 86 | eval_yaml(args, args.yaml_file, model) 87 | else: 88 | while True: 89 | yaml_file = input("Enter the path to the yaml file: (or 'q' to quit): ") 90 | if yaml_file == "q": 91 | break 92 | eval_yaml(args, yaml_file, model) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /pipeline/serve/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/serve/deploy/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import os 4 | import sys 5 | 6 | handler = None 7 | 8 | 9 | def build_logger(logger_name, logger_dir): 10 | global handler 11 | 12 | formatter = logging.Formatter( 13 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 14 | datefmt="%Y-%m-%d %H:%M:%S", 15 | ) 16 | 17 | # Set the format of root handlers 18 | if not logging.getLogger().handlers: 19 | 
logging.basicConfig(level=logging.INFO) 20 | logging.getLogger().handlers[0].setFormatter(formatter) 21 | 22 | # Redirect stdout and stderr to loggers 23 | stdout_logger = logging.getLogger("stdout") 24 | stdout_logger.setLevel(logging.INFO) 25 | sl = StreamToLogger(stdout_logger, logging.INFO) 26 | sys.stdout = sl 27 | 28 | stderr_logger = logging.getLogger("stderr") 29 | stderr_logger.setLevel(logging.ERROR) 30 | sl = StreamToLogger(stderr_logger, logging.ERROR) 31 | sys.stderr = sl 32 | 33 | # Get logger 34 | logger = logging.getLogger(logger_name) 35 | logger.setLevel(logging.INFO) 36 | 37 | # Add a file handler for all loggers 38 | if handler is None: 39 | os.makedirs(logger_dir, exist_ok=True) 40 | filename = os.path.join(logger_dir, logger_name + ".log") 41 | handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True) 42 | handler.setFormatter(formatter) 43 | 44 | for name, item in logging.root.manager.loggerDict.items(): 45 | if isinstance(item, logging.Logger): 46 | item.addHandler(handler) 47 | 48 | return logger 49 | 50 | 51 | class StreamToLogger(object): 52 | """ 53 | Fake file-like stream object that redirects writes to a logger instance. 54 | """ 55 | 56 | def __init__(self, logger, log_level=logging.INFO): 57 | self.terminal = sys.stdout 58 | self.logger = logger 59 | self.log_level = log_level 60 | self.linebuf = "" 61 | 62 | def __getattr__(self, attr): 63 | return getattr(self.terminal, attr) 64 | 65 | def write(self, buf): 66 | temp_linebuf = self.linebuf + buf 67 | self.linebuf = "" 68 | for line in temp_linebuf.splitlines(True): 69 | # From the io.TextIOWrapper docs: 70 | # On output, if newline is None, any '\n' characters written 71 | # are translated to the system default line separator. 72 | # By default sys.stdout.write() expects '\n' newlines and then 73 | # translates them so this is still cross platform. 
74 | if line[-1] == "\n": 75 | self.logger.log(self.log_level, line.rstrip()) 76 | else: 77 | self.linebuf += line 78 | 79 | def flush(self): 80 | if self.linebuf != "": 81 | self.logger.log(self.log_level, self.linebuf.rstrip()) 82 | self.linebuf = "" 83 | -------------------------------------------------------------------------------- /pipeline/serve/gradio_css.py: -------------------------------------------------------------------------------- 1 | code_highlight_css = """ 2 | #chatbot .hll { background-color: #ffffcc } 3 | #chatbot .c { color: #408080; font-style: italic } 4 | #chatbot .err { border: 1px solid #FF0000 } 5 | #chatbot .k { color: #008000; font-weight: bold } 6 | #chatbot .o { color: #666666 } 7 | #chatbot .ch { color: #408080; font-style: italic } 8 | #chatbot .cm { color: #408080; font-style: italic } 9 | #chatbot .cp { color: #BC7A00 } 10 | #chatbot .cpf { color: #408080; font-style: italic } 11 | #chatbot .c1 { color: #408080; font-style: italic } 12 | #chatbot .cs { color: #408080; font-style: italic } 13 | #chatbot .gd { color: #A00000 } 14 | #chatbot .ge { font-style: italic } 15 | #chatbot .gr { color: #FF0000 } 16 | #chatbot .gh { color: #000080; font-weight: bold } 17 | #chatbot .gi { color: #00A000 } 18 | #chatbot .go { color: #888888 } 19 | #chatbot .gp { color: #000080; font-weight: bold } 20 | #chatbot .gs { font-weight: bold } 21 | #chatbot .gu { color: #800080; font-weight: bold } 22 | #chatbot .gt { color: #0044DD } 23 | #chatbot .kc { color: #008000; font-weight: bold } 24 | #chatbot .kd { color: #008000; font-weight: bold } 25 | #chatbot .kn { color: #008000; font-weight: bold } 26 | #chatbot .kp { color: #008000 } 27 | #chatbot .kr { color: #008000; font-weight: bold } 28 | #chatbot .kt { color: #B00040 } 29 | #chatbot .m { color: #666666 } 30 | #chatbot .s { color: #BA2121 } 31 | #chatbot .na { color: #7D9029 } 32 | #chatbot .nb { color: #008000 } 33 | #chatbot .nc { color: #0000FF; font-weight: bold } 34 | #chatbot .no { color: #880000 } 35 | #chatbot .nd { color: #AA22FF } 36 | #chatbot .ni { color: #999999; font-weight: bold } 37 | #chatbot .ne { color: #D2413A; font-weight: bold } 38 | #chatbot .nf { color: #0000FF } 39 | #chatbot .nl { color: #A0A000 } 40 | #chatbot .nn { color: #0000FF; font-weight: bold } 41 | #chatbot .nt { color: #008000; font-weight: bold } 42 | #chatbot .nv { color: #19177C } 43 | #chatbot .ow { color: #AA22FF; font-weight: bold } 44 | #chatbot .w { color: #bbbbbb } 45 | #chatbot .mb { color: #666666 } 46 | #chatbot .mf { color: #666666 } 47 | #chatbot .mh { color: #666666 } 48 | #chatbot .mi { color: #666666 } 49 | #chatbot .mo { color: #666666 } 50 | #chatbot .sa { color: #BA2121 } 51 | #chatbot .sb { color: #BA2121 } 52 | #chatbot .sc { color: #BA2121 } 53 | #chatbot .dl { color: #BA2121 } 54 | #chatbot .sd { color: #BA2121; font-style: italic } 55 | #chatbot .s2 { color: #BA2121 } 56 | #chatbot .se { color: #BB6622; font-weight: bold } 57 | #chatbot .sh { color: #BA2121 } 58 | #chatbot .si { color: #BB6688; font-weight: bold } 59 | #chatbot .sx { color: #008000 } 60 | #chatbot .sr { color: #BB6688 } 61 | #chatbot .s1 { color: #BA2121 } 62 | #chatbot .ss { color: #19177C } 63 | #chatbot .bp { color: #008000 } 64 | #chatbot .fm { color: #0000FF } 65 | #chatbot .vc { color: #19177C } 66 | #chatbot .vg { color: #19177C } 67 | #chatbot .vi { color: #19177C } 68 | #chatbot .vm { color: #19177C } 69 | #chatbot .il { color: #666666 } 70 | """ 71 | # .highlight { background: #f8f8f8; } 72 | 
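The code_highlight_css string above is intended to be injected into the Gradio app's stylesheet so that Pygments-style token classes rendered inside the #chatbot element get colored. A minimal usage sketch (illustrative only, not a file in the repository; assumes a standard gradio Blocks app):

import gradio as gr
from pipeline.serve.gradio_css import code_highlight_css

# The elem_id must match the "#chatbot" selectors defined in the CSS above.
with gr.Blocks(css=code_highlight_css) as demo:
    chatbot = gr.Chatbot(elem_id="chatbot")

demo.launch()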
-------------------------------------------------------------------------------- /pipeline/serve/multiplex_script/otter_image_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | # Commands to run 4 | commands = ["command1", "command2", "command3"] 5 | 6 | # Iterate over commands and execute them 7 | for command in commands: 8 | # Launch tmux with each command; this will keep the command running 9 | subprocess.Popen(["tmux", "new-session", "-d", command]) 10 | -------------------------------------------------------------------------------- /pipeline/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /pipeline/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from pipeline.serve.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name}) 21 | worker_addr = ret.json()["address"] 22 | print(f"worker_addr: {worker_addr}") 23 | 24 | if worker_addr == "": 25 | return 26 | 27 | conv = default_conversation.copy() 28 | conv.append_message(conv.roles[0], args.message) 29 | prompt = conv.get_prompt() 30 | 31 | headers = {"User-Agent": "Otter Client"} 32 | pload = { 33 | "model": args.model_name, 34 | "prompt": prompt, 35 | "max_new_tokens": args.max_new_tokens, 36 | "temperature": 0.7, 37 | "stop": conv.sep, 38 | } 39 | response = requests.post( 40 | worker_addr + "/worker_generate_stream", 41 | headers=headers, 42 | json=pload, 43 | stream=True, 44 | ) 45 | 46 | print(prompt.replace(conv.sep, "\n"), end="") 47 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 48 | if chunk: 49 | data = json.loads(chunk.decode("utf-8")) 50 | output = data["text"].split(conv.sep)[-1] 51 | print(output, end="\r") 52 | print("") 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--controller_address", type=str, default="http://localhost:21001") 58 | parser.add_argument("--worker_address", type=str) 59 | parser.add_argument("--model_name", type=str, default="facebook/opt-350m") 60 | 
parser.add_argument("--max_new_tokens", type=int, default=32) 61 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.") 62 | args = parser.parse_args() 63 | 64 | main() 65 | -------------------------------------------------------------------------------- /pipeline/train/.gitignore: -------------------------------------------------------------------------------- 1 | config.yaml -------------------------------------------------------------------------------- /pipeline/train/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/train/config.yaml: -------------------------------------------------------------------------------- 1 | mimicit_vt_path: 2 | - /data/pufanyi/training_data/SD/SD_instructions.json 3 | - /data/pufanyi/training_data/CGD/CGD_instructions.json 4 | 5 | images_vt_path: 6 | - /data/pufanyi/training_data/SD/SD.json 7 | - /data/pufanyi/training_data/CGD/CGD.json 8 | -------------------------------------------------------------------------------- /pipeline/train/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def is_global_master(args): 6 | return args.rank == 0 7 | 8 | 9 | def is_local_master(args): 10 | return args.local_rank == 0 11 | 12 | 13 | def is_master(args, local=False): 14 | return is_local_master(args) if local else is_global_master(args) 15 | 16 | 17 | def is_using_distributed(): 18 | if "WORLD_SIZE" in os.environ: 19 | return int(os.environ["WORLD_SIZE"]) > 1 20 | if "SLURM_NTASKS" in os.environ: 21 | return int(os.environ["SLURM_NTASKS"]) > 1 22 | return False 23 | 24 | 25 | def world_info_from_env(): 26 | local_rank = 0 27 | for v in ( 28 | "LOCAL_RANK", 29 | "MPI_LOCALRANKID", 30 | "SLURM_LOCALID", 31 | "OMPI_COMM_WORLD_LOCAL_RANK", 32 | ): 33 | if v in os.environ: 34 | local_rank = int(os.environ[v]) 35 | break 36 | global_rank = 0 37 | for v in ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"): 38 | if v in os.environ: 39 | global_rank = int(os.environ[v]) 40 | break 41 | world_size = 1 42 | for v in ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"): 43 | if v in os.environ: 44 | world_size = int(os.environ[v]) 45 | break 46 | return local_rank, global_rank, world_size 47 | 48 | 49 | def init_distributed_device(args): 50 | # Distributed training = training on more than one GPU. 51 | # Works in both single and multi-node scenarios. 
52 | args.distributed = False 53 | args.world_size = 1 54 | args.rank = 0 # global rank 55 | args.local_rank = 0 56 | if is_using_distributed(): 57 | if "SLURM_PROCID" in os.environ: 58 | # DDP via SLURM 59 | args.local_rank, args.rank, args.world_size = world_info_from_env() 60 | # SLURM var -> torch.distributed vars in case needed 61 | os.environ["LOCAL_RANK"] = str(args.local_rank) 62 | os.environ["RANK"] = str(args.rank) 63 | os.environ["WORLD_SIZE"] = str(args.world_size) 64 | torch.distributed.init_process_group( 65 | backend=args.dist_backend, 66 | init_method=args.dist_url, 67 | world_size=args.world_size, 68 | rank=args.rank, 69 | ) 70 | else: 71 | # DDP via torchrun, torch.distributed.launch 72 | args.local_rank, _, _ = world_info_from_env() 73 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url) 74 | args.world_size = torch.distributed.get_world_size() 75 | args.rank = torch.distributed.get_rank() 76 | args.distributed = True 77 | else: 78 | # needed to run on single gpu 79 | torch.distributed.init_process_group( 80 | backend=args.dist_backend, 81 | init_method=args.dist_url, 82 | world_size=1, 83 | rank=0, 84 | ) 85 | 86 | if torch.cuda.is_available(): 87 | if args.distributed and not args.no_set_device_rank: 88 | device = "cuda:%d" % args.local_rank 89 | else: 90 | device = "cuda:0" 91 | torch.cuda.set_device(device) 92 | else: 93 | device = "cpu" 94 | args.device = device 95 | device = torch.device(device) 96 | return device 97 | -------------------------------------------------------------------------------- /pipeline/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pipeline/utils/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | 11 | 12 | def apply_delta(base_model_path, target_model_path, delta_path): 13 | print("Loading base model") 14 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 15 | 16 | print("Loading delta") 17 | delta = AutoModelForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 19 | 20 | print("Applying delta") 21 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 22 | if name not in base.state_dict(): 23 | assert name in [ 24 | "model.mm_projector.weight", 25 | "model.mm_projector.bias", 26 | ], f"{name} not in base model" 27 | continue 28 | if param.data.shape == base.state_dict()[name].shape: 29 | param.data += base.state_dict()[name] 30 | else: 31 | assert name in [ 32 | "model.embed_tokens.weight", 33 | "lm_head.weight", 34 | ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 35 | bparam = base.state_dict()[name] 36 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 37 | 38 | print("Saving target model") 39 | delta.save_pretrained(target_model_path) 40 | delta_tokenizer.save_pretrained(target_model_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | 
parser.add_argument("--base-model-path", type=str, required=True) 46 | parser.add_argument("--target-model-path", type=str, required=True) 47 | parser.add_argument("--delta-path", type=str, required=True) 48 | 49 | args = parser.parse_args() 50 | 51 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 52 | -------------------------------------------------------------------------------- /pipeline/utils/convert_mmc4_to_wds.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | import json 4 | import os 5 | import tarfile 6 | import uuid 7 | import sys 8 | import braceexpand 9 | import webdataset as wds 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument("--output_dir", type=str) 13 | arg_parser.add_argument( 14 | "--image_shards", 15 | type=str, 16 | help="Pass in a list of shards in the format path_to_shard/shard_{0..23098}_images_v2.tar", 17 | ) 18 | arg_parser.add_argument( 19 | "--doc_shards", 20 | type=str, 21 | help="Pass in a list of shards in the format path_to_shard/docs_shard_{0..23098}_v2.jsonl", 22 | ) 23 | args = arg_parser.parse_args() 24 | 25 | from tqdm import tqdm 26 | 27 | 28 | def main(args, start_number=0): 29 | os.makedirs(args.output_dir, exist_ok=True) 30 | 31 | doc_shards = list(braceexpand.braceexpand(args.doc_shards)) 32 | image_shards = list(braceexpand.braceexpand(args.image_shards)) 33 | 34 | assert len(doc_shards) == len(image_shards), "Each doc shard must have a corresponding image shard" 35 | with wds.ShardWriter(args.output_dir + f"/%09d.tar", maxcount=30000, maxsize=1e10) as sink: 36 | for idx in tqdm(range(start_number, len(doc_shards)), desc="Converting shards"): 37 | try: 38 | image_tar = tarfile.open(image_shards[idx]) 39 | except Exception as e: 40 | print(e) 41 | continue 42 | 43 | # Read the JSONL file 44 | try: 45 | with open(doc_shards[idx], "r") as json_file: 46 | for sample_data in json_file: 47 | # get image names from json 48 | sample_data = json.loads(sample_data) 49 | image_info = sample_data["image_info"] 50 | image_names = [image["image_name"] for image in image_info] 51 | 52 | # Add each image to the tar file 53 | for img_idx, image_name in enumerate(image_names): 54 | image = image_tar.extractfile(f"{image_tar.getnames()[0]}/{image_name}") 55 | 56 | # convert to base64 57 | image_bytes = image.read() 58 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") 59 | sample_data["image_info"][img_idx]["image_base64"] = image_base64 60 | 61 | key_str = uuid.uuid4().hex 62 | sink.write({"__key__": key_str, "json": sample_data}) 63 | except Exception as e: 64 | print(e) 65 | image_tar.close() 66 | continue 67 | 68 | image_tar.close() 69 | 70 | 71 | if __name__ == "__main__": 72 | main(args=args) 73 | -------------------------------------------------------------------------------- /pipeline/utils/general.py: -------------------------------------------------------------------------------- 1 | class DualOutput: 2 | def __init__(self, file, stdout): 3 | self.file = file 4 | self.stdout = stdout 5 | 6 | def write(self, data): 7 | self.file.write(data) 8 | self.stdout.write(data) 9 | 10 | def flush(self): 11 | self.file.flush() 12 | self.stdout.flush() 13 | -------------------------------------------------------------------------------- /pipeline/utils/make_a_train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import orjson 4 | import argparse 5 | 
from tqdm import tqdm 6 | 7 | 8 | def main(input_file, output_file): 9 | # Load the JSON file 10 | with open(input_file, "rb") as file: 11 | data = orjson.loads(file.read()) 12 | 13 | # Create a set to store seen keys 14 | seen_keys = set() 15 | 16 | # Create a new dictionary with the keys from the original JSON and rel_ins_ids as values 17 | new_dict = {} 18 | for key, value in tqdm(data["data"].items()): 19 | if key not in seen_keys: 20 | try: 21 | # Check if rel_ins_ids are in the original JSON 22 | if args.remove_rel_ins_ids: 23 | valid_rel_ins_ids = [] 24 | else: 25 | valid_rel_ins_ids = [rel_ins_id for rel_ins_id in value["rel_ins_ids"] if rel_ins_id in data["data"]] 26 | 27 | # Add the valid rel_ins_ids to the new_dict 28 | new_dict[key] = valid_rel_ins_ids 29 | seen_keys.update(valid_rel_ins_ids) 30 | except Exception as e: 31 | print("Error with key %s and value %s" % (key, value)) 32 | 33 | # Write the new dictionary to a new JSON file 34 | with open(output_file, "wb") as file: 35 | file.write(orjson.dumps(new_dict)) 36 | 37 | 38 | if __name__ == "__main__": 39 | # Parse command-line arguments 40 | parser = argparse.ArgumentParser(description="Process a JSON file.") 41 | parser.add_argument("--input_file", type=str, help="Path to the input JSON file") 42 | parser.add_argument("--output_file", type=str, help="Path to the output JSON file") 43 | parser.add_argument( 44 | "--remove_rel_ins_ids", 45 | action="store_true", 46 | help="Remove rel_ins_ids from the output JSON file", 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | # Run the main function with the provided arguments 52 | main(args.input_file, args.output_file) 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 240 3 | 4 | [build-system] 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | prerun: mark a test as a prerun check. 
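The prerun marker registered in pytest.ini above is meant to tag lightweight environment checks that can be selected with pytest -m prerun (or excluded with -m "not prerun"). An illustrative test using the marker (hypothetical example, not a file in the repository):

import pytest

@pytest.mark.prerun
def test_core_dependency_importable():
    # Illustrative assumption: transformers is pinned in requirements.txt,
    # so importing it is a cheap pre-run sanity check.
    import transformers
    assert transformers.__version__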
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.19.0 2 | braceexpand>=0.1.7 3 | einops>=0.6.1 4 | einops_exts>=0.0.4 5 | fastapi>=0.95.2 6 | gradio>=3.33.1 7 | huggingface_hub>=0.13.3 8 | importlib_metadata>=6.6.0 9 | inflection>=0.5.1 10 | markdown2>=2.4.8 11 | more_itertools>=9.1.0 12 | nltk>=3.8.1 13 | numpy>=1.23.5 14 | open_clip_torch>=2.16.0 15 | openai>=1.1.1 16 | opencv_python_headless>=4.5.5.64 17 | Pillow>=9.5.0 18 | pycocoevalcap>=1 19 | pycocotools>=2.0.6 20 | Requests>=2.31.0 21 | scipy>=1.10.1 22 | timm>=0.9.2 23 | tqdm>=4.65.0 24 | transformers==4.35.1 25 | uvicorn>=0.22.0 26 | webdataset>=0.2.48 27 | natsort>=8.4.0 28 | peft>=0.4.0 29 | ijson>=3.2.3 30 | yajl>=0.3.5 31 | deepspeed>=0.10.0 32 | wandb>=0.15.8 33 | trl>=0.5.0 34 | cffi>=1.15.1 35 | pyyaml>=6.0.1 36 | pytest>=7.4.2 37 | prettytable>=3.9.0 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import json 2 | from setuptools import setup, find_packages 3 | 4 | 5 | with open("requirements.txt") as f: 6 | requirements = f.read().splitlines() 7 | 8 | setup( 9 | name="otter-ai", 10 | version="0.0.0-alpha-7", 11 | packages=find_packages(where="src"), 12 | package_dir={"": "src"}, 13 | install_requires=requirements, 14 | # package_data={ 15 | # "otter": ["resources/**/*"], 16 | # }, 17 | # include_package_data=True, 18 | author="Otter Team", 19 | author_email="drluodian@gmail.com", 20 | description="Otter: A Multi-Modal Model with In-Context Instruction Tuning", 21 | long_description=open("README.md").read(), 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/Luodian/Otter", 24 | classifiers=[ 25 | "Development Status :: 3 - Alpha", 26 | "Intended Audience :: Developers", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.9", 29 | "Programming Language :: Python :: 3.10", 30 | "Programming Language :: Python :: 3.11", 31 | ], 32 | # entry_points={ 33 | # "console_scripts": [ 34 | # # "syphus = syphus.cli.syphus_cli:main", 35 | # ], 36 | # }, 37 | ) 38 | -------------------------------------------------------------------------------- /shared_scripts/Demo_Data.yaml: -------------------------------------------------------------------------------- 1 | IMAGE_TEXT: # Group name should be in [IMAGE_TEXT, TEXT_ONLY, IMAGE_TEXT_IN_CONTEXT] 2 | LADD: # LLaVA Detailed Description, dataset name can be assigned at any name you want 3 | mimicit_path: azure_storage/json/LA/LADD_instructions.json # Path of the instruction json file 4 | images_path: azure_storage/Parquets/LA.parquet # Path of the image parquet file 5 | num_samples: -1 # Number of samples you want to use, -1 means use all samples, if not set, default is -1. 
6 | M3IT_CAPTIONING: 7 | mimicit_path: azure_storage/json/M3IT/captioning/coco/coco_instructions.json 8 | images_path: azure_storage/Parquets/coco.parquet 9 | num_samples: 20000 10 | LACR_T2T: 11 | mimicit_path: azure_storage/json/LA/LACR_T2T_instructions.json 12 | images_path: azure_storage/Parquets/LA.parquet 13 | num_samples: -1 14 | # M3IT_VQA: 15 | # mimicit_path: azure_storage/json/M3IT/vqa/vqav2/vqav2_instructions.json 16 | # images_path: azure_storage/json/M3IT/vqa/vqav2/vqav2.json 17 | # num_samples: 20000 18 | M3IT_COCOGOI: 19 | mimicit_path: azure_storage/json/M3IT/classification/coco-goi/coco-goi_instructions.json 20 | images_path: azure_storage/Parquets/coco-goi.parquet 21 | num_samples: 20000 22 | M3IT_COCOITM: 23 | mimicit_path: azure_storage/json/M3IT/classification/coco-itm/coco-itm_instructions.json 24 | images_path: azure_storage/Parquets/coco-itm.parquet 25 | num_samples: 20000 26 | M3IT_IMAGENET: 27 | mimicit_path: azure_storage/json/M3IT/classification/imagenet/imagenet_instructions.json 28 | images_path: azure_storage/Parquets/imagenet.parquet 29 | num_samples: 20000 30 | # # M3IT_IQA: 31 | # # mimicit_path: azure_storage/json/M3IT/classification/iqa/iqa_instructions.json 32 | # # images_path: azure_storage/json/M3IT/classification/iqa/iqa.json 33 | # # num_samples: 20000 34 | M3IT_REFCOCO: 35 | mimicit_path: azure_storage/json/M3IT/classification/refcoco/refcoco_instructions.json 36 | images_path: azure_storage/Parquets/refcoco.parquet 37 | num_samples: 20000 38 | # M3IT_VSR: 39 | # mimicit_path: azure_storage/json/M3IT/classification/vsr/vsr_instructions.json 40 | # images_path: azure_storage/json/M3IT/classification/vsr/vsr.json 41 | # num_samples: 20000 42 | M3IT_TEXT_VQA: 43 | mimicit_path: azure_storage/json/M3IT/vqa/text-vqa/text-vqa_instructions.json 44 | images_path: azure_storage/Parquets/text-vqa.parquet 45 | num_samples: 20000 46 | M3IT_OKVQA: 47 | mimicit_path: azure_storage/json/M3IT/vqa/okvqa/okvqa_instructions.json 48 | images_path: azure_storage/Parquets/okvqa.parquet 49 | num_samples: 20000 50 | M3IT_A_OKVQA: 51 | mimicit_path: azure_storage/json/M3IT/vqa/a-okvqa/a-okvqa_instructions.json 52 | images_path: azure_storage/Parquets/a-okvqa.parquet 53 | num_samples: 20000 54 | M3IT_SIENCEQA: 55 | mimicit_path: azure_storage/json/M3IT/reasoning/scienceqa/scienceqa_instructions.json 56 | images_path: azure_storage/Parquets/scienceqa.parquet 57 | num_samples: 20000 58 | # SVIT: 59 | # mimicit_path: azure_storage/json/SVIT/SVIT_instructions.json 60 | # images_path: azure_storage/json/SVIT/SVIT.json 61 | # num_samples: 20000 62 | # PF: 63 | # mimicit_path: azure_storage/json/PF/PF_instructions.json 64 | # images_path: azure_storage/json/PF/PF.json 65 | # num_samples: 20000 66 | 67 | # TEXT_ONLY: 68 | # LIMA: 69 | # mimicit_path: azure_storage/json/LANG_Only/LIMA/LIMA_instructions_max_1K_tokens.json 70 | # num_samples: 20000 71 | # SHAREGPT: 72 | # mimicit_path: azure_storage/json/LANG_Only/SHAREGPT/SHAREGPT_instructions_max_1K_tokens.json 73 | # num_samples: 10000 74 | # AL: 75 | # mimicit_path: azure_storage/json/LANG_Only/AL/AL_instructions_max_1K_tokens.json 76 | # num_samples: 20000 77 | 78 | 79 | -------------------------------------------------------------------------------- /shared_scripts/Demo_OtterHD.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/of/Otter 3 | 4 | export PYTHONPATH=. 
5 | 6 | # sent to sub script 7 | export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 8 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 9 | export MASTER_PORT=12955 10 | export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) 11 | export NCCL_NET=IB 12 | 13 | echo HOSTNAMES = $HOSTNAMES 14 | echo hostname = $(hostname) 15 | echo MASTER_ADDR= $MASTER_ADDR 16 | echo MASTER_PORT= $MASTER_PORT 17 | 18 | GPU=$((${COUNT_NODE} * 8)) 19 | WORKERS=$((${COUNT_NODE} * 8)) 20 | 21 | if [ $WORKERS -gt 112 ]; then 22 | WORKERS=112 23 | fi 24 | 25 | RUN_NAME="RunNamePlaceHolder" 26 | 27 | echo GPU=${GPU} 28 | echo COUNT_NODE=$COUNT_NODE 29 | echo WORKERS=8 30 | echo "Running ${RUN_NAME}" 31 | 32 | H=$(hostname) 33 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]") 34 | export THEID=$THEID 35 | echo $THEID 36 | 37 | pkill python 38 | 39 | 40 | accelerate launch --config_file=./pipeline/accelerate_configs/accelerate_config_zero2.yaml \ 41 | --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 42 | --num_machines=${COUNT_NODE} --num_processes=${GPU} \ 43 | pipeline/train/instruction_following.py \ 44 | --pretrained_model_name_or_path=adept/fuyu-8b \ 45 | --training_data_yaml=./Demo_Data.yaml \ 46 | --model_name=fuyu \ 47 | --instruction_format=fuyu \ 48 | --batch_size=8 \ 49 | --gradient_accumulation_steps=2 \ 50 | --num_epochs=3 \ 51 | --report_to_wandb \ 52 | --wandb_entity=libo0013 \ 53 | --external_save_dir=./checkpoints \ 54 | --run_name=${RUN_NAME} \ 55 | --wandb_project=Fuyu \ 56 | --workers=${WORKERS} \ 57 | --lr_scheduler=cosine \ 58 | --learning_rate=1e-5 \ 59 | --warmup_steps_ratio=0.03 \ 60 | --save_hf_model \ 61 | --max_seq_len=1024 \ 62 | --logging_steps=1000 \ 63 | --keep_symbols \ 64 | --save_ckpt_each_epoch \ 65 | --dynamic_resolution \ 66 | --with_task_description 67 | -------------------------------------------------------------------------------- /shared_scripts/Demo_OtterMPT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /root/of/Otter 3 | 4 | export PYTHONPATH=. 
5 | 6 | # sent to sub script 7 | export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 8 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 9 | export MASTER_PORT=12955 10 | export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) 11 | export NCCL_NET=IB 12 | 13 | echo HOSTNAMES = $HOSTNAMES 14 | echo hostname = $(hostname) 15 | echo MASTER_ADDR= $MASTER_ADDR 16 | echo MASTER_PORT= $MASTER_PORT 17 | 18 | GPU=$((${COUNT_NODE} * 8)) 19 | WORKERS=$((${COUNT_NODE} * 8)) 20 | 21 | if [ $WORKERS -gt 112 ]; then 22 | WORKERS=112 23 | fi 24 | 25 | RUN_NAME="RunNamePlaceHolder" 26 | 27 | echo GPU=${GPU} 28 | echo COUNT_NODE=$COUNT_NODE 29 | echo WORKERS=8 30 | echo "Running ${RUN_NAME}" 31 | 32 | H=$(hostname) 33 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]") 34 | export THEID=$THEID 35 | echo $THEID 36 | 37 | pkill python 38 | 39 | 40 | # --customized_config=./shared_scripts/Otter_MPT7B_Train_Decoder.json 41 | accelerate launch --config_file=./pipeline/accelerate_configs/accelerate_config_zero2.yaml \ 42 | --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 43 | --num_machines=${COUNT_NODE} --num_processes=${GPU} \ 44 | pipeline/train/instruction_following.py \ 45 | --pretrained_model_name_or_path=adept/fuyu-8b \ 46 | --training_data_yaml=./Demo_Data.yaml \ 47 | --model_name=otter \ 48 | --instruction_format=simple \ 49 | --batch_size=8 \ 50 | --gradient_accumulation_steps=2 \ 51 | --num_epochs=3 \ 52 | --report_to_wandb \ 53 | --wandb_entity=libo0013 \ 54 | --external_save_dir=./checkpoints \ 55 | --run_name=${RUN_NAME} \ 56 | --wandb_project=Fuyu \ 57 | --workers=${WORKERS} \ 58 | --lr_scheduler=cosine \ 59 | --learning_rate=1e-5 \ 60 | --warmup_steps_ratio=0.03 \ 61 | --save_hf_model \ 62 | --max_seq_len=2048 \ 63 | --logging_steps=1000 \ 64 | --keep_symbols \ 65 | --save_ckpt_each_epoch \ 66 | --with_task_description 67 | -------------------------------------------------------------------------------- /src/otter_ai/__init__.py: -------------------------------------------------------------------------------- 1 | from . import models 2 | from .models.otter.modeling_otter import OtterForConditionalGeneration 3 | from .models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 4 | -------------------------------------------------------------------------------- /src/otter_ai/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .falcon.modelling_RW import RWForCausalLM 2 | from .mpt.modeling_mpt import MPTForCausalLM 3 | from .mpt_redpajama.mosaic_gpt import MosaicGPT 4 | -------------------------------------------------------------------------------- /src/otter_ai/models/falcon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/falcon/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/falcon/configuration_RW.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Bloom configuration""" 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | class RWConfig(PretrainedConfig): 24 | model_type = "RefinedWebModel" 25 | keys_to_ignore_at_inference = ["past_key_values"] 26 | attribute_map = { 27 | "num_hidden_layers": "n_layer", 28 | "num_attention_heads": "n_head", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | apply_residual_connection_post_layernorm=False, 43 | hidden_dropout=0.0, 44 | attention_dropout=0.0, 45 | multi_query=False, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | **kwargs, 50 | ): 51 | self.vocab_size = vocab_size 52 | # Backward compatibility with n_embed kwarg 53 | n_embed = kwargs.pop("n_embed", None) 54 | self.hidden_size = hidden_size if n_embed is None else n_embed 55 | self.n_layer = n_layer 56 | self.n_head = n_head 57 | self.layer_norm_epsilon = layer_norm_epsilon 58 | self.initializer_range = initializer_range 59 | self.use_cache = use_cache 60 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.alibi = alibi 68 | self.bias = bias 69 | self.parallel_attn = parallel_attn 70 | 71 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 72 | 73 | @property 74 | def head_dim(self): 75 | return self.hidden_size // self.n_head 76 | 77 | @property 78 | def rotary(self): 79 | return not self.alibi 80 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from transformers.utils import ( 4 | OptionalDependencyNotAvailable, 5 | _LazyModule, 6 | is_torch_available, 7 | ) 8 | 9 | 10 | _import_structure = { 11 | "configuration_flamingo": [ 12 | "FlamingoConfig", 13 | ], 14 | } 15 | 16 | try: 17 | if not is_torch_available(): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | pass 21 | else: 22 | _import_structure["modeling_flamingo"] = [ 23 | "FlamingoModel", 24 | "FlamingoPreTrainedModel", 25 | "FlamingoForConditionalGeneration", 26 | ] 27 | 28 | if TYPE_CHECKING: 29 | from .configuration_flamingo import FlamingoConfig 30 | 31 | # from .processing_flamingo import FlamingoProcessor 32 | 33 | try: 34 | if not is_torch_available(): 35 | raise OptionalDependencyNotAvailable() 36 | except 
OptionalDependencyNotAvailable: 37 | pass 38 | else: 39 | from .modeling_flamingo import ( 40 | FlamingoForConditionalGeneration, 41 | FlamingoModel, 42 | FlamingoPreTrainedModel, 43 | ) 44 | 45 | else: 46 | import sys 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 49 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "flamingo", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_bf16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from .configuration_flamingo import FlamingoConfig 7 | from .modeling_flamingo import FlamingoForConditionalGeneration 8 | 9 | parser = argparse.ArgumentParser(description="Load model with precision") 10 | parser.add_argument("--load_bit", type=str, choices=["fp16", "bf16"], required=True, help="Choose either 'fp16' or 'bf16'") 11 | parser.add_argument("--pretrained_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True) 12 | parser.add_argument("--saved_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True) 13 | args = parser.parse_args() 14 | 15 | load_bit = args.load_bit 16 | pretrained_model_path = args.pretrained_model_path 17 | 18 | if load_bit == "fp16": 19 | precision = {"torch_dtype": torch.float16} 20 | elif load_bit == "bf16": 21 | precision = {"torch_dtype": torch.bfloat16} 22 | 23 | root_dir = os.environ["AZP"] 24 | print(root_dir) 25 | device_id = "cpu" 26 | model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision) 27 | 28 | # save model to same folder 29 | checkpoint_path = pretrained_model_path + f"-{load_bit}" 30 | model.save_pretrained(checkpoint_path, max_shard_size="10GB") 31 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import re 5 | import argparse 6 | import os 7 | 8 | import torch 9 | import torch.nn as nn 10 | from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer 11 | 12 | import sys 13 | from modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | from configuration_flamingo import FlamingoConfig 16 | 17 | 18 | @torch.no_grad() 19 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 20 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 21 | if old_ckpt.get("model_state_dict", None) is not None: 22 | old_ckpt = old_ckpt["model_state_dict"] 23 | new_ckpt = old_ckpt 24 | folder_path = os.path.dirname(old_ckpt_path) 25 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json" 26 | model = FlamingoForConditionalGeneration.from_pretrained( 27 | args.pretrained_model_path, 28 | device_map="auto", 29 | ) 30 | _ = model.load_state_dict(new_ckpt, strict=False) 31 | print(f"Saving HF model to {new_folder_path}") 32 | model.save_pretrained(new_folder_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument( 38 | "--old_ckpt_path", 39 | "-old", 40 | type=str, 41 | required=True, 42 | help="Path to the pt checkpoint", 43 | ) 44 | parser.add_argument( 45 | "--new_hf_path", 46 | "-new", 47 | type=str, 48 | required=True, 49 | help="Path to the hf folder", 50 | ) 51 | parser.add_argument( 52 | "--pretrained_model_path", 53 | "-pretrained", 54 | type=str, 55 | required=True, 56 | help="Path to the pretrained model folder", 57 | ) 58 | args = parser.parse_args() 59 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 60 | os.makedirs(os.path.dirname(args.new_hf_path)) 61 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 62 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/converting_flamingo_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_flamingo import FlamingoForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | load_bit = "bf16" 37 | if load_bit == "fp16": 38 | precision = {"torch_dtype": torch.float16} 39 | elif load_bit == "bf16": 40 | precision = {"torch_dtype": torch.bfloat16} 41 | 42 | # Load the model 43 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 44 | 45 | # adding lora 46 | standard_modules = ["q_proj", "v_proj"] 47 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 48 | 
model_to_lora_modules = { 49 | "llama": standard_modules, 50 | "opt": standard_modules, 51 | "gptj": standard_modules, 52 | "gpt_neox": ["query_key_value"], 53 | "mpt": ["Wqkv"], 54 | } 55 | lora_config = LoraConfig( 56 | r=16, 57 | lora_alpha=32, 58 | lora_dropout=0.05, 59 | task_type=TaskType.CAUSAL_LM, 60 | target_modules=model_to_lora_modules[lang_encoder_short_name], 61 | ) 62 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 63 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 64 | model.lang_encoder.print_trainable_parameters() 65 | 66 | # Save the model 67 | checkpoint_path = args.save_path 68 | FlamingoForConditionalGeneration.save_pretrained(model, checkpoint_path) 69 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/flamingo-falcon-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "FlamingoModel" 5 | ], 6 | "cross_attn_every_n_layers": 4, 7 | "model_type": "flamingo", 8 | "text_config": { 9 | "architectures": [ 10 | "RWForCausalLM" 11 | ], 12 | "apply_residual_connection_post_layernorm": false, 13 | "attention_dropout": 0.0, 14 | "bias": false, 15 | "bos_token_id": 11, 16 | "eos_token_id": 11, 17 | "hidden_dropout": 0.0, 18 | "hidden_size": 4544, 19 | "initializer_range": 0.02, 20 | "layer_norm_epsilon": 1e-05, 21 | "model_type": "RefinedWebModel", 22 | "multi_query": true, 23 | "n_head": 71, 24 | "n_layer": 32, 25 | "parallel_attn": true, 26 | "torch_dtype": "bfloat16", 27 | "transformers_version": "4.27.4", 28 | "use_cache": true, 29 | "vocab_size": 65024 30 | }, 31 | "tie_word_embeddings": false, 32 | "torch_dtype": "float32", 33 | "transformers_version": null, 34 | "use_media_placement_augmentation": true, 35 | "vision_config": { 36 | "_name_or_path": "openai/clip-vit-large-patch14", 37 | "add_cross_attention": false, 38 | "architectures": null, 39 | "attention_dropout": 0.0, 40 | "bad_words_ids": null, 41 | "begin_suppress_tokens": null, 42 | "bos_token_id": null, 43 | "chunk_size_feed_forward": 0, 44 | "cross_attention_hidden_size": null, 45 | "decoder_start_token_id": null, 46 | "diversity_penalty": 0.0, 47 | "do_sample": false, 48 | "early_stopping": false, 49 | "encoder_no_repeat_ngram_size": 0, 50 | "eos_token_id": null, 51 | "exponential_decay_length_penalty": null, 52 | "finetuning_task": null, 53 | "forced_bos_token_id": null, 54 | "forced_eos_token_id": null, 55 | "hidden_act": "quick_gelu", 56 | "hidden_size": 1024, 57 | "id2label": { 58 | "0": "LABEL_0", 59 | "1": "LABEL_1" 60 | }, 61 | "image_size": 224, 62 | "initializer_factor": 1.0, 63 | "initializer_range": 0.02, 64 | "intermediate_size": 4096, 65 | "is_decoder": false, 66 | "is_encoder_decoder": false, 67 | "label2id": { 68 | "LABEL_0": 0, 69 | "LABEL_1": 1 70 | }, 71 | "layer_norm_eps": 1e-05, 72 | "length_penalty": 1.0, 73 | "max_length": 20, 74 | "min_length": 0, 75 | "model_type": "clip_vision_model", 76 | "no_repeat_ngram_size": 0, 77 | "num_attention_heads": 16, 78 | "num_beam_groups": 1, 79 | "num_beams": 1, 80 | "num_channels": 3, 81 | "num_hidden_layers": 24, 82 | "num_return_sequences": 1, 83 | "output_attentions": false, 84 | "output_hidden_states": false, 85 | "output_scores": false, 86 | "pad_token_id": null, 87 | "patch_size": 14, 88 | "prefix": null, 89 | "problem_type": null, 90 | "projection_dim": 512, 91 | "pruned_heads": {}, 92 | "remove_invalid_values": false, 93 | 
"repetition_penalty": 1.0, 94 | "return_dict": true, 95 | "return_dict_in_generate": false, 96 | "sep_token_id": null, 97 | "suppress_tokens": null, 98 | "task_specific_params": null, 99 | "temperature": 1.0, 100 | "tf_legacy_loss": false, 101 | "tie_encoder_decoder": false, 102 | "tie_word_embeddings": true, 103 | "tokenizer_class": null, 104 | "top_k": 50, 105 | "top_p": 1.0, 106 | "torch_dtype": null, 107 | "torchscript": false, 108 | "transformers_version": "4.28.1", 109 | "typical_p": 1.0, 110 | "use_bfloat16": false 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_falcon_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from .configuration_flamingo import FlamingoConfig 4 | from .modeling_flamingo import FlamingoForConditionalGeneration 5 | 6 | root_dir = os.environ["AZP"] 7 | print(root_dir) 8 | 9 | 10 | config = FlamingoConfig.from_json_file(".flamingo-falcon-7B.json") 11 | model = FlamingoForConditionalGeneration(config=config) 12 | 13 | 14 | state_dict_files = [ 15 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin", 16 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin", 17 | ] 18 | 19 | state_dict = {} 20 | for file in state_dict_files: 21 | state_dict_part = torch.load(file, map_location="cpu") 22 | state_dict.update(state_dict_part) 23 | 24 | 25 | state_dict_3 = torch.load("{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu") 26 | for cur_key in list(state_dict_3.keys()): 27 | if "vision_encoder" not in cur_key: 28 | del state_dict_3[cur_key] 29 | 30 | _ = model.load_state_dict( 31 | state_dict_3, 32 | False, 33 | ) 34 | print(_[1]) 35 | 36 | save_state_dict_1 = {} 37 | for key in state_dict: 38 | if ".h." 
in key: 39 | _, _, layer_num, *remain_names = key.split(".") 40 | target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 41 | else: 42 | target_key = key 43 | save_state_dict_1[f"{target_key}"] = state_dict[key] 44 | _ = model.lang_encoder.load_state_dict( 45 | save_state_dict_1, 46 | False, 47 | ) 48 | print(_[1]) 49 | model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/") 50 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_llama2_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from .configuration_flamingo import FlamingoConfig 10 | from .modeling_flamingo import FlamingoForConditionalGeneration 11 | 12 | # from .configuration_flamingo import FlamingoConfig 13 | # from .modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | parser = argparse.ArgumentParser(description="Convert Vicuna model") 16 | parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'") 17 | parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints") 18 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 19 | args = parser.parse_args() 20 | 21 | # os.environ["TOKENIZERS_PARALLELISM"] = "false" 22 | 23 | root_dir = args.llama2_root_dir 24 | model_choice = args.model_choice 25 | save_root_dir = args.save_root_dir 26 | 27 | # prepare vicuna model at first 28 | # you can visit https://huggingface.co/lmsys/Llama-2-33b-chat-hf to download 7B and 30B instruct checkpoints. 29 | if model_choice == "7B": 30 | config_file = "./flamingo/flamingo-llama2-chat-7B.json" 31 | state_dict_files = [ 32 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin", 33 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin", 34 | ] 35 | save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init" 36 | elif model_choice == "13B": 37 | config_file = "./flamingo/flamingo-llama2-chat-13B.json" 38 | state_dict_files = [ 39 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin", 40 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin", 41 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin", 42 | ] 43 | save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init" 44 | else: 45 | raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.") 46 | 47 | config = FlamingoConfig.from_json_file(config_file) 48 | model = FlamingoForConditionalGeneration(config=config) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 
52 | # AZP = "os.environ["AZP"]" 53 | AZP = os.environ["AZP"] 54 | state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu") 55 | for cur_key in list(state_dict_3.keys()): 56 | if "vision_encoder" not in cur_key: 57 | del state_dict_3[cur_key] 58 | 59 | load_msg = model.load_state_dict( 60 | state_dict_3, 61 | False, 62 | ) 63 | # print incompatible keys 64 | print(load_msg[1]) 65 | 66 | # Loading vicuna weights 67 | state_dict = {} 68 | for file in tqdm(state_dict_files, desc="Loading state dict"): 69 | state_dict_part = torch.load(file, map_location="cpu") 70 | state_dict.update(state_dict_part) 71 | 72 | save_state_dict_1 = {} 73 | for key in state_dict: 74 | if ".layers." in key: 75 | _, _, layer_num, *remain_names = key.split(".") 76 | target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 77 | else: 78 | target_key = key 79 | save_state_dict_1[f"{target_key}"] = state_dict[key] 80 | 81 | # Reshape the token embedding to 50280 for compatible 82 | model.lang_encoder.resize_token_embeddings(32000) 83 | 84 | load_msg = model.lang_encoder.load_state_dict( 85 | save_state_dict_1, 86 | False, 87 | ) 88 | # Reshape the token embedding to 32002 for compatible 89 | model.lang_encoder.resize_token_embeddings(32002) 90 | # print incompatible keys 91 | print(load_msg[1]) 92 | 93 | 94 | print(f"Saving model to {save_path}...") 95 | model.save_pretrained(save_path, max_shard_size="10GB") 96 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from configuration_flamingo import FlamingoConfig 10 | from modeling_flamingo import FlamingoForConditionalGeneration 11 | from utils import rename_flamingo_checkpoint 12 | 13 | 14 | parser = argparse.ArgumentParser(description="Convert MPT model") 15 | parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints") 16 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 17 | parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected") 18 | args = parser.parse_args() 19 | 20 | 21 | root_dir = args.mpt_root_dir 22 | save_root_dir = args.save_root_dir 23 | 24 | # prepare mpt model at first 25 | # you can visit https://huggingface.co/mosaicml to download 7B and 30B instruct checkpoints. 26 | config_file = "./flamingo/flamingo-mpt-1B-redpajama.json" 27 | state_dict_file = f"{root_dir}/pytorch_model.bin" 28 | save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly" 29 | 30 | config = FlamingoConfig.from_json_file(config_file) 31 | 32 | model = FlamingoForConditionalGeneration(config=config) 33 | 34 | # Loading mpt weights 35 | state_dict = torch.load(state_dict_file, map_location="cpu") 36 | save_state_dict_1 = {} 37 | for key in state_dict: 38 | if ".blocks." 
in key: 39 | _, _, layer_num, *remain_names = key.split(".") 40 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 41 | else: 42 | target_key = key 43 | save_state_dict_1[f"{target_key}"] = state_dict[key] 44 | 45 | load_msg = model.lang_encoder.load_state_dict( 46 | save_state_dict_1, 47 | False, 48 | ) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 52 | AZP = os.environ["AZP"] 53 | state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu") 54 | for cur_key in list(state_dict_3.keys()): 55 | if "vision_encoder" not in cur_key: 56 | del state_dict_3[cur_key] 57 | 58 | load_msg = model.load_state_dict( 59 | state_dict_3, 60 | False, 61 | ) 62 | # print incompatible keys 63 | print(load_msg[1]) 64 | 65 | save_state_dict_1 = {} 66 | for key in state_dict: 67 | if ".blocks." in key: 68 | _, _, layer_num, *remain_names = key.split(".") 69 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 70 | else: 71 | target_key = key 72 | save_state_dict_1[f"{target_key}"] = state_dict[key] 73 | 74 | load_msg = model.lang_encoder.load_state_dict( 75 | save_state_dict_1, 76 | False, 77 | ) 78 | # print incompatible keys 79 | print(load_msg[1]) 80 | if args.flamingo_dir is not None: 81 | state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu") 82 | save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2) 83 | real_vocab_size = config.text_config.vocab_size 84 | # Reshape the token embedding to 50280 for compatible 85 | model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0]) 86 | 87 | load_msg = model.load_state_dict( 88 | save_state_dict_2, 89 | False, 90 | ) 91 | # print incompatible keys 92 | print(load_msg[1]) 93 | # Reshape the token embedding to 50432 94 | model.lang_encoder.resize_token_embeddings(real_vocab_size) 95 | 96 | print(f"Saving model to {save_path}...") 97 | model.save_pretrained(save_path, max_shard_size="10GB") 98 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | 4 | 5 | def rename_flamingo_checkpoint(old_ckpt: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: 6 | """Rename some keys in the public flamingo checkpoint""" 7 | perceiver_pattern1 = re.compile(r"perceiver\.layers\.[0-9]\.0") 8 | perceiver_pattern2 = re.compile(r"perceiver\.layers\.[0-9]\.1") 9 | new_ckpt = old_ckpt.copy() 10 | for key, value in old_ckpt.items(): 11 | if re.match(perceiver_pattern1, key): 12 | new_key = re.sub(r"([0-9])\.0", r"\1", key) 13 | new_ckpt.pop(key) 14 | new_ckpt[new_key] = value 15 | elif re.match(perceiver_pattern2, key): 16 | new_key = re.sub(r"([0-9])\.1", r"\1.feed_forward", key) 17 | new_ckpt.pop(key) 18 | new_ckpt[new_key] = value 19 | elif key.startswith("lang_encoder.gated_cross_attn_layers."): 20 | new_ckpt.pop(key) 21 | elif key.startswith("lang_encoder.") and "ff_gate" not in key: 22 | new_key = key.replace("ff", "feed_forward") 23 | new_ckpt.pop(key) 24 | new_ckpt[new_key] = value 25 | return new_ckpt 26 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_bf16.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from ..configuration_flamingo import FlamingoConfig 7 | from ..modeling_flamingo import FlamingoForConditionalGeneration 8 | 9 | parser = argparse.ArgumentParser(description="Load model with precision") 10 | parser.add_argument( 11 | "--load_bit", 12 | type=str, 13 | choices=["fp16", "bf16"], 14 | required=True, 15 | help="Choose either 'fp16' or 'bf16'", 16 | ) 17 | parser.add_argument( 18 | "--pretrained_model_path", 19 | type=str, 20 | default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", 21 | required=True, 22 | ) 23 | parser.add_argument( 24 | "--saved_model_path", 25 | type=str, 26 | default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", 27 | required=True, 28 | ) 29 | args = parser.parse_args() 30 | 31 | load_bit = args.load_bit 32 | pretrained_model_path = args.pretrained_model_path 33 | 34 | if load_bit == "fp16": 35 | precision = {"torch_dtype": torch.float16} 36 | elif load_bit == "bf16": 37 | precision = {"torch_dtype": torch.bfloat16} 38 | 39 | root_dir = os.environ["AZP"] 40 | print(root_dir) 41 | device_id = "cpu" 42 | model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision) 43 | 44 | # save model to same folder 45 | checkpoint_path = pretrained_model_path + f"-{load_bit}" 46 | model.save_pretrained(checkpoint_path, max_shard_size="10GB") 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import re 5 | import argparse 6 | import os 7 | 8 | import torch 9 | import torch.nn as nn 10 | from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer 11 | 12 | import sys 13 | from ..configuration_flamingo import FlamingoConfig 14 | from ..modeling_flamingo import FlamingoForConditionalGeneration 15 | 16 | 17 | @torch.no_grad() 18 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 19 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 20 | if old_ckpt.get("model_state_dict", None) is not None: 21 | old_ckpt = old_ckpt["model_state_dict"] 22 | new_ckpt = old_ckpt 23 | folder_path = os.path.dirname(old_ckpt_path) 24 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json" 25 | model = FlamingoForConditionalGeneration.from_pretrained( 26 | args.pretrained_model_path, 27 | device_map="auto", 28 | ) 29 | _ = model.load_state_dict(new_ckpt, strict=False) 30 | print(f"Saving HF model to {new_folder_path}") 31 | model.save_pretrained(new_folder_path) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "--old_ckpt_path", 38 | "-old", 39 | type=str, 40 | required=True, 41 | help="Path to the pt checkpoint", 42 | ) 43 | parser.add_argument( 44 | "--new_hf_path", 45 | "-new", 46 | type=str, 47 | required=True, 48 | help="Path to the hf folder", 49 | ) 50 | parser.add_argument( 51 | "--pretrained_model_path", 52 | "-pretrained", 53 | type=str, 54 | required=True, 55 | help="Path to the pretrained model folder", 56 | ) 57 | args = parser.parse_args() 58 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 59 | os.makedirs(os.path.dirname(args.new_hf_path)) 60 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 61 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/converting_flamingo_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from ..configuration_flamingo import FlamingoConfig 6 | from ..modeling_flamingo import FlamingoForConditionalGeneration 7 | from peft import get_peft_model, LoraConfig, TaskType 8 | 9 | MODEL_CLASSES = { 10 | "LlamaForCausalLM": "llama", 11 | "OPTForCausalLM": "opt", 12 | "GPTJForCausalLM": "gptj", 13 | "GPTNeoXForCausalLM": "gpt_neox", 14 | "MPTForCausalLM": "mpt", 15 | } 16 | 17 | # Define argument parser 18 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 19 | 20 | # Add arguments 21 | parser.add_argument( 22 | "--checkpoint_path", 23 | type=str, 24 | help="Path to the pre-trained model checkpoint.", 25 | default="", 26 | ) 27 | parser.add_argument( 28 | "--save_path", 29 | type=str, 30 | default="", 31 | help="Path to the converted model checkpoint.", 32 | ) 33 | 34 | # Parse the input arguments 35 | args = parser.parse_args() 36 | 37 | load_bit = "bf16" 38 | if load_bit == "fp16": 39 | precision = {"torch_dtype": torch.float16} 40 | elif load_bit == "bf16": 41 | precision = {"torch_dtype": torch.bfloat16} 42 | 43 | # Load the model 44 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 45 | 46 | # adding lora 47 | standard_modules = ["q_proj", "v_proj"] 48 | lang_encoder_short_name = 
MODEL_CLASSES[model.config.text_config.architectures[0]] 49 | model_to_lora_modules = { 50 | "llama": standard_modules, 51 | "opt": standard_modules, 52 | "gptj": standard_modules, 53 | "gpt_neox": ["query_key_value"], 54 | "mpt": ["Wqkv"], 55 | } 56 | lora_config = LoraConfig( 57 | r=16, 58 | lora_alpha=32, 59 | lora_dropout=0.05, 60 | task_type=TaskType.CAUSAL_LM, 61 | target_modules=model_to_lora_modules[lang_encoder_short_name], 62 | ) 63 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 64 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 65 | model.lang_encoder.print_trainable_parameters() 66 | 67 | # Save the model 68 | checkpoint_path = args.save_path 69 | FlamingoForConditionalGeneration.save_pretrained(model, checkpoint_path) 70 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/flamingo-falcon-7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "FlamingoModel" 5 | ], 6 | "cross_attn_every_n_layers": 4, 7 | "model_type": "flamingo", 8 | "text_config": { 9 | "architectures": [ 10 | "RWForCausalLM" 11 | ], 12 | "apply_residual_connection_post_layernorm": false, 13 | "attention_dropout": 0.0, 14 | "bias": false, 15 | "bos_token_id": 11, 16 | "eos_token_id": 11, 17 | "hidden_dropout": 0.0, 18 | "hidden_size": 4544, 19 | "initializer_range": 0.02, 20 | "layer_norm_epsilon": 1e-05, 21 | "model_type": "RefinedWebModel", 22 | "multi_query": true, 23 | "n_head": 71, 24 | "n_layer": 32, 25 | "parallel_attn": true, 26 | "torch_dtype": "bfloat16", 27 | "transformers_version": "4.27.4", 28 | "use_cache": true, 29 | "vocab_size": 65024 30 | }, 31 | "tie_word_embeddings": false, 32 | "torch_dtype": "float32", 33 | "transformers_version": null, 34 | "use_media_placement_augmentation": true, 35 | "vision_config": { 36 | "_name_or_path": "openai/clip-vit-large-patch14", 37 | "add_cross_attention": false, 38 | "architectures": null, 39 | "attention_dropout": 0.0, 40 | "bad_words_ids": null, 41 | "begin_suppress_tokens": null, 42 | "bos_token_id": null, 43 | "chunk_size_feed_forward": 0, 44 | "cross_attention_hidden_size": null, 45 | "decoder_start_token_id": null, 46 | "diversity_penalty": 0.0, 47 | "do_sample": false, 48 | "early_stopping": false, 49 | "encoder_no_repeat_ngram_size": 0, 50 | "eos_token_id": null, 51 | "exponential_decay_length_penalty": null, 52 | "finetuning_task": null, 53 | "forced_bos_token_id": null, 54 | "forced_eos_token_id": null, 55 | "hidden_act": "quick_gelu", 56 | "hidden_size": 1024, 57 | "id2label": { 58 | "0": "LABEL_0", 59 | "1": "LABEL_1" 60 | }, 61 | "image_size": 224, 62 | "initializer_factor": 1.0, 63 | "initializer_range": 0.02, 64 | "intermediate_size": 4096, 65 | "is_decoder": false, 66 | "is_encoder_decoder": false, 67 | "label2id": { 68 | "LABEL_0": 0, 69 | "LABEL_1": 1 70 | }, 71 | "layer_norm_eps": 1e-05, 72 | "length_penalty": 1.0, 73 | "max_length": 20, 74 | "min_length": 0, 75 | "model_type": "clip_vision_model", 76 | "no_repeat_ngram_size": 0, 77 | "num_attention_heads": 16, 78 | "num_beam_groups": 1, 79 | "num_beams": 1, 80 | "num_channels": 3, 81 | "num_hidden_layers": 24, 82 | "num_return_sequences": 1, 83 | "output_attentions": false, 84 | "output_hidden_states": false, 85 | "output_scores": false, 86 | "pad_token_id": null, 87 | "patch_size": 14, 88 | "prefix": null, 89 | "problem_type": null, 90 | "projection_dim": 512, 91 
| "pruned_heads": {}, 92 | "remove_invalid_values": false, 93 | "repetition_penalty": 1.0, 94 | "return_dict": true, 95 | "return_dict_in_generate": false, 96 | "sep_token_id": null, 97 | "suppress_tokens": null, 98 | "task_specific_params": null, 99 | "temperature": 1.0, 100 | "tf_legacy_loss": false, 101 | "tie_encoder_decoder": false, 102 | "tie_word_embeddings": true, 103 | "tokenizer_class": null, 104 | "top_k": 50, 105 | "top_p": 1.0, 106 | "torch_dtype": null, 107 | "torchscript": false, 108 | "transformers_version": "4.28.1", 109 | "typical_p": 1.0, 110 | "use_bfloat16": false 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_falcon_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from ..configuration_flamingo import FlamingoConfig 4 | from ..modeling_flamingo import FlamingoForConditionalGeneration 5 | 6 | root_dir = os.environ["AZP"] 7 | print(root_dir) 8 | 9 | 10 | config = FlamingoConfig.from_json_file(".flamingo-falcon-7B.json") 11 | model = FlamingoForConditionalGeneration(config=config) 12 | 13 | 14 | state_dict_files = [ 15 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin", 16 | f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin", 17 | ] 18 | 19 | state_dict = {} 20 | for file in state_dict_files: 21 | state_dict_part = torch.load(file, map_location="cpu") 22 | state_dict.update(state_dict_part) 23 | 24 | 25 | state_dict_3 = torch.load( 26 | "{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", 27 | map_location="cpu", 28 | ) 29 | for cur_key in list(state_dict_3.keys()): 30 | if "vision_encoder" not in cur_key: 31 | del state_dict_3[cur_key] 32 | 33 | _ = model.load_state_dict( 34 | state_dict_3, 35 | False, 36 | ) 37 | print(_[1]) 38 | 39 | save_state_dict_1 = {} 40 | for key in state_dict: 41 | if ".h." 
in key: 42 | _, _, layer_num, *remain_names = key.split(".") 43 | target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 44 | else: 45 | target_key = key 46 | save_state_dict_1[f"{target_key}"] = state_dict[key] 47 | _ = model.lang_encoder.load_state_dict( 48 | save_state_dict_1, 49 | False, 50 | ) 51 | print(_[1]) 52 | model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/") 53 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_llama2_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from ..configuration_flamingo import FlamingoConfig 10 | from ..modeling_flamingo import FlamingoForConditionalGeneration 11 | 12 | # from .configuration_flamingo import FlamingoConfig 13 | # from .modeling_flamingo import FlamingoForConditionalGeneration 14 | 15 | parser = argparse.ArgumentParser(description="Convert Vicuna model") 16 | parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'") 17 | parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints") 18 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 19 | args = parser.parse_args() 20 | 21 | # os.environ["TOKENIZERS_PARALLELISM"] = "false" 22 | 23 | root_dir = args.llama2_root_dir 24 | model_choice = args.model_choice 25 | save_root_dir = args.save_root_dir 26 | 27 | # prepare vicuna model at first 28 | # you can visit https://huggingface.co/lmsys/Llama-2-33b-chat-hf to download 7B and 30B instruct checkpoints. 29 | if model_choice == "7B": 30 | config_file = "./flamingo/flamingo-llama2-chat-7B.json" 31 | state_dict_files = [ 32 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin", 33 | f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin", 34 | ] 35 | save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init" 36 | elif model_choice == "13B": 37 | config_file = "./flamingo/flamingo-llama2-chat-13B.json" 38 | state_dict_files = [ 39 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin", 40 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin", 41 | f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin", 42 | ] 43 | save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init" 44 | else: 45 | raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.") 46 | 47 | config = FlamingoConfig.from_json_file(config_file) 48 | model = FlamingoForConditionalGeneration(config=config) 49 | 50 | # load flamingo's vision encoder from last checkpoint. 51 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 
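# A minimal sanity-check sketch for the shard referenced in the comment above: it fetches
# pytorch_model-00004-of-00004.bin from luodian/openflamingo-9b-hf (repo id and filename taken
# from this script; the variable names and the default cache location are illustrative, not
# part of the original script) and lists which keys would survive the "vision_encoder" filter
# applied just below.
from huggingface_hub import hf_hub_download
import torch

shard_path = hf_hub_download(repo_id="luodian/openflamingo-9b-hf", filename="pytorch_model-00004-of-00004.bin")
shard = torch.load(shard_path, map_location="cpu")
vision_keys = [k for k in shard if "vision_encoder" in k]
print(f"{len(vision_keys)} vision_encoder keys, e.g. {vision_keys[:3]}")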
52 | # AZP = "os.environ["AZP"]" 53 | AZP = os.environ["AZP"] 54 | state_dict_3 = torch.load( 55 | f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", 56 | map_location="cpu", 57 | ) 58 | for cur_key in list(state_dict_3.keys()): 59 | if "vision_encoder" not in cur_key: 60 | del state_dict_3[cur_key] 61 | 62 | load_msg = model.load_state_dict( 63 | state_dict_3, 64 | False, 65 | ) 66 | # print incompatible keys 67 | print(load_msg[1]) 68 | 69 | # Loading vicuna weights 70 | state_dict = {} 71 | for file in tqdm(state_dict_files, desc="Loading state dict"): 72 | state_dict_part = torch.load(file, map_location="cpu") 73 | state_dict.update(state_dict_part) 74 | 75 | save_state_dict_1 = {} 76 | for key in state_dict: 77 | if ".layers." in key: 78 | _, _, layer_num, *remain_names = key.split(".") 79 | target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 80 | else: 81 | target_key = key 82 | save_state_dict_1[f"{target_key}"] = state_dict[key] 83 | 84 | # Reshape the token embedding to 50280 for compatible 85 | model.lang_encoder.resize_token_embeddings(32000) 86 | 87 | load_msg = model.lang_encoder.load_state_dict( 88 | save_state_dict_1, 89 | False, 90 | ) 91 | # Reshape the token embedding to 32002 for compatible 92 | model.lang_encoder.resize_token_embeddings(32002) 93 | # print incompatible keys 94 | print(load_msg[1]) 95 | 96 | 97 | print(f"Saving model to {save_path}...") 98 | model.save_pretrained(save_path, max_shard_size="10GB") 99 | -------------------------------------------------------------------------------- /src/otter_ai/models/flamingo/utils/injecting_mpt-1B-redpajama_into_flamingo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | import sys 8 | 9 | from ..configuration_flamingo import FlamingoConfig 10 | from ..modeling_flamingo import FlamingoForConditionalGeneration 11 | from utils import rename_flamingo_checkpoint 12 | 13 | 14 | parser = argparse.ArgumentParser(description="Convert MPT model") 15 | parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints") 16 | parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints") 17 | parser.add_argument( 18 | "--flamingo_dir", 19 | type=str, 20 | default=None, 21 | help="If the pretrained flamingo weights also need to be injected", 22 | ) 23 | args = parser.parse_args() 24 | 25 | 26 | root_dir = args.mpt_root_dir 27 | save_root_dir = args.save_root_dir 28 | 29 | # prepare mpt model at first 30 | # you can visit https://huggingface.co/mosaicml to download 7B and 30B instruct checkpoints. 31 | config_file = "./flamingo/flamingo-mpt-1B-redpajama.json" 32 | state_dict_file = f"{root_dir}/pytorch_model.bin" 33 | save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly" 34 | 35 | config = FlamingoConfig.from_json_file(config_file) 36 | 37 | model = FlamingoForConditionalGeneration(config=config) 38 | 39 | # Loading mpt weights 40 | state_dict = torch.load(state_dict_file, map_location="cpu") 41 | save_state_dict_1 = {} 42 | for key in state_dict: 43 | if ".blocks." 
in key: 44 | _, _, layer_num, *remain_names = key.split(".") 45 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 46 | else: 47 | target_key = key 48 | save_state_dict_1[f"{target_key}"] = state_dict[key] 49 | 50 | load_msg = model.lang_encoder.load_state_dict( 51 | save_state_dict_1, 52 | False, 53 | ) 54 | 55 | # load flamingo's vision encoder from last checkpoint. 56 | # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint. 57 | AZP = os.environ["AZP"] 58 | state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu") 59 | for cur_key in list(state_dict_3.keys()): 60 | if "vision_encoder" not in cur_key: 61 | del state_dict_3[cur_key] 62 | 63 | load_msg = model.load_state_dict( 64 | state_dict_3, 65 | False, 66 | ) 67 | # print incompatible keys 68 | print(load_msg[1]) 69 | 70 | save_state_dict_1 = {} 71 | for key in state_dict: 72 | if ".blocks." in key: 73 | _, _, layer_num, *remain_names = key.split(".") 74 | target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}" 75 | else: 76 | target_key = key 77 | save_state_dict_1[f"{target_key}"] = state_dict[key] 78 | 79 | load_msg = model.lang_encoder.load_state_dict( 80 | save_state_dict_1, 81 | False, 82 | ) 83 | # print incompatible keys 84 | print(load_msg[1]) 85 | if args.flamingo_dir is not None: 86 | state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu") 87 | save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2) 88 | real_vocab_size = config.text_config.vocab_size 89 | # Reshape the token embedding to 50280 for compatible 90 | model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0]) 91 | 92 | load_msg = model.load_state_dict( 93 | save_state_dict_2, 94 | False, 95 | ) 96 | # print incompatible keys 97 | print(load_msg[1]) 98 | # Reshape the token embedding to 50432 99 | model.lang_encoder.resize_token_embeddings(real_vocab_size) 100 | 101 | print(f"Saving model to {save_path}...") 102 | model.save_pretrained(save_path, max_shard_size="10GB") 103 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/mpt/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast 3 | 4 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 5 | NUM_SENTINEL_TOKENS: int = 100 6 | 7 | 8 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 9 | """Adds sentinel tokens and padding token (if missing). 10 | 11 | Expands the tokenizer vocabulary to include sentinel tokens 12 | used in mixture-of-denoiser tasks as well as a padding token. 13 | 14 | All added tokens are added as special tokens. No tokens are 15 | added if sentinel tokens and padding token already exist. 
16 | """ 17 | sentinels_to_add = [f"" for i in range(NUM_SENTINEL_TOKENS)] 18 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 19 | if tokenizer.pad_token is None: 20 | tokenizer.add_tokens("", special_tokens=True) 21 | tokenizer.pad_token = "" 22 | assert tokenizer.pad_token_id is not None 23 | sentinels = "".join([f"" for i in range(NUM_SENTINEL_TOKENS)]) 24 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 25 | tokenizer.sentinel_token_ids = _sentinel_token_ids 26 | 27 | 28 | class AutoTokenizerForMOD(AutoTokenizer): 29 | """AutoTokenizer + Adaptation for MOD. 30 | 31 | A simple wrapper around AutoTokenizer to make instantiating 32 | an MOD-adapted tokenizer a bit easier. 33 | 34 | MOD-adapted tokenizers have sentinel tokens (e.g., ), 35 | a padding token, and a property to get the token ids of the 36 | sentinel tokens. 37 | """ 38 | 39 | @classmethod 40 | def from_pretrained(cls, *args, **kwargs): 41 | """See `AutoTokenizer.from_pretrained` docstring.""" 42 | tokenizer = super().from_pretrained(*args, **kwargs) 43 | adapt_tokenizer_for_denoising(tokenizer) 44 | return tokenizer 45 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | import torch 4 | import torch.nn as nn 5 | from .attention import ATTN_CLASS_REGISTRY 6 | from .norm import NORM_CLASS_REGISTRY 7 | 8 | 9 | class MPTMLP(nn.Module): 10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str] = None): 11 | super().__init__() 12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 13 | ## yh: hard code 14 | # self.act = nn.GELU(approximate='none') 15 | self.act = nn.GELU() 16 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 17 | self.down_proj._is_residual = True 18 | 19 | def forward(self, x): 20 | return self.down_proj(self.act(self.up_proj(x))) 21 | 22 | 23 | class MPTBlock(nn.Module): 24 | def __init__( 25 | self, 26 | d_model: int, 27 | n_heads: int, 28 | expansion_ratio: int, 29 | attn_config: Dict = { 30 | "attn_type": "multihead_attention", 31 | "attn_pdrop": 0.0, 32 | "attn_impl": "triton", 33 | "qk_ln": False, 34 | "clip_qkv": None, 35 | "softmax_scale": None, 36 | "prefix_lm": False, 37 | "attn_uses_sequence_id": False, 38 | "alibi": False, 39 | "alibi_bias_max": 8, 40 | }, 41 | resid_pdrop: float = 0.0, 42 | norm_type: str = "low_precision_layernorm", 43 | verbose: int = 0, 44 | device: Optional[str] = None, 45 | **kwargs 46 | ): 47 | del kwargs 48 | super().__init__() 49 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 50 | attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]] 51 | self.norm_1 = norm_class(d_model, device=device) 52 | self.attn = attn_class( 53 | attn_impl=attn_config["attn_impl"], 54 | clip_qkv=attn_config["clip_qkv"], 55 | qk_ln=attn_config["qk_ln"], 56 | softmax_scale=attn_config["softmax_scale"], 57 | attn_pdrop=attn_config["attn_pdrop"], 58 | d_model=d_model, 59 | n_heads=n_heads, 60 | verbose=verbose, 61 | device=device, 62 | ) 63 | self.norm_2 = norm_class(d_model, device=device) 64 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) 65 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 66 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 67 | 68 | def forward( 69 | self, 70 | x: 
torch.Tensor, 71 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 72 | attn_bias: Optional[torch.Tensor] = None, 73 | attention_mask: Optional[torch.ByteTensor] = None, 74 | is_causal: bool = True, 75 | ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 76 | a = self.norm_1(x) 77 | (b, attn_weights, past_key_value) = self.attn( 78 | a, 79 | past_key_value=past_key_value, 80 | attn_bias=attn_bias, 81 | attention_mask=attention_mask, 82 | is_causal=is_causal, 83 | ) 84 | x = x + self.resid_attn_dropout(b) 85 | m = self.norm_2(x) 86 | n = self.ffn(m) 87 | x = x + self.resid_ffn_dropout(n) 88 | return (x, attn_weights, past_key_value) 89 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | 7 | class SharedEmbedding(nn.Embedding): 8 | def forward(self, input: Tensor, unembed: bool = False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) 12 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _cast_if_autocast_enabled(tensor): 5 | if torch.is_autocast_enabled(): 6 | if tensor.device.type == "cuda": 7 | dtype = torch.get_autocast_gpu_dtype() 8 | elif tensor.device.type == "cpu": 9 | dtype = torch.get_autocast_cpu_dtype() 10 | else: 11 | raise NotImplementedError() 12 | return tensor.to(dtype=dtype) 13 | return tensor 14 | 15 | 16 | class LPLayerNorm(torch.nn.LayerNorm): 17 | def __init__( 18 | self, 19 | normalized_shape, 20 | eps=1e-05, 21 | elementwise_affine=True, 22 | device=None, 23 | dtype=None, 24 | ): 25 | super().__init__( 26 | normalized_shape=normalized_shape, 27 | eps=eps, 28 | elementwise_affine=elementwise_affine, 29 | device=device, 30 | dtype=dtype, 31 | ) 32 | 33 | def forward(self, x): 34 | module_device = x.device 35 | downcast_x = _cast_if_autocast_enabled(x) 36 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 37 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 38 | with torch.autocast(enabled=False, device_type=module_device.type): 39 | return torch.nn.functional.layer_norm( 40 | downcast_x, 41 | self.normalized_shape, 42 | downcast_weight, 43 | downcast_bias, 44 | self.eps, 45 | ) 46 | 47 | 48 | def rms_norm(x, weight=None, eps=1e-05): 49 | output = x / torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 50 | if weight is not None: 51 | return output * weight 52 | return output 53 | 54 | 55 | class RMSNorm(torch.nn.Module): 56 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 57 | super().__init__() 58 | self.eps = eps 59 | if weight: 60 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) 61 | else: 62 | self.register_parameter("weight", None) 63 | 64 | def forward(self, x): 65 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 66 | 67 | 68 | class LPRMSNorm(RMSNorm): 69 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 70 | super().__init__( 71 | normalized_shape=normalized_shape, 72 | eps=eps, 73 | weight=weight, 74 | dtype=dtype, 
75 | device=device, 76 | ) 77 | 78 | def forward(self, x): 79 | downcast_x = _cast_if_autocast_enabled(x) 80 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 81 | with torch.autocast(enabled=False, device_type=x.device.type): 82 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 83 | 84 | 85 | NORM_CLASS_REGISTRY = { 86 | "layernorm": torch.nn.LayerNorm, 87 | "low_precision_layernorm": LPLayerNorm, 88 | "rmsnorm": RMSNorm, 89 | "low_precision_rmsnorm": LPRMSNorm, 90 | } 91 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/mpt_redpajama/__init__.py -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/gpt_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 MosaicML Examples authors 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """GPT Blocks used for the GPT Model.""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | from .attention import MultiheadAttention 12 | from .low_precision_layernorm import LPLayerNorm 13 | 14 | 15 | class GPTMLP(nn.Module): 16 | def __init__(self, d_model: int, mlp_ratio: int, device: Optional[str] = None): 17 | super().__init__() 18 | self.mlp_up = nn.Linear(d_model, mlp_ratio * d_model, device=device) 19 | self.mlp_act = nn.GELU() 20 | self.mlp_down = nn.Linear(mlp_ratio * d_model, d_model, device=device) 21 | self.mlp_down._is_residual = True # type: ignore 22 | 23 | def forward(self, x): 24 | return self.mlp_down(self.mlp_act(self.mlp_up(x))) 25 | 26 | 27 | class GPTBlock(nn.Module): 28 | def __init__( 29 | self, 30 | attn_impl: str, 31 | d_model: int, 32 | n_heads: int, 33 | mlp_ratio: int, 34 | attn_clip_qkv: Optional[float] = None, 35 | attn_qk_ln: bool = False, 36 | softmax_scale: Optional[float] = None, 37 | attn_pdrop: float = 0.0, 38 | alibi: bool = False, 39 | resid_pdrop: float = 0.0, 40 | low_precision_layernorm: bool = False, 41 | device: Optional[str] = None, 42 | **kwargs 43 | ): 44 | del kwargs # unused, just to capture any extra args from the config 45 | super().__init__() 46 | 47 | layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm 48 | 49 | self.ln_1 = layernorm_class(d_model, device=device) 50 | self.attn = MultiheadAttention( 51 | attn_impl=attn_impl, 52 | attn_clip_qkv=attn_clip_qkv, 53 | attn_qk_ln=attn_qk_ln, 54 | softmax_scale=softmax_scale, 55 | attn_pdrop=attn_pdrop, 56 | d_model=d_model, 57 | n_heads=n_heads, 58 | device=device, 59 | ) 60 | self.ln_2 = layernorm_class(d_model, device=device) 61 | self.mlp = GPTMLP( 62 | d_model=d_model, 63 | mlp_ratio=mlp_ratio, 64 | device=device, 65 | ) 66 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 67 | self.resid_mlp_dropout = nn.Dropout(resid_pdrop) 68 | 69 | def forward( 70 | self, 71 | x: torch.Tensor, 72 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 73 | attn_bias: Optional[torch.Tensor] = None, 74 | attention_mask: Optional[torch.ByteTensor] = None, 75 | is_causal: bool = True, 76 | ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 77 | a = self.ln_1(x) 78 | b, _, past_key_value = self.attn( 79 | a, 80 | 
past_key_value=past_key_value, 81 | attn_bias=attn_bias, 82 | attention_mask=attention_mask, 83 | is_causal=is_causal, 84 | ) 85 | x = x + self.resid_attn_dropout(b) 86 | m = self.ln_2(x) 87 | n = self.mlp(m) 88 | x = x + self.resid_mlp_dropout(n) 89 | return x, past_key_value 90 | -------------------------------------------------------------------------------- /src/otter_ai/models/mpt_redpajama/low_precision_layernorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LPLayerNorm(torch.nn.LayerNorm): 6 | def __init__( 7 | self, 8 | normalized_shape, 9 | eps=1e-05, 10 | elementwise_affine=True, 11 | device=None, 12 | dtype=None, 13 | ): 14 | super().__init__( 15 | normalized_shape=normalized_shape, 16 | eps=eps, 17 | elementwise_affine=elementwise_affine, 18 | device=device, 19 | dtype=dtype, 20 | ) 21 | 22 | def forward(self, x): 23 | module_device = x.device 24 | downcast_x = _cast_if_autocast_enabled(x) 25 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 26 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 27 | with torch.autocast(enabled=False, device_type=module_device.type): 28 | return F.layer_norm( 29 | downcast_x, 30 | self.normalized_shape, 31 | downcast_weight, 32 | downcast_bias, 33 | self.eps, 34 | ) 35 | 36 | 37 | def _cast_if_autocast_enabled(tensor): 38 | if torch.is_autocast_enabled(): 39 | if tensor.device.type == "cuda": 40 | dtype = torch.get_autocast_gpu_dtype() 41 | elif tensor.device.type == "cpu": 42 | dtype = torch.get_autocast_cpu_dtype() 43 | else: 44 | raise NotImplementedError() 45 | return tensor.to(dtype=dtype) 46 | return tensor 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/Otter-MPT7B-config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/src/otter_ai/models/otter/Otter-MPT7B-config.json -------------------------------------------------------------------------------- /src/otter_ai/models/otter/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from transformers.utils import ( 4 | OptionalDependencyNotAvailable, 5 | _LazyModule, 6 | is_torch_available, 7 | ) 8 | 9 | 10 | _import_structure = { 11 | "configuration_otter": [ 12 | "OtterConfig", 13 | ], 14 | } 15 | 16 | try: 17 | if not is_torch_available(): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | pass 21 | else: 22 | _import_structure["modeling_otter"] = [ 23 | "OtterModel", 24 | "OtterPreTrainedModel", 25 | "OtterForConditionalGeneration", 26 | ] 27 | 28 | if TYPE_CHECKING: 29 | from .configuration_otter import OtterConfig 30 | 31 | # from .processing_otter import OtterProcessor 32 | 33 | try: 34 | if not is_torch_available(): 35 | raise OptionalDependencyNotAvailable() 36 | except OptionalDependencyNotAvailable: 37 | pass 38 | else: 39 | from .modeling_otter import ( 40 | OtterForConditionalGeneration, 41 | OtterModel, 42 | OtterPreTrainedModel, 43 | ) 44 | 45 | else: 46 | import sys 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 49 | 
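# A minimal usage sketch for the lazy registry above: `_LazyModule` defers importing
# `configuration_otter` and `modeling_otter` until one of the exported names is first
# accessed, so importing the package itself stays lightweight. The attribute accesses
# below are illustrative only; no checkpoint is loaded.
import otter_ai.models.otter as otter

config_cls = otter.OtterConfig  # first access triggers the import of configuration_otter
model_cls = otter.OtterForConditionalGeneration  # first access triggers the import of modeling_otter
print(config_cls.__name__, model_cls.__name__)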
-------------------------------------------------------------------------------- /src/otter_ai/models/otter/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_flamingo_to_otter.py: -------------------------------------------------------------------------------- 1 | # This script is used to convert the huggingface format Open-Flamingo model to the Otter model. 2 | # You can use it in parent folder by running: python -m models.otter.converting_flamingo_to_otter --checkpoint_path --save_path 3 | import argparse 4 | import torch 5 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 6 | from otter_ai.models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 7 | 8 | # Define argument parser 9 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 10 | 11 | # Add arguments 12 | parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained Open-Flamingo model checkpoint.") 13 | parser.add_argument("--save_path", type=str, default=None, help="Path to the converted Otter model checkpoint.") 14 | 15 | # Parse the input arguments 16 | args = parser.parse_args() 17 | 18 | # Load the model 19 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 20 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>", "<answer>"]}) 21 | if model.lang_encoder.__class__.__name__ == "LlamaForCausalLM": 22 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 23 | 24 | # Save the model 25 | checkpoint_path = args.save_path 26 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 27 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_fp32_to_fp16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 4 | 5 | # Define argument parser 6 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 7 | 8 | # Add arguments 9 | parser.add_argument( 10 | "--load_bit", 11 | type=str, 12 | choices=["fp16", "bf16"], 13 | default="fp16", 14 | help="Precision of the loaded model. Either 'fp16' or 'bf16'. 
Default is 'fp16'.", 15 | ) 16 | parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained model checkpoint.") 17 | parser.add_argument("--save_path", type=str, default=None, help="Path to the converted model checkpoint.") 18 | 19 | # Parse the input arguments 20 | args = parser.parse_args() 21 | 22 | # Set precision based on load_bit argument 23 | if args.load_bit == "fp16": 24 | precision = {"torch_dtype": torch.float16} 25 | elif args.load_bit == "bf16": 26 | precision = {"torch_dtype": torch.bfloat16} 27 | 28 | # Load the model 29 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 30 | 31 | # Save the model 32 | if args.save_path is None: 33 | checkpoint_path = args.checkpoint_path + f"-{args.load_bit}" 34 | else: 35 | checkpoint_path = args.save_path 36 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 37 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_pt_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 2 | """ 3 | 4 | import argparse 5 | import os 6 | 7 | import torch 8 | 9 | from modeling_otter import OtterForConditionalGeneration 10 | 11 | 12 | # The function is to inject newly trained otter perceiver parameters into the pretrained otter init model. 13 | @torch.no_grad() 14 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 15 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 16 | if old_ckpt.get("model_state_dict", None) is not None: 17 | old_ckpt = old_ckpt["model_state_dict"] 18 | new_ckpt = old_ckpt 19 | # folder_path = os.path.dirname(old_ckpt_path) 20 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "otter/config.json" 21 | model = OtterForConditionalGeneration.from_pretrained( 22 | args.pretrained_model_path, 23 | device_map="auto", 24 | ) 25 | 26 | if "flamingo" in args.pretrained_model_path: 27 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": [""]}) 28 | if "LlamaForCausalLM" in model.lang_encoder.__class__.__name__: 29 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 30 | 31 | _ = model.load_state_dict(new_ckpt, strict=False) 32 | print(f"Saving HF model to {new_folder_path}") 33 | model.save_pretrained(new_folder_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--old_ckpt_path", 40 | "-old", 41 | type=str, 42 | required=True, 43 | help="Path to the pt checkpoint", 44 | ) 45 | parser.add_argument( 46 | "--new_hf_path", 47 | "-new", 48 | type=str, 49 | required=True, 50 | help="Path to the hf folder", 51 | ) 52 | parser.add_argument( 53 | "--pretrained_model_path", 54 | "-pretrained", 55 | type=str, 56 | default="luodian/OTTER-MPT7B-Init", 57 | required=True, 58 | help="Path to the pretrained model folder.", 59 | ) 60 | args = parser.parse_args() 61 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 62 | os.makedirs(os.path.dirname(args.new_hf_path)) 63 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 64 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/converting_otter_to_lora.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_otter import OtterForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705-LoRA", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | # Load the model 37 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 38 | 39 | # adding lora 40 | standard_modules = ["q_proj", "v_proj"] 41 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 42 | model_to_lora_modules = { 43 | "llama": standard_modules, 44 | "opt": standard_modules, 45 | "gptj": standard_modules, 46 | "gpt_neox": ["query_key_value"], 47 | "mpt": ["Wqkv"], 48 | } 49 | lora_config = LoraConfig( 50 | r=16, 51 | lora_alpha=32, 52 | lora_dropout=0.05, 53 | task_type=TaskType.CAUSAL_LM, 54 | target_modules=model_to_lora_modules[lang_encoder_short_name], 55 | ) 56 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 57 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 58 | 59 | # Save the model 60 | checkpoint_path = args.save_path 61 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 62 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_flamingo_to_otter.py: -------------------------------------------------------------------------------- 1 | # This script is used to convert the huggingface format Open-Flamingo model to the Otter model. 
2 | # You can use it in parent folder by running: python -m models.otter.converting_flamingo_to_otter --checkpoint_path --save_path 3 | import argparse 4 | import torch 5 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 6 | from otter_ai.models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration 7 | 8 | # Define argument parser 9 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 10 | 11 | # Add arguments 12 | parser.add_argument( 13 | "--checkpoint_path", 14 | type=str, 15 | required=True, 16 | help="Path to the pre-trained Open-Flamingo model checkpoint.", 17 | ) 18 | parser.add_argument( 19 | "--save_path", 20 | type=str, 21 | default=None, 22 | help="Path to the converted Otter model checkpoint.", 23 | ) 24 | 25 | # Parse the input arguments 26 | args = parser.parse_args() 27 | 28 | # Load the model 29 | model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 30 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "", ""]}) 31 | if model.lang_encoder.__class__.__name__ == "LlamaForCausalLM": 32 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 33 | 34 | # Save the model 35 | checkpoint_path = args.save_path 36 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 37 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_fp32_to_fp16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration 4 | 5 | # Define argument parser 6 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 7 | 8 | # Add arguments 9 | parser.add_argument( 10 | "--load_bit", 11 | type=str, 12 | choices=["fp16", "bf16"], 13 | default="fp16", 14 | help="Precision of the loaded model. Either 'fp16' or 'bf16'. Default is 'fp16'.", 15 | ) 16 | parser.add_argument( 17 | "--checkpoint_path", 18 | type=str, 19 | required=True, 20 | help="Path to the pre-trained model checkpoint.", 21 | ) 22 | parser.add_argument( 23 | "--save_path", 24 | type=str, 25 | default=None, 26 | help="Path to the converted model checkpoint.", 27 | ) 28 | 29 | # Parse the input arguments 30 | args = parser.parse_args() 31 | 32 | # Set precision based on load_bit argument 33 | if args.load_bit == "fp16": 34 | precision = {"torch_dtype": torch.float16} 35 | elif args.load_bit == "bf16": 36 | precision = {"torch_dtype": torch.bfloat16} 37 | 38 | # Load the model 39 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) 40 | 41 | # Save the model 42 | if args.save_path is None: 43 | checkpoint_path = args.checkpoint_path + f"-{args.load_bit}" 44 | else: 45 | checkpoint_path = args.save_path 46 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 47 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_pt_to_hf.py: -------------------------------------------------------------------------------- 1 | """convert from otter pt to otter hf. Will remove after we use otter hf model to train. 
2 | """ 3 | 4 | import argparse 5 | import os 6 | 7 | import torch 8 | 9 | from modeling_otter import OtterForConditionalGeneration 10 | 11 | 12 | # The function is to inject newly trained otter perceiver parameters into the pretrained otter init model. 13 | @torch.no_grad() 14 | def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: 15 | old_ckpt = torch.load(old_ckpt_path, map_location="cpu") 16 | if old_ckpt.get("model_state_dict", None) is not None: 17 | old_ckpt = old_ckpt["model_state_dict"] 18 | new_ckpt = old_ckpt 19 | # folder_path = os.path.dirname(old_ckpt_path) 20 | # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "otter/config.json" 21 | model = OtterForConditionalGeneration.from_pretrained( 22 | args.pretrained_model_path, 23 | device_map="auto", 24 | ) 25 | 26 | if "flamingo" in args.pretrained_model_path: 27 | model.text_tokenizer.add_special_tokens({"additional_special_tokens": [""]}) 28 | if "LlamaForCausalLM" in model.lang_encoder.__class__.__name__: 29 | model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) 30 | 31 | _ = model.load_state_dict(new_ckpt, strict=False) 32 | print(f"Saving HF model to {new_folder_path}") 33 | model.save_pretrained(new_folder_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--old_ckpt_path", 40 | "-old", 41 | type=str, 42 | required=True, 43 | help="Path to the pt checkpoint", 44 | ) 45 | parser.add_argument( 46 | "--new_hf_path", 47 | "-new", 48 | type=str, 49 | required=True, 50 | help="Path to the hf folder", 51 | ) 52 | parser.add_argument( 53 | "--pretrained_model_path", 54 | "-pretrained", 55 | type=str, 56 | default="luodian/OTTER-MPT7B-Init", 57 | required=True, 58 | help="Path to the pretrained model folder.", 59 | ) 60 | args = parser.parse_args() 61 | if not os.path.exists(os.path.dirname(args.new_hf_path)): 62 | os.makedirs(os.path.dirname(args.new_hf_path)) 63 | dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) 64 | -------------------------------------------------------------------------------- /src/otter_ai/models/otter/utils/converting_otter_to_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import sys 4 | 5 | from .modeling_otter import OtterForConditionalGeneration 6 | from peft import get_peft_model, LoraConfig, TaskType 7 | 8 | MODEL_CLASSES = { 9 | "LlamaForCausalLM": "llama", 10 | "OPTForCausalLM": "opt", 11 | "GPTJForCausalLM": "gptj", 12 | "GPTNeoXForCausalLM": "gpt_neox", 13 | "MPTForCausalLM": "mpt", 14 | } 15 | 16 | # Define argument parser 17 | parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") 18 | 19 | # Add arguments 20 | parser.add_argument( 21 | "--checkpoint_path", 22 | type=str, 23 | help="Path to the pre-trained model checkpoint.", 24 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705", 25 | ) 26 | parser.add_argument( 27 | "--save_path", 28 | type=str, 29 | default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705-LoRA", 30 | help="Path to the converted model checkpoint.", 31 | ) 32 | 33 | # Parse the input arguments 34 | args = parser.parse_args() 35 | 36 | # Load the model 37 | model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") 38 | 39 | # adding lora 40 | standard_modules = 
["q_proj", "v_proj"] 41 | lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] 42 | model_to_lora_modules = { 43 | "llama": standard_modules, 44 | "opt": standard_modules, 45 | "gptj": standard_modules, 46 | "gpt_neox": ["query_key_value"], 47 | "mpt": ["Wqkv"], 48 | } 49 | lora_config = LoraConfig( 50 | r=16, 51 | lora_alpha=32, 52 | lora_dropout=0.05, 53 | task_type=TaskType.CAUSAL_LM, 54 | target_modules=model_to_lora_modules[lang_encoder_short_name], 55 | ) 56 | model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) 57 | model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) 58 | 59 | # Save the model 60 | checkpoint_path = args.save_path 61 | OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) 62 | -------------------------------------------------------------------------------- /unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolvingLMMs-Lab/Otter/1e7eb9a6fb12ef410082e796c463b99495637b85/unit_tests/__init__.py -------------------------------------------------------------------------------- /unit_tests/test_mmc4_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock 3 | from pipeline.mimicit_utils.data import get_mmc4_dataset 4 | 5 | 6 | class TestGetMMC4Dataset(unittest.TestCase): 7 | def test_get_mmc4_dataset(self): 8 | # Mock the required inputs 9 | args = Mock( 10 | mmc4_shards="/home/luodian/projects/Otter/archived/000000000.tar", 11 | train_num_samples_mmc4=1000, 12 | mmc4_textsim_threshold=0.32, 13 | batch_size_mmc4=10, 14 | seed=0, 15 | workers=2, 16 | world_size=1, 17 | ) 18 | image_processor = Mock() 19 | tokenizer = Mock() 20 | 21 | # Call the function to test 22 | data_info = get_mmc4_dataset(args, image_processor, tokenizer) 23 | 24 | # Check if the dataloader's attributes are as expected 25 | self.assertEqual(data_info.dataloader.num_batches, 100) 26 | self.assertEqual(data_info.dataloader.num_samples, 1000) 27 | -------------------------------------------------------------------------------- /unit_tests/test_prerun.py: -------------------------------------------------------------------------------- 1 | # Inside tests/unit_tests/test_prerun.py 2 | import os 3 | import yaml 4 | import pytest 5 | import orjson 6 | import pandas as pd 7 | 8 | 9 | # Define the pytest fixture 10 | @pytest.fixture 11 | def yaml_data(request): 12 | yaml_path = request.config.getoption("--yaml-path") 13 | if not yaml_path or not os.path.exists(yaml_path): 14 | pytest.fail(f"YAML file path '{yaml_path}' does not exist.") 15 | with open(yaml_path, "r") as f: 16 | data = yaml.safe_load(f) 17 | return data 18 | 19 | 20 | # Your test function 21 | @pytest.mark.prerun 22 | def test_yaml_structure(yaml_data): 23 | required_categories = [ 24 | "IMAGE_TEXT", 25 | "TEXT_ONLY", 26 | "VIDEO_TEXT", 27 | "IMAGE_TEXT_IN_CONTEXT", 28 | ] 29 | 30 | for category, datasets in yaml_data.items(): 31 | assert category in required_categories, f"Unexpected category '{category}' in YAML. Expected categories are {required_categories}." 32 | 33 | for dataset_name, data in datasets.items(): 34 | for path_key, path_value in data.items(): 35 | if path_key.endswith("_path"): 36 | assert os.path.exists(path_value), f"Dataset path {path_value} specified under {category} -> {dataset_name} does not exist." 
37 | elif path_key == "num_samples": 38 | assert isinstance(path_value, int), f"'num_samples' should be an integer but got {type(path_value)} under {category} -> {dataset_name}." 39 | 40 | # checking mimicit path aligns with corresponding format. 41 | if path_key == "mimicit_path": 42 | print(f"Checking -> {path_value} in MIMICIT format.") 43 | with open(path_value, "rb") as f: 44 | data = orjson.loads(f.read()) 45 | 46 | assert "data" in data 47 | 48 | if path_key == "images_path": 49 | print(f"Checking -> {path_value} in images format.") 50 | assert os.path.exists(path_value), f"Dataset path {path_value} specified under {category} -> {dataset_name} does not exist." 51 | # # Read the parquet file using pandas 52 | # df = pd.read_parquet(path_value) 53 | 54 | # # Check for the 'base64' column 55 | # assert "base64" in df.columns, f"The 'base64' column was not found in the dataset {path_value}." 56 | -------------------------------------------------------------------------------- /xformers_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import CLIPVisionModel 2 | from .llama import LlamaForCausalLM 3 | --------------------------------------------------------------------------------
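# A sketch of the training-data YAML layout expected by unit_tests/test_prerun.py above,
# written out with PyYAML. The category name comes from the test's required list; the
# dataset entry, file paths, and sample count are hypothetical, and the referenced files
# must exist on disk (with the MIMIC-IT json holding a top-level "data" key) for the
# prerun check to pass.
import yaml

example_config = {
    "IMAGE_TEXT": {
        "LADD": {  # hypothetical dataset entry
            "mimicit_path": "/data/mimicit/LADD_instructions.json",  # checked to contain {"data": ...}
            "images_path": "/data/mimicit/LA.json",  # existence is checked
            "num_samples": 1000,  # must be an int
        }
    }
}

with open("training_data.yaml", "w") as f:
    yaml.safe_dump(example_config, f)

# The prerun check can then be invoked with something like:
#   pytest -m prerun unit_tests/test_prerun.py --yaml-path training_data.yaml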